├── .github
│   └── workflows
│       └── ci.yaml
├── .gitignore
├── LICENSE
├── README.md
├── bricksrl
│   ├── Pybricks
│   │   └── PybricksHubClass.py
│   ├── __init__.py
│   └── environments
│       ├── __init__.py
│       ├── base
│       │   └── base_env.py
│       ├── dummy
│       │   ├── mixed_obs_dummy.py
│       │   └── vec_obs_dummy.py
│       ├── roboarm_mixed_v0
│       │   ├── RoboArmMixedEnv.py
│       │   └── client.py
│       ├── roboarm_v0
│       │   ├── RoboArmEnv.py
│       │   ├── RoboArmSim.py
│       │   └── client.py
│       ├── runaway_v0
│       │   ├── RunAwayEnv.py
│       │   └── client.py
│       ├── spinning_v0
│       │   ├── SpinningEnv.py
│       │   └── client.py
│       └── walker_v0
│           ├── WalkerEnv.py
│           ├── WalkerEnvSim.py
│           └── client.py
├── conf
│   ├── README.md
│   ├── agent
│   │   ├── bc.yaml
│   │   ├── cql.yaml
│   │   ├── droq.yaml
│   │   ├── iql.yaml
│   │   ├── random.yaml
│   │   ├── sac.yaml
│   │   └── td3.yaml
│   ├── config.yaml
│   └── env
│       ├── roboarm-v0.yaml
│       ├── roboarm_mixed-v0.yaml
│       ├── roboarm_sim-v0.yaml
│       ├── runaway-v0.yaml
│       ├── spinning-v0.yaml
│       ├── walker-v0.yaml
│       └── walker_sim-v0.yaml
├── examples
│   ├── README.md
│   ├── custom_env.py
│   ├── example_notebook.ipynb
│   └── torchrl_sac
│       ├── config.yaml
│       ├── train.py
│       └── utils.py
├── experiments
│   ├── 2wheeler
│   │   ├── eval.py
│   │   ├── pretrain.py
│   │   └── train.py
│   ├── helper
│   │   ├── __init__.py
│   │   ├── agents
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── behavior_cloning.py
│   │   │   ├── cql.py
│   │   │   ├── iql.py
│   │   │   ├── random.py
│   │   │   ├── sac.py
│   │   │   └── td3.py
│   │   ├── networks
│   │   │   ├── __init__.py
│   │   │   └── networks.py
│   │   └── utils.py
│   ├── roboarm
│   │   ├── eval.py
│   │   ├── pretrain.py
│   │   └── train.py
│   └── walker
│       ├── eval.py
│       ├── pretrain.py
│       └── train.py
├── pyproject.toml
├── setup.py
└── tests
    ├── __init__.py
    ├── test_agents.py
    └── test_env_sim.py
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | # Runs on pushes to any branch and on pull requests targeting the main branch.
5 | push:
6 | branches:
7 | - '**' # all branches
8 | pull_request:
9 | branches:
10 | - main # or master
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 |
17 | strategy:
18 | matrix:
19 | python-version: [3.8.18, 3.9]
20 |
21 | steps:
22 | - name: Checkout code
23 | uses: actions/checkout@v3
24 |
25 | - name: Set up Python
26 | uses: actions/setup-python@v3
27 | with:
28 | python-version: ${{ matrix.python-version }}
29 |
30 | - name: Install dependencies
31 | run: |
32 | python -m pip install --upgrade pip
33 | pip install -e .[dev]
34 |
35 | - name: List files # checking if the files are in the right place
36 | run: |
37 | ls
38 |
39 | - name: Run tests with pytest
40 | run: |
41 | pytest
42 |
43 | - name: Check code formatting with ufmt
44 | run: ufmt check .
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | .DS_Store
3 | __pycache__/
4 | *.py[cod]
5 | *$py.class
6 |
7 | # C extensions
8 | *.so
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | pip-wheel-metadata/
25 | share/python-wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 |
55 | # Translations
56 | *.mo
57 | *.pot
58 |
59 | # Django stuff:
60 | *.log
61 | local_settings.py
62 | db.sqlite3
63 | db.sqlite3-journal
64 |
65 | # Flask stuff:
66 | instance/
67 | .webassets-cache
68 |
69 | # Scrapy stuff:
70 | .scrapy
71 |
72 | # Sphinx documentation
73 | docs/_build/
74 |
75 | # PyBuilder
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | .python-version
87 |
88 | # pipenv
89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
92 | # install all needed dependencies.
93 | #Pipfile.lock
94 |
95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
96 | __pypackages__/
97 |
98 | # Celery stuff
99 | celerybeat-schedule
100 | celerybeat.pid
101 |
102 | # SageMath parsed files
103 | *.sage.py
104 |
105 | # Environments
106 | .env
107 | .venv
108 | env/
109 | venv/
110 | ENV/
111 | env.bak/
112 | venv.bak/
113 |
114 | # Spyder project settings
115 | .spyderproject
116 | .spyproject
117 |
118 | # Rope project settings
119 | .ropeproject
120 |
121 | # mkdocs documentation
122 | /site
123 |
124 | # mypy
125 | .mypy_cache/
126 | .dmypy.json
127 | dmypy.json
128 |
129 | # Pyre type checker
130 | .pyre/
131 |
132 | # wandb
133 | wandb/
134 | # hydra
135 | outputs/
136 | # .pth files
137 | *.pth
138 |
139 | # vscode
140 | .vscode/
141 |
142 | # dev tools
143 | pytest.ini
144 | .pre-commit-config.yaml
145 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 www.compscience.org
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BricksRL
2 |
3 | 
4 | 
5 | [](https://arxiv.org/abs/2406.17490)
6 | [](https://bricksrl.github.io/ProjectPage/)
7 | [](https://discord.gg/qdTsFaVfZm)
8 |
9 |
10 | BricksRL allows the training of custom LEGO robots using deep reinforcement learning. By integrating [Pybricks](https://pybricks.com/) and [TorchRL](https://pytorch.org/rl/stable/index.html), it facilitates efficient real-world training via Bluetooth communication between LEGO hubs and a local computing device. Check out our [paper](https://arxiv.org/abs/2406.17490)!
11 |
12 | For additional information and building instructions for the robots, view the project page [BricksRL](https://bricksrl.github.io/ProjectPage/).
13 |
14 |
15 |
16 |
17 | ## Prerequisites
18 |
19 |
20 |
21 | ### Enable Web Bluetooth in Chrome
22 |
23 | 1. Go to "chrome://flags/"
24 | 2. Enable "Experimental Web Platform features"
25 | 3. Restart Chrome
26 | 4. Use [beta.pybricks.com](https://beta.pybricks.com) to edit and upload the client scripts for each environment
27 |
28 | ### Environment Setup
29 |
30 | 1. **Create a Conda environment:**
31 | ```bash
32 | conda create --name bricksrl python=3.8
33 | ```
34 | 2. **Activate the environment:**
35 | ```bash
36 | conda activate bricksrl
37 | ```
38 | 3. **Install PyTorch:**
39 | ```bash
40 | pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
41 |
42 | ```
43 | 4. **Install bricksrl and additional packages:**
44 | For regular users, install the package and all required dependencies by running:
45 | ```bash
46 | pip install -e .
47 | ```
48 |
49 | This will install the bricksrl package along with the dependencies listed in setup.py.
50 |
51 | 5. **(Optional) Install development tools:**
52 |
53 | If you are a developer and need to install development tools (e.g., pytest, ufmt, pre-commit), use the following command to install them as extras:
54 |
55 | ```bash
56 | pip install -e .[dev]
57 | ```
58 | This will install the development dependencies defined in the setup.py file along with the package.
59 |
60 |
61 |
62 |
63 | ## Usage
64 | ### Client
65 | Update your client script on the [Pybricks Hub](https://beta.pybricks.com/) whenever you want to run a new environment with your robot.
66 |
67 |
68 | ### Config
69 | Before running experiments, please review and modify the configuration settings according to your needs. Each environment and agent setup has its own configuration file under the conf/ directory. For more information, check out the [config README](conf/README.md).
70 |
71 |
72 | ### Robots
73 |
74 | The robots used in our experiments are shown below; building instructions can be found [here](https://bricksrl.github.io/ProjectPage/).
75 |
76 | | **2Wheeler** | **Walker** | **RoboArm** |
77 | |:--:|:--:|:--:|
78 |
79 |
80 |
81 | ## Run Experiments
82 | ### Train an Agent
83 |
84 | ```bash
85 | python experiments/walker/train.py
86 | ```
87 |
88 | ### Evaluate an Agent
89 | ```bash
90 | python experiments/walker/eval.py
91 | ```
92 |
93 | ## Results
94 |
95 |
96 |
97 | Evaluation videos of the trained agents can be found [here](https://bricksrl.github.io/ProjectPage/).
98 |
99 | ### 2Wheeler Results:
100 |
101 |
102 |
103 | ### Walker Results:
104 |
105 |
106 |
107 | ### RoboArm Results:
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 | ### Offline RL
117 |
118 |
119 |
120 | With precollected [datasets](https://huggingface.co/datasets/compsciencelab/BricksRL-Datasets) we can pretrain agents with offline RL to perform a task without any real-world interaction. Such pretrained policies can be evaluated directly or fine-tuned afterwards on the real robot.
121 |
122 | #### Datasets
123 | The datasets can be downloaded from Hugging Face and contain expert and random transitions for the 2Wheeler (RunAway-v0 and Spinning-v0), Walker (Walker-v0), and RoboArm (RoboArm-v0) robots.
124 |
125 | ```bash
126 | git lfs install
127 | git clone git@hf.co:datasets/compsciencelab/BricksRL-Datasets
128 | ```
129 |
130 | The datasets consist of TensorDicts containing expert and random transitions, which can be directly loaded into the replay buffer. When initiating (pre-)training, simply provide the path to the desired TensorDict when prompted to load the replay buffer.
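
As a rough, hypothetical sketch (not the exact loading code used by the experiment scripts), a downloaded TensorDict can be pushed into a TorchRL replay buffer along these lines; the file name and batch size are placeholders, and the sketch assumes the TensorDict was saved with `torch.save`:

```python
import torch
from torchrl.data import LazyMemmapStorage, TensorDictReplayBuffer

# Load a downloaded dataset (placeholder path; adjust to your local copy).
transitions = torch.load("BricksRL-Datasets/walker-v0/expert_transitions.pt")

# Create a buffer large enough to hold the whole dataset and fill it in one go.
buffer = TensorDictReplayBuffer(
    storage=LazyMemmapStorage(max_size=transitions.shape[0]),
    batch_size=256,  # placeholder batch size
)
buffer.extend(transitions)

batch = buffer.sample()  # TensorDict batch ready for offline updates
```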
131 |
132 |
133 | #### Pretrain an Agent
134 |
135 | Running an offline-training experiment is similar to online training, except that you run the **pretrain.py** script:
136 |
137 | ```bash
138 | python experiments/walker/pretrain.py
139 | ```
140 |
141 | Trained policies can then be evaluated as before with:
142 |
143 | ```bash
144 | python experiments/walker/eval.py
145 | ```
146 |
147 | Or run training for fine-tuning the policy on the real robot:
148 |
149 | ```bash
150 | python experiments/walker/train.py
151 | ```
152 |
153 |
154 |
155 |
156 | ## Examples
157 |
158 | ### TorchRL and Custom Environment Examples
159 |
160 | Examples of using BricksRL environments with typical training scripts from [TorchRL's sota-implementations](https://github.com/pytorch/rl/tree/main/sota-implementations) can be found [here](examples/).
161 |
162 |
163 | We also provide a template to create your own [custom BricksRL environment](examples/custom_env.py), which can subsequently be used directly in the TorchRL examples; a rough sketch is shown below.
164 |
165 | For more information see the examples [readme](examples/README.md).
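
As a rough illustration of what such a custom environment can look like, here is a minimal sketch modeled on the dummy environments in `bricksrl/environments/dummy/`; the class name, dimensions, and reward are placeholders, not the actual contents of `examples/custom_env.py`:

```python
import torch
from tensordict import TensorDict, TensorDictBase
from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec
from torchrl.envs import EnvBase


class MyCustomEnv(EnvBase):
    """Minimal vector-observation environment skeleton (hypothetical example)."""

    action_dim = 2  # placeholder dimensions
    state_dim = 4

    def __init__(self, max_episode_steps: int = 100):
        self.max_episode_steps = max_episode_steps
        self._batch_size = torch.Size([1])
        self.action_spec = BoundedTensorSpec(
            low=-torch.ones((1, self.action_dim)),
            high=torch.ones((1, self.action_dim)),
            shape=(1, self.action_dim),
        )
        observation_spec = BoundedTensorSpec(
            low=-torch.ones((1, self.state_dim)),
            high=torch.ones((1, self.state_dim)),
        )
        self.observation_spec = CompositeSpec(shape=(1,))
        self.observation_spec.set("observation", observation_spec)
        super().__init__(batch_size=self._batch_size)

    def _set_seed(self, seed: int):
        torch.manual_seed(seed)

    def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
        self.episode_step_iter = 0
        observation = self.observation_spec["observation"].rand()
        return TensorDict({"observation": observation.float()}, batch_size=[1])

    def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
        # Replace the random observation and zero reward with real robot readings
        # and a task-specific reward function.
        observation = self.observation_spec["observation"].rand()
        reward = torch.zeros(1)
        self.episode_step_iter += 1
        done = torch.tensor([self.episode_step_iter >= self.max_episode_steps])
        return TensorDict(
            {
                "observation": observation.float(),
                "reward": reward.float(),
                "done": done.bool(),
            },
            batch_size=[1],
        )
```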
166 |
167 |
168 | ### High-Level Examples
169 | In the [example notebook](examples/example_notebook.ipynb) we provide high-level training examples to train a **SAC agent** in the **RoboArmSim-v0** environment and a **TD3 agent** in the **WalkerSim-v0** environment.
170 | The examples are based on the experiments for our paper. Standalone examples similar to the [TorchRL sota-implementations](https://github.com/pytorch/rl/tree/main/sota-implementations) can be found [here](examples/torchrl_sac).
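
For orientation, the snippet below sketches how a simulation environment can be constructed with `bricksrl.environments.make` and rolled out with random actions; the configuration values are illustrative stand-ins for those in `conf/env/walker_sim-v0.yaml`:

```python
from omegaconf import OmegaConf

from bricksrl.environments import make

# Illustrative config values; the actual ones live in conf/env/walker_sim-v0.yaml.
env_conf = OmegaConf.create(
    {
        "max_episode_steps": 100,
        "noise": 0.01,
        "low_action_angle": -100,
        "high_action_angle": 100,
        "verbose": False,
    }
)

env = make(name="walker_sim-v0", env_conf=env_conf)
env.reset()
rollout = env.rollout(max_steps=10)  # random-action rollout via TorchRL's EnvBase API
print(rollout)
```

From here the environment can be plugged into standard TorchRL collectors and training loops.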
171 |
172 | ## Citation
173 | If you use BricksRL in your work, please refer to this BibTeX entry to cite it:
174 |
175 | ```
176 | @article{dittert2024bricksrl,
177 | title={BricksRL: A Platform for Democratizing Robotics and Reinforcement Learning Research and Education with LEGO},
178 | author={Sebastian Dittert and Vincent Moens and Gianni De Fabritiis},
179 | journal={arXiv preprint arXiv:2406.17490},
180 | year={2024}
181 | }
182 | ```
--------------------------------------------------------------------------------
/bricksrl/Pybricks/PybricksHubClass.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import struct
3 |
4 | from bleak import BleakClient, BleakScanner
5 |
6 |
7 | class PybricksHub:
8 | """Class for connecting to a Pybricks Hub."""
9 |
10 | PYBRICKS_COMMAND_EVENT_CHAR_UUID = "c5f50002-8280-46da-89f4-6d8051e4aeef"
11 | HUB_NAME = "Pybricks Hub"
12 |
13 | def __init__(self, out_format_str: str, state_dim: int):
14 | self.device = None
15 | self.client = None
16 | self.rx_queue = asyncio.Queue(maxsize=8) # LifoQueue
17 | self.loop = asyncio.get_event_loop()
18 |
19 | self.exception_out_data = struct.pack(out_format_str, *([0.0] * state_dim))
20 | self.disconnected = False
21 | self.payload_buffer = None
22 |
23 | def connect(self) -> None:
24 | """Connect to the hub."""
25 | print("Connecting to the hub...")
26 | self.loop.run_until_complete(self._connect())
27 |
28 | async def _connect(self) -> None:
29 | """Connect to the hub."""
30 | try:
31 | # Find the device and initialize client.
32 | self.device = await BleakScanner.find_device_by_name(self.HUB_NAME)
33 | self.client = BleakClient(
34 | self.device, disconnected_callback=self._handle_disconnect
35 | )
36 |
37 | # Connect and get services
38 | print("Switch on the hub", flush=True)
39 | await self.client.connect()
40 | await self.client.start_notify(
41 | self.PYBRICKS_COMMAND_EVENT_CHAR_UUID, self._handle_rx
42 | )
43 |
44 | # Tell user to start program on the hub.
45 | print("Start the program on the hub now with the button.", flush=True)
46 | await asyncio.sleep(5)
47 |
48 | except Exception as e:
49 | # Handle exceptions.
50 | print(e)
51 | await self.disconnect()
52 |
53 | def send(self, data: bytes) -> None:
54 | """Send data to the hub as bytes."""
55 | self.loop.run_until_complete(self._send(data))
56 |
57 | async def _send(self, data: bytes) -> None:
58 | try:
59 | # Send some data to the hub.
60 | await self.client.write_gatt_char(
61 | self.PYBRICKS_COMMAND_EVENT_CHAR_UUID,
62 | b"\x06" + data, # Prepend "write stdin" command
63 | response=False,
64 | )
65 | except Exception as e:
66 | # Handle exceptions.
67 | print(e)
68 | await self.disconnect()
69 |
70 | def disconnect(self) -> None:
71 | """
72 | Disconnect from the hub.
73 | This method disconnects the hub from the client.
74 | """
75 | if self.client and not self.disconnected:
76 | asyncio.create_task(self._disconnect())
77 |
78 | async def _disconnect(self) -> None:
79 | try:
80 | # Disconnect when we are done.
81 | if self.client:
82 | await self.client.disconnect()
83 | except Exception as e:
84 | # Handle exceptions.
85 | print(e)
86 | finally:
87 | self.disconnected = True
88 |
89 | self.client = None
90 | self.device = None
91 | self.rx_char = None
92 |
93 | def _handle_disconnect(self, _) -> None:
94 | print("Hub was disconnected.")
95 | self.disconnect()
96 |
97 | async def _handle_rx(self, _, data: bytes) -> None:
98 | # add received data to the queue
99 | if data[0] == 0x01: # "write stdout" event (0x01)
100 | payload = data[1:]
101 | # print("Received:", payload)
102 | if (
103 | len(payload) != len(self.exception_out_data)
104 | and self.payload_buffer is None
105 | ):
106 | self.payload_buffer = payload
107 | elif (
108 | len(payload) != len(self.exception_out_data)
109 | and self.payload_buffer is not None
110 | ):
111 | self.payload_buffer += payload
112 | if len(self.payload_buffer) == len(self.exception_out_data):
113 | await self.rx_queue.put(self.payload_buffer)
114 | self.payload_buffer = None
115 | else:
116 | await self.rx_queue.put(payload)
117 |
118 | async def _read_data(self) -> bytes:
119 | try:
120 | # get data from the queue
121 | return await self.rx_queue.get()
122 | except asyncio.QueueEmpty:
123 | print("Queue is empty, returning zeros")
124 | return self.exception_out_data
125 |
126 | def read(self) -> bytes:
127 | """Read data from the hub and return it as a bytearray."""
128 | return self.loop.run_until_complete(self._read_data())
129 |
130 | def close(self) -> None:
131 | if not self.loop.is_closed():
132 | self.loop.run_until_complete(self._disconnect())
133 |
--------------------------------------------------------------------------------
/bricksrl/__init__.py:
--------------------------------------------------------------------------------
1 | from bricksrl.environments.base.base_env import BaseEnv
2 | from bricksrl.Pybricks.PybricksHubClass import PybricksHub
3 |
--------------------------------------------------------------------------------
/bricksrl/environments/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from torchrl.envs import (
3 | CatFrames,
4 | Compose,
5 | ObservationNorm,
6 | ToTensorImage,
7 | TransformedEnv,
8 | )
9 |
10 | from bricksrl.environments.roboarm_mixed_v0.RoboArmMixedEnv import RoboArmMixedEnv_v0
11 | from bricksrl.environments.roboarm_v0.RoboArmEnv import RoboArmEnv_v0
12 | from bricksrl.environments.roboarm_v0.RoboArmSim import RoboArmSimEnv_v0
13 | from bricksrl.environments.runaway_v0.RunAwayEnv import RunAwayEnv_v0
14 | from bricksrl.environments.spinning_v0.SpinningEnv import SpinningEnv_v0
15 | from bricksrl.environments.walker_v0.WalkerEnv import WalkerEnv_v0
16 | from bricksrl.environments.walker_v0.WalkerEnvSim import WalkerEnvSim_v0
17 |
18 | VIDEO_LOGGING_ENVS = ["roboarm_mixed-v0", "walker_mixed-v0"]
19 | ALL_2WHEELER_ENVS = ["spinning-v0", "runaway-v0"]
20 | ALL_WALKER_ENVS = [
21 | "walker-v0",
22 | "walker_sim-v0",
23 | ]
24 | ALL_ROBOARM_ENVS = [
25 | "roboarm-v0",
26 | "roboarm_mixed-v0",
27 | "roboarm_sim-v0",
28 | ]
29 | ALL_ENVS = ALL_2WHEELER_ENVS + ALL_WALKER_ENVS + ALL_ROBOARM_ENVS
30 |
31 |
32 | # TODO: maybe outsource this to experiments/helper and not in bricksrl
33 | def make_env(config, pretrain=False):
34 | """
35 | Creates a new environment based on the provided configuration.
36 |
37 | Args:
38 | config: A configuration object containing the environment name and maximum episode steps.
39 | pretrain: A boolean indicating whether the environment is for pretraining.
40 |
41 | Returns:
42 | A tuple containing the new environment, its action spec, and its state spec.
43 | """
44 | env = make(name=config.env.name, env_conf=config.env, pretrain=pretrain)
45 | observation_keys = [key for key in env.observation_spec.keys()]
46 |
47 | transforms = []
48 | if config.env.frame_stack > 1:
49 | transforms.append(
50 | CatFrames(
51 | N=config.env.frame_stack,
52 | in_keys=observation_keys,
53 | out_keys=observation_keys,
54 | )
55 | )
56 | if config.env.action_filter < 1:
57 | raise NotImplementedError("ActionFilterWrapper not implemented yet")
58 | # TODO: add this to torchrl
59 | # env = ActionFilterWrapper(
60 | # env, current_action_influence=config.env.action_filter
61 | # )
62 | normalize_keys = [key for key in observation_keys if key != "pixels"]
63 | obs_ranges = np.array(list(env.observation_ranges.values()))
64 | obs_mean = obs_ranges.mean(axis=-1) # mean of min and max
65 | obs_std = obs_ranges.std(axis=-1) # std of min and max
66 | transforms.append(
67 | ObservationNorm(
68 | in_keys=normalize_keys, loc=obs_mean, scale=obs_std, standard_normal=True
69 | )
70 | )
71 | if "pixels" in observation_keys:
72 | transforms.append(ToTensorImage(in_keys=["pixels"], from_int=True))
73 |
74 | env = TransformedEnv(env, Compose(*transforms))
75 |
76 | action_spec = env.action_spec
77 | state_spec = env.observation_spec
78 |
79 | return env, action_spec, state_spec
80 |
81 |
82 | def make(name="RunAway", env_conf=None, pretrain=False):
83 | if name == "runaway-v0":
84 | return RunAwayEnv_v0(
85 | max_episode_steps=env_conf.max_episode_steps,
86 | min_distance=env_conf.min_distance,
87 | verbose=env_conf.verbose,
88 | pretrain=pretrain,
89 | )
90 | elif name == "spinning-v0":
91 | return SpinningEnv_v0(
92 | max_episode_steps=env_conf.max_episode_steps,
93 | sleep_time=env_conf.sleep_time,
94 | verbose=env_conf.verbose,
95 | pretrain=pretrain,
96 | )
97 | elif name == "walker-v0":
98 | return WalkerEnv_v0(
99 | max_episode_steps=env_conf.max_episode_steps,
100 | verbose=env_conf.verbose,
101 | sleep_time=env_conf.sleep_time,
102 | pretrain=pretrain,
103 | )
104 | elif name == "walker_sim-v0":
105 | return WalkerEnvSim_v0(
106 | max_episode_steps=env_conf.max_episode_steps,
107 | noise=env_conf.noise,
108 | low_action_angle=env_conf.low_action_angle,
109 | high_action_angle=env_conf.high_action_angle,
110 | verbose=env_conf.verbose,
111 | )
112 | elif name == "roboarm-v0":
113 | return RoboArmEnv_v0(
114 | max_episode_steps=env_conf.max_episode_steps,
115 | verbose=env_conf.verbose,
116 | sleep_time=env_conf.sleep_time,
117 | reward_signal=env_conf.reward_signal,
118 | pretrain=pretrain,
119 | )
120 | elif name == "roboarm_sim-v0":
121 | return RoboArmSimEnv_v0(
122 | max_episode_steps=env_conf.max_episode_steps,
123 | verbose=env_conf.verbose,
124 | noise=env_conf.noise,
125 | reward_signal=env_conf.reward_signal,
126 | )
127 | elif name == "roboarm_mixed-v0":
128 | return RoboArmMixedEnv_v0(
129 | max_episode_steps=env_conf.max_episode_steps,
130 | sleep_time=env_conf.sleep_time,
131 | verbose=env_conf.verbose,
132 | reward_signal=env_conf.reward_signal,
133 | camera_id=env_conf.camera_id,
134 | goal_radius=env_conf.goal_radius,
135 | pretrain=pretrain,
136 | )
137 | else:
138 | raise ValueError(f"Environment '{name}' not found.")
139 |
--------------------------------------------------------------------------------
/bricksrl/environments/base/base_env.py:
--------------------------------------------------------------------------------
1 | import struct
2 | import sys
3 |
4 | import numpy as np
5 |
6 | import torch
7 | from bricksrl.Pybricks.PybricksHubClass import PybricksHub
8 | from tensordict import TensorDict, TensorDictBase
9 | from torchrl.envs import EnvBase
10 |
11 |
12 | class BaseEnv(EnvBase):
13 | """
14 | The base class for reinforcement learning environments used with the Lego robots.
15 |
16 | Args:
17 | action_dim (int): The dimensionality of the action space.
18 | state_dim (int): The dimensionality of the state space.
19 | use_hub (bool): Whether to use the Pybricks hub for communication. If False, only
20 | the observation and action specs are created and can be used.
21 | This can be helpful for testing and debugging, as you don't connect to the hub.
22 | verbose (bool): Whether to print verbose output.
23 | """
24 |
25 | def __init__(
26 | self,
27 | action_dim: int,
28 | state_dim: int,
29 | use_hub: bool = True,
30 | verbose: bool = False,
31 | ):
32 | self.verbose = verbose
33 | self.action_dim = action_dim
34 | self.state_dim = state_dim
35 |
36 | self.action_format_str = "!" + "f" * self.action_dim
37 | self.state_format_str = "!" + "f" * self.state_dim
38 |
39 | self.expected_bytesize = struct.calcsize(self.state_format_str)
40 |
41 | # buffer state in case of missing data
42 | self.buffered_state = np.zeros(self.state_dim, dtype=np.float32)
43 |
44 | if use_hub:
45 | self.hub = PybricksHub(
46 | state_dim=state_dim, out_format_str=self.state_format_str
47 | )
48 | self.hub.connect()
49 | print("Connected to hub.")
50 | else:
51 | self.hub = None
52 | super().__init__(batch_size=torch.Size([1]))
53 |
54 | def send_to_hub(self, action: np.array) -> None:
55 | """
56 | Sends the given action to the hub as bytes.
57 |
58 | Args:
59 | action (np.array): The action to send to the hub as a numpy array.
60 |
61 | Raises:
62 | AssertionError: If the shape of the action does not match the action dimension.
63 | """
64 | assert (
65 | action.shape[0] == self.action_dim
66 | ), "Action shape does not match action dimension."
67 | byte_action = struct.pack(self.action_format_str, *action)
68 | if self.verbose:
69 | print("Sending data size: ", len(byte_action))
70 | print("Sending data: ", byte_action)
71 | self.hub.send(byte_action)
72 |
73 | def read_from_hub(self) -> np.array:
74 | """
75 | Reads the current state of the environment from the hub and returns it as a numpy array.
76 |
77 | Returns:
78 | np.array: The current state of the environment as a numpy array.
79 | """
80 | byte_state = self.hub.read()
81 | if self.verbose:
82 | print("Reading data size: ", sys.getsizeof(byte_state))
83 | print("Reading data: ", byte_state)
84 | print("len: ", len(byte_state))
85 |
86 | if len(byte_state) != self.expected_bytesize:
87 | print(
88 | "State has size {} but should have size {}.".format(
89 | len(byte_state), struct.calcsize(self.state_format_str)
90 | )
91 | )
92 | print("Returning previous state.")
93 | state = self.buffered_state
94 | print("State: ", state)
95 | else:
96 | state = np.array([struct.unpack(self.state_format_str, byte_state)])
97 | self.buffered_state = state
98 | assert (
99 | state.shape[1] == self.state_dim
100 | ), f"State has shape {state.shape[1]} and does not match state dimension: {self.state_dim}."
101 | return state
102 |
103 | def sample_random_action(self, tensordict: TensorDictBase) -> TensorDictBase:
104 | """
105 | Sample a random action from the action space.
106 |
107 | Returns:
108 | TensorDictBase: A dictionary containing the sampled action.
109 | """
110 | if tensordict is not None:
111 | tensordict.set("action", self.action_spec.rand())
112 | return tensordict
113 | else:
114 | return TensorDict({"action": self.action_spec.rand()}, [])
115 |
116 | def close(self) -> None:
117 | if self.hub is not None:
118 | self.hub.close()
119 |
120 | def _step(
121 | self,
122 | ):
123 | raise NotImplementedError
124 |
125 | def _reset(
126 | self,
127 | ):
128 | raise NotImplementedError
129 |
130 | def _set_seed(self, seed: int):
131 | np.random.seed(seed)
132 | torch.manual_seed(seed)
133 |
134 |
135 | class BaseSimEnv(EnvBase):
136 | """
137 | The base class for reinforcement learning environments used to simulate Lego robots.
138 |
139 | Args:
140 | action_dim (int): The dimensionality of the action space.
141 | state_dim (int): The dimensionality of the state space.
142 | verbose (bool): Whether to print verbose output.
143 | use_hub (bool): This argument is kept for compatibility but is not used in the simulation environment.
144 | """
145 |
146 | def __init__(
147 | self,
148 | action_dim: int,
149 | state_dim: int,
150 | verbose: bool = False,
151 | use_hub: bool = False,
152 | ):
153 | self.verbose = verbose
154 | self.action_dim = action_dim
155 | self.state_dim = state_dim
156 |
157 | super().__init__(batch_size=torch.Size([1]))
158 |
159 | def sample_random_action(self, tensordict: TensorDictBase) -> TensorDictBase:
160 | """
161 | Sample a random action from the action space.
162 |
163 | Returns:
164 | TensorDictBase: A dictionary containing the sampled action.
165 | """
166 | if tensordict is not None:
167 | tensordict.set("action", self.action_spec.rand())
168 | return tensordict
169 | else:
170 | return TensorDict({"action": self.action_spec.rand()}, [])
171 |
172 | def _step(
173 | self,
174 | ):
175 | raise NotImplementedError
176 |
177 | def _reset(
178 | self,
179 | ):
180 | raise NotImplementedError
181 |
182 | def _set_seed(self, seed: int):
183 | """
184 | Sets the seed for the environment's random number generator.
185 |
186 | Args:
187 | seed (int): The seed to set.
188 | """
189 | np.random.seed(seed)
190 | torch.manual_seed(seed)
191 |
--------------------------------------------------------------------------------
/bricksrl/environments/dummy/mixed_obs_dummy.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import numpy as np
4 |
5 | import torch
6 |
7 | from tensordict import TensorDict, TensorDictBase
8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec
9 | from torchrl.envs import EnvBase
10 |
11 |
12 | class MixedObsDummyEnv(EnvBase):
13 | """
14 | MixedObsDummyEnv is a dummy environment for testing purposes.
15 | It does not connect to Pybricks.
16 |
17 | """
18 |
19 | action_dim = 4
20 | state_dim = 7
21 | observation_key = "observation"
22 | pixel_observation_key = "pixels"
23 |
24 | def __init__(self, max_episode_steps=10, img_shape=(64, 64, 3)):
25 | self.max_episode_steps = max_episode_steps
26 | self._batch_size = torch.Size([1])
27 | self.action_spec = BoundedTensorSpec(
28 | low=-torch.ones((1, self.action_dim)),
29 | high=torch.ones((1, self.action_dim)),
30 | shape=(1, self.action_dim),
31 | )
32 |
33 | observation_spec = BoundedTensorSpec(
34 | low=-torch.ones((1, self.state_dim)),
35 | high=torch.ones((1, self.state_dim)),
36 | )
37 |
38 | pixel_observation_spec = BoundedTensorSpec(
39 | low=torch.zeros((1,) + img_shape, dtype=torch.uint8),
40 | high=torch.ones((1,) + img_shape, dtype=torch.uint8) * 255,
41 | )
42 |
43 | self.observation_spec = CompositeSpec(shape=(1,))
44 | self.observation_spec.set(self.observation_key, observation_spec)
45 | self.observation_spec.set(self.pixel_observation_key, pixel_observation_spec)
46 | super().__init__(batch_size=self._batch_size)
47 |
48 | def _set_seed(self, seed: int):
49 | return super()._set_seed(seed)
50 |
51 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
52 | """
53 | Reset the environment and return the initial state.
54 |
55 | Returns:
56 | TensorDictBase: The initial state of the environment.
57 | """
58 | # TODO solve this fake action sending before to receive first state
59 | self.episode_step_iter = 0
60 | observation = self.observation_spec[self.observation_key].rand()
61 | pixel_observation = self.observation_spec[self.pixel_observation_key].rand()
62 | return TensorDict(
63 | {
64 | self.observation_key: observation.float(),
65 | self.pixel_observation_key: pixel_observation,
66 | },
67 | batch_size=[1],
68 | )
69 |
70 | def reward(
71 | self,
72 | action: np.ndarray,
73 | next_state: np.ndarray,
74 | ) -> Tuple[float, bool]:
75 | """ """
76 | return 0.0, False
77 |
78 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
79 | """ """
80 | action = tensordict.get("action").cpu().numpy()
81 | observation = self.observation_spec[self.observation_key].rand()
82 | pixel_observation = self.observation_spec[self.pixel_observation_key].rand()
83 |
84 | reward, done = self.reward(
85 | action=action,
86 | next_state=observation,
87 | )
88 | next_tensordict = TensorDict(
89 | {
90 | self.observation_key: observation.float(),
91 | self.pixel_observation_key: pixel_observation,
92 | "reward": torch.tensor([reward]).float(),
93 | "done": torch.tensor([done]).bool(),
94 | },
95 | batch_size=[1],
96 | )
97 |
98 | # increment episode step counter
99 | self.episode_step_iter += 1
100 | if self.episode_step_iter >= self.max_episode_steps:
101 | next_tensordict.set("done", torch.tensor([True]))
102 | return next_tensordict
103 |
--------------------------------------------------------------------------------
/bricksrl/environments/dummy/vec_obs_dummy.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import numpy as np
4 |
5 | import torch
6 |
7 | from tensordict import TensorDict, TensorDictBase
8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec
9 | from torchrl.envs import EnvBase
10 |
11 |
12 | class VecObsDummyEnv(EnvBase):
13 | """
14 | VecObsDummyEnv is a dummy environment for testing purposes.
15 | It does not connect to Pybricks.
16 |
17 | """
18 |
19 | action_dim = 4
20 | state_dim = 7
21 | observation_key = "observation"
22 |
23 | def __init__(self, max_episode_steps=10):
24 | self.max_episode_steps = max_episode_steps
25 | self._batch_size = torch.Size([1])
26 | self.action_spec = BoundedTensorSpec(
27 | low=-torch.ones((1, self.action_dim)),
28 | high=torch.ones((1, self.action_dim)),
29 | shape=(1, self.action_dim),
30 | )
31 |
32 | observation_spec = BoundedTensorSpec(
33 | low=-torch.ones((1, self.state_dim)),
34 | high=torch.ones((1, self.state_dim)),
35 | )
36 |
37 | self.observation_spec = CompositeSpec(shape=(1,))
38 | self.observation_spec.set(self.observation_key, observation_spec)
39 | super().__init__(batch_size=self._batch_size)
40 |
41 | def _set_seed(self, seed: int):
42 | return super()._set_seed(seed)
43 |
44 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
45 | """
46 | Reset the environment and return the initial state.
47 |
48 | Returns:
49 | TensorDictBase: The initial state of the environment.
50 | """
51 | # TODO solve this fake action sending before to receive first state
52 | self.episode_step_iter = 0
53 | observation = self.observation_spec[self.observation_key].rand()
54 | return TensorDict(
55 | {
56 | self.observation_key: observation.float(),
57 | },
58 | batch_size=[1],
59 | )
60 |
61 | def reward(
62 | self,
63 | action: np.ndarray,
64 | next_state: np.ndarray,
65 | ) -> Tuple[float, bool]:
66 | """ """
67 | return 0.0, False
68 |
69 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
70 | """ """
71 | action = tensordict.get("action").cpu().numpy()
72 | next_observation = self.observation_spec[self.observation_key].rand()
73 |
74 | reward, done = self.reward(
75 | action=action,
76 | next_state=next_observation,
77 | )
78 | next_tensordict = TensorDict(
79 | {
80 | self.observation_key: next_observation.float(),
81 | "reward": torch.tensor([reward]).float(),
82 | "done": torch.tensor([done]).bool(),
83 | },
84 | batch_size=[1],
85 | )
86 |
87 | # increment episode step counter
88 | self.episode_step_iter += 1
89 | if self.episode_step_iter >= self.max_episode_steps:
90 | next_tensordict.set("done", torch.tensor([True]))
91 | return next_tensordict
92 |
93 |
94 | class VecGoalObsDummyEnv(EnvBase):
95 | """
96 | VecGoalObsDummyEnv is a dummy environment for testing purposes.
97 | It does not connect to Pybricks.
98 |
99 | """
100 |
101 | action_dim = 4
102 | state_dim = 7
103 | observation_key = "observation"
104 | goal_observation_key = "goal_observation"
105 |
106 | def __init__(self, max_episode_steps=10):
107 | self.max_episode_steps = max_episode_steps
108 | self._batch_size = torch.Size([1])
109 | self.action_spec = BoundedTensorSpec(
110 | low=-torch.ones((1, self.action_dim)),
111 | high=torch.ones((1, self.action_dim)),
112 | shape=(1, self.action_dim),
113 | )
114 |
115 | observation_spec = BoundedTensorSpec(
116 | low=-torch.ones((1, self.state_dim)),
117 | high=torch.ones((1, self.state_dim)),
118 | )
119 |
120 | self.observation_spec = CompositeSpec(shape=(1,))
121 | self.observation_spec.set(self.observation_key, observation_spec)
122 | self.observation_spec.set(self.goal_observation_key, observation_spec)
123 | super().__init__(batch_size=self._batch_size)
124 |
125 | def _set_seed(self, seed: int):
126 | return super()._set_seed(seed)
127 |
128 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
129 | """
130 | Reset the environment and return the initial state.
131 |
132 | Returns:
133 | TensorDictBase: The initial state of the environment.
134 | """
135 | # TODO solve this fake action sending before to receive first state
136 | self.episode_step_iter = 0
137 | observation = self.observation_spec[self.observation_key].rand()
138 | goal_observation = self.observation_spec[self.goal_observation_key].rand()
139 | return TensorDict(
140 | {
141 | self.observation_key: observation.float(),
142 | self.goal_observation_key: goal_observation.float(),
143 | },
144 | batch_size=[1],
145 | )
146 |
147 | def reward(
148 | self,
149 | action: np.ndarray,
150 | next_state: np.ndarray,
151 | ) -> Tuple[float, bool]:
152 | """ """
153 | return 0.0, False
154 |
155 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
156 | """ """
157 | action = tensordict.get("action").cpu().numpy()
158 | next_observation = self.observation_spec[self.observation_key].rand()
159 | goal = tensordict.get(self.goal_observation_key)
160 |
161 | reward, done = self.reward(
162 | action=action,
163 | next_state=next_observation,
164 | )
165 | next_tensordict = TensorDict(
166 | {
167 | self.observation_key: next_observation.float(),
168 | self.goal_observation_key: goal.float(),
169 | "reward": torch.tensor([reward]).float(),
170 | "done": torch.tensor([done]).bool(),
171 | },
172 | batch_size=[1],
173 | )
174 |
175 | # increment episode step counter
176 | self.episode_step_iter += 1
177 | if self.episode_step_iter >= self.max_episode_steps:
178 | next_tensordict.set("done", torch.tensor([True]))
179 | return next_tensordict
180 |
--------------------------------------------------------------------------------
/bricksrl/environments/roboarm_mixed_v0/client.py:
--------------------------------------------------------------------------------
1 | import ustruct
2 | from micropython import kbd_intr
3 | from pybricks.hubs import InventorHub
4 | from pybricks.parameters import Port
5 | from pybricks.pupdevices import Motor
6 | from pybricks.tools import wait
7 | from uselect import poll
8 | from usys import stdin, stdout
9 |
10 | kbd_intr(-1)
11 | hub = InventorHub()
12 |
13 | # Initialize and set the motors
14 | high_motor_range = (-150, 10)
15 | high_motor = Motor(Port.A)
16 | high_motor.run_target(speed=400, target_angle=-70)
17 |
18 | low_motor_range = (10, 75)
19 | low_motor = Motor(Port.D)
20 | low_motor.control.limits(500, 1000, 900)
21 | low_motor.run_target(speed=200, target_angle=40)
22 |
23 | rotation_motor_range = (-140, 40)
24 | rotation_motor = Motor(Port.B, gears=[20, 60])
25 | motors = {"HM": high_motor, "LM": low_motor, "RM": rotation_motor}
26 |
27 |
28 | def get_current_motor_angles():
29 | angles = {}
30 | for k, v in motors.items():
31 | angle = normalize_angle(get_angle(v))
32 | angles.update({k: angle})
33 | return angles
34 |
35 |
36 | def run_angle(motor, angle, speed=300):
37 | motor.run_angle(speed=speed, rotation_angle=angle, wait=False)
38 |
39 |
40 | def get_angle(motor):
41 | return motor.angle()
42 |
43 |
44 | def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360):
45 | # Normalize angle to be within -179 to 179 degrees
46 | while angle <= low_angle:
47 | angle += original_one_round
48 | while angle > high_angle:
49 | angle -= original_one_round
50 | return angle
51 |
52 |
53 | def transform_range(value, old_min, old_max, new_min, new_max):
54 | """
55 | Transform a value from one range to another.
56 |
57 | Parameters:
58 | value (float): The value to transform.
59 | old_min (float): The minimum value of the old range.
60 | old_max (float): The maximum value of the old range.
61 | new_min (float): The minimum value of the new range.
62 | new_max (float): The maximum value of the new range.
63 |
64 | Returns:
65 | float: The transformed value.
66 | """
67 | # Compute the scale factor between the old and new ranges
68 | scale = (new_max - new_min) / (old_max - old_min)
69 | # Apply the transformation
70 | return new_min + (value - old_min) * scale
71 |
72 |
73 | keyboard = poll()
74 | keyboard.register(stdin)
75 | motor_speed = 250
76 |
77 | while True:
78 |
79 | while not keyboard.poll(0):
80 | wait(1)
81 |
82 | # Read action values for the motors
83 | data = stdin.buffer.read(12)
84 | rotation_action, low_action, high_action = ustruct.unpack("!fff", data)
85 |
86 | # Transform action range for motors
87 | high_action = transform_range(high_action, -1, 1, -60, 60)
88 | low_action = transform_range(low_action, -1, 1, -30, 30)
89 | rotation_action = transform_range(rotation_action, -1, 1, -90, 90)
90 |
91 | angles = get_current_motor_angles()
92 |
93 | # Adjust high action to ensure it stays within range after being applied
94 | if angles["HM"] + high_action > max(high_motor_range):
95 | high_action = max(high_motor_range) - angles["HM"]
96 | elif angles["HM"] + high_action < min(high_motor_range):
97 | high_action = min(high_motor_range) - angles["HM"]
98 | high_motor.run_angle(speed=motor_speed, rotation_angle=high_action, wait=False)
99 |
100 | # Adjust low action to ensure it stays within range after being applied
101 | if angles["LM"] + low_action > max(low_motor_range):
102 | low_action = max(low_motor_range) - angles["LM"]
103 | elif angles["LM"] + low_action < min(low_motor_range):
104 | low_action = min(low_motor_range) - angles["LM"]
105 | low_motor.run_angle(speed=motor_speed, rotation_angle=low_action, wait=False)
106 |
107 | # Adjust rotation action to ensure it stays within range after being applied
108 | if angles["RM"] + rotation_action > max(rotation_motor_range):
109 | rotation_action = max(rotation_motor_range) - angles["RM"]
110 | elif angles["RM"] + rotation_action < min(rotation_motor_range):
111 | rotation_action = min(rotation_motor_range) - angles["RM"]
112 | rotation_motor.control.limits(250, 200, 500)
113 | rotation_motor.run_angle(
114 | speed=motor_speed, rotation_angle=rotation_action, wait=False
115 | )
116 |
117 | # Small delay to let the motors reach the target angle
118 | wait(250)
119 |
120 | # Sometimes the low angle jumps out of range and can't move back; this corrects those cases
121 | if low_motor.angle() < 10:
122 | low_motor.run_target(speed=200, target_angle=10)
123 |
124 | # Read sensors to get current state of the robot
125 | high_angle = high_motor.angle()
126 | low_angle = low_motor.angle()
127 | rotation_angle = rotation_motor.angle()
128 |
129 | # Send current state back to environment
130 | out_msg = ustruct.pack(
131 | "!fff",
132 | high_angle,
133 | low_angle,
134 | rotation_angle,
135 | )
136 | stdout.buffer.write(out_msg)
137 |
--------------------------------------------------------------------------------
/bricksrl/environments/roboarm_v0/RoboArmEnv.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Tuple
3 |
4 | import numpy as np
5 | import torch
6 |
7 | from bricksrl.environments.base.base_env import BaseEnv
8 | from numpy import linalg
9 | from tensordict import TensorDict, TensorDictBase
10 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec
11 |
12 |
13 | class RoboArmEnv_v0(BaseEnv):
14 | """Goal-conditioned reaching environment for the RoboArm robot: the agent moves the four motors (GM, HM, LM, RM) toward a randomly sampled goal configuration."""
15 |
16 | action_dim = 4 # (Grab_motor_action, high_motor_action, low_motor_action, rotation_motor_action)
17 |
18 | state_dim = 4 # (GM, HM, LM, RM)
19 |
20 | observation_ranges = {
21 | "GM": (-148, -44),
22 | "HM": (-150, 10),
23 | "LM": (10, 70),
24 | "RM": (-180, 179),
25 | }
26 |
27 | observation_key = "observation"
28 | goal_observation_key = "goal_observation"
29 |
30 | def __init__(
31 | self,
32 | max_episode_steps: int = 50,
33 | sleep_time: float = 0.0,
34 | verbose: bool = False,
35 | pretrain: bool = False,
36 | reward_signal: str = "dense",
37 | ):
38 | self.sleep_time = sleep_time
39 |
40 | assert reward_signal in [
41 | "dense",
42 | "sparse",
43 | ], "Reward signal must be dense or sparse."
44 | self.reward_signal = reward_signal
45 | self.max_episode_steps = max_episode_steps
46 | self._batch_size = torch.Size([1])
47 |
48 | # Define action spec
49 | self.action_spec = BoundedTensorSpec(
50 | low=-1,
51 | high=1,
52 | shape=(1, self.action_dim),
53 | )
54 |
55 | self.goal_thresholds = np.array(
56 | [50]
57 | ) # everything below 20 is very good; 50 is good!
58 | # Observation 4 motors (GM, HM, LM, RM) + goal positions (GGM, GHM, GLM, GRM)
59 | # Define observation spec
60 | bounds = torch.tensor(
61 | [
62 | self.observation_ranges["GM"],
63 | self.observation_ranges["HM"],
64 | self.observation_ranges["LM"],
65 | self.observation_ranges["RM"],
66 | ]
67 | )
68 |
69 | low_bounds = bounds[:, 0].unsqueeze(0)
70 | high_bounds = bounds[:, 1].unsqueeze(0)
71 |
72 | observation_spec = BoundedTensorSpec(
73 | low=low_bounds,
74 | high=high_bounds,
75 | )
76 |
77 | self.observation_spec = CompositeSpec(shape=(1,))
78 | self.observation_spec.set(self.observation_key, observation_spec)
79 | self.observation_spec.set(self.goal_observation_key, observation_spec)
80 | super().__init__(
81 | action_dim=self.action_dim,
82 | state_dim=self.state_dim,
83 | verbose=verbose,
84 | use_hub=not pretrain,
85 | )
86 |
87 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
88 | """
89 | Reset the environment and return the initial state.
90 |
91 | Returns:
92 | TensorDictBase: The initial state of the environment.
93 | """
94 | # TODO solve this fake action sending before to receive first state
95 | self.episode_step_iter = 0
96 | if tensordict is not None:
97 | action = tensordict.get("action").cpu().numpy().squeeze()
98 | else:
99 | action = np.zeros(self.action_dim)
100 | self.send_to_hub(action)
101 | time.sleep(self.sleep_time)
102 | observation = self.read_from_hub()
103 | # sample random goal state
104 | self.goal_observation = (
105 | self.observation_spec[self.goal_observation_key].rand().numpy()
106 | )
107 |
108 | return TensorDict(
109 | {
110 | self.observation_key: torch.tensor(observation, dtype=torch.float32),
111 | self.goal_observation_key: torch.tensor(
112 | self.goal_observation, dtype=torch.float32
113 | ),
114 | "error": torch.tensor([0]).float(),
115 | },
116 | batch_size=[1],
117 | )
118 |
119 | @staticmethod
120 | def shortest_angular_distance_vectorized(
121 | theta_goal: np.array, theta_current: np.array
122 | ) -> float:
123 | """
124 | Calculate the shortest angular distance between two arrays of angles.
125 |
126 | Parameters:
127 | - theta_goal: Array of goal angles in degrees.
128 | - theta_current: Array of current angles in degrees.
129 |
130 | Returns:
131 | - Array of the shortest angular distances in degrees.
132 | """
133 |
134 | # Convert angles from degrees to radians
135 | theta_goal_rad = np.radians(theta_goal)
136 | theta_current_rad = np.radians(theta_current)
137 |
138 | # Calculate difference in radians using np.arctan2 for vectorized operation
139 | delta_theta_rad = np.arctan2(
140 | np.sin(theta_goal_rad - theta_current_rad),
141 | np.cos(theta_goal_rad - theta_current_rad),
142 | )
143 |
144 | # Convert result back to degrees
145 | delta_theta_deg = np.degrees(delta_theta_rad)
146 |
147 | return delta_theta_deg
148 |
149 | def reward(
150 | self,
151 | achieved_state: np.array,
152 | ) -> Tuple[float, bool, float]:
153 | """Reward function of roboarm.
154 |
155 | Args:
156 | achieved_state (np.ndarray): The achieved state.
157 |
158 | Returns:
159 | Tuple[float, bool, float]: The reward, a boolean indicating whether the episode is done,
160 | and the remaining angular error to the goal.
161 | """
162 |
163 | done = False
164 | if self.reward_signal == "dense":
165 | angle_deltas = self.shortest_angular_distance_vectorized(
166 | self.goal_observation, achieved_state
167 | )
168 | error = np.sum(np.abs(angle_deltas))
169 | reward = -error / 100
170 | if error < np.mean(self.goal_thresholds):
171 | done = True
172 | elif self.reward_signal == "sparse":
173 | angle_deltas = self.shortest_angular_distance_vectorized(
174 | self.goal_observation, achieved_state
175 | )
176 | error = np.sum(np.abs(angle_deltas))
177 | if np.all(error <= self.goal_thresholds):
178 | reward = 1
179 | done = True
180 |
181 | else:
182 | reward = 0
183 | else:
184 | raise ValueError("Reward signal must be dense or sparse.")
185 |
186 | return reward, done, error
187 |
188 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
189 | """ """
190 | # Send action to hub to receive next state
191 | self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze())
192 | time.sleep(
193 | self.sleep_time
194 | ) # we need to wait some time for sensors to read and to
195 |
196 | # receive the next state
197 | next_observation = self.read_from_hub()
198 |
199 | # calc reward and done
200 | reward, done, error = self.reward(
201 | achieved_state=next_observation,
202 | )
203 |
204 | next_tensordict = TensorDict(
205 | {
206 | self.observation_key: torch.tensor(
207 | next_observation, dtype=torch.float32
208 | ),
209 | self.goal_observation_key: torch.tensor(
210 | self.goal_observation, dtype=torch.float32
211 | ),
212 | "reward": torch.tensor([reward]).float(),
213 | "done": torch.tensor([done]).bool(),
214 | "error": torch.tensor([error]).float(),
215 | },
216 | batch_size=[1],
217 | )
218 |
219 | # increment episode step counter
220 | self.episode_step_iter += 1
221 | if self.episode_step_iter >= self.max_episode_steps:
222 | next_tensordict.set("done", torch.tensor([True]))
223 | return next_tensordict
224 |
--------------------------------------------------------------------------------
/bricksrl/environments/roboarm_v0/client.py:
--------------------------------------------------------------------------------
1 | import ustruct
2 | from micropython import kbd_intr
3 | from pybricks.hubs import InventorHub
4 | from pybricks.parameters import Port
5 | from pybricks.pupdevices import Motor
6 | from pybricks.tools import wait
7 | from uselect import poll
8 | from usys import stdin, stdout
9 |
10 | kbd_intr(-1)
11 |
12 | hub = InventorHub()
13 |
14 | # Initialize and set the motors
15 | grab_motor_range = (-148, -45)
16 | grab_motor = Motor(Port.E)
17 | grab_motor.run_target(speed=400, target_angle=-95) # start roughly in the middle
18 |
19 | high_motor_range = (-150, 10)
20 | high_motor = Motor(Port.A)
21 | high_motor.run_target(speed=400, target_angle=-70)
22 |
23 | low_motor_range = (10, 70)
24 | low_motor = Motor(Port.D)
25 | low_motor.control.limits(500, 1000, 900)
26 | low_motor.run_target(speed=400, target_angle=40)
27 |
28 | rotation_motor = Motor(Port.B, gears=[20, 60])
29 |
30 | motors = {"GM": grab_motor, "HM": high_motor, "LM": low_motor, "RM": rotation_motor}
31 |
32 |
33 | def get_current_motor_angles():
34 | angles = {}
35 | for k, v in motors.items():
36 | angle = normalize_angle(get_angle(v))
37 | angles.update({k: angle})
38 | return angles
39 |
40 |
41 | def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360):
42 | # Normalize angle to be within -179 to 179 degrees
43 | while angle <= low_angle:
44 | angle += original_one_round
45 | while angle > high_angle:
46 | angle -= original_one_round
47 | return angle
48 |
49 |
50 | def run_angle(motor, angle, speed=300):
51 | motor.run_angle(speed=speed, rotation_angle=angle, wait=False)
52 |
53 |
54 | def get_angle(motor):
55 | return motor.angle()
56 |
57 |
58 | def transform_range(value, old_min, old_max, new_min, new_max):
59 | """
60 | Transform a value from one range to another.
61 |
62 | Parameters:
63 | value (float): The value to transform.
64 | old_min (float): The minimum value of the old range.
65 | old_max (float): The maximum value of the old range.
66 | new_min (float): The minimum value of the new range.
67 | new_max (float): The maximum value of the new range.
68 |
69 | Returns:
70 | float: The transformed value.
71 | """
72 | # Compute the scale factor between the old and new ranges
73 | scale = (new_max - new_min) / (old_max - old_min)
74 | # Apply the transformation
75 | return new_min + (value - old_min) * scale
76 |
77 |
78 | keyboard = poll()
79 | keyboard.register(stdin)
80 | motor_speed = 250
81 |
82 | while True:
83 |
84 | while not keyboard.poll(0):
85 | wait(1)
86 |
87 | # Read action values for the motors
88 | data = stdin.buffer.read(16) # Reading 4 bytes (4 floats)
89 | rotation_action, low_action, high_action, grab_action = ustruct.unpack(
90 | "!ffff", data
91 | )
92 |
93 | # Transform action range for motors
94 | grab_action = transform_range(grab_action, -1, 1, -25, 25)
95 | high_action = transform_range(high_action, -1, 1, -60, 60)
96 | low_action = transform_range(low_action, -1, 1, -30, 30)
97 | rotation_action = transform_range(rotation_action, -1, 1, -100, 100)
98 |
99 | angles = get_current_motor_angles()
100 |
101 | # Adjust grab action to ensure it stays within range after being applied
102 | if angles["GM"] + grab_action > max(grab_motor_range):
103 | grab_action = max(grab_motor_range) - angles["GM"]
104 | elif angles["GM"] + grab_action < min(grab_motor_range):
105 | grab_action = min(grab_motor_range) - angles["GM"]
106 | grab_motor.run_angle(speed=motor_speed, rotation_angle=grab_action, wait=False)
107 |
108 | # Adjust high action to ensure it stays within range after being applied
109 | if angles["HM"] + high_action > max(high_motor_range):
110 | high_action = max(high_motor_range) - angles["HM"]
111 | elif angles["HM"] + high_action < min(high_motor_range):
112 | high_action = min(high_motor_range) - angles["HM"]
113 | high_motor.run_angle(speed=motor_speed, rotation_angle=high_action, wait=False)
114 |
115 | # Adjust low action to ensure it stays within range after being applied
116 | if angles["LM"] + low_action > max(low_motor_range):
117 | low_action = max(low_motor_range) - angles["LM"]
118 | elif angles["LM"] + low_action < min(low_motor_range):
119 | low_action = min(low_motor_range) - angles["LM"]
120 | low_motor.run_angle(speed=motor_speed, rotation_angle=low_action, wait=False)
121 | rotation_motor.run_angle(
122 | speed=motor_speed, rotation_angle=rotation_action, wait=False
123 | )
124 |
125 | # Small delay to let the motors reach the target angle
126 | wait(250)
127 |
128 | # Sometimes the low angle jumps out of range and can't move back; this corrects those cases
129 | if low_motor.angle() < 10:
130 | low_motor.run_target(speed=200, target_angle=10)
131 |
132 | # Read sensors to get current state of the robot
133 | rotation_angle = rotation_motor.angle()
134 | high_angle = high_motor.angle()
135 | grab_angle = grab_motor.angle()
136 | low_angle = low_motor.angle()
137 |
138 | # Send current state back to environment
139 | out_msg = ustruct.pack(
140 | "!ffff", grab_angle, high_angle, low_angle, normalize_angle(rotation_angle)
141 | )
142 | stdout.buffer.write(out_msg)
143 |
--------------------------------------------------------------------------------
/bricksrl/environments/runaway_v0/RunAwayEnv.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Tuple
3 |
4 | import numpy as np
5 | import torch
6 | from bricksrl.environments.base.base_env import BaseEnv
7 | from tensordict import TensorDict, TensorDictBase
8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec
9 |
10 |
11 | class RunAwayEnv_v0(BaseEnv):
12 | """
13 | A reinforcement learning environment for training agents to get away from a wall.
14 |
15 | The goal of the agent is to increase the distance measured by an ultrasonic sensor and get away from the wall as fast as possible.
16 | The environment provides a state consisting of 4 sensor readings (left, right, pitch, roll) and the distance to the wall.
17 | The agent can take a continuous action in the range [-1, 1] to control the movement of the robot.
18 | The environment returns a reward based on the change in distance to the wall and terminates the episode if the robot gets too close to the wall or the maximum number of steps is reached.
19 |
20 | Args:
21 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 10.
22 | min_distance (float): The minimum distance to the wall. Defaults to 40.
23 | sleep_time (float): The time to wait between sending actions and receiving the next state. Defaults to 0.2.
24 | verbose (bool): Whether to print verbose information during the environment's execution. Defaults to False.
25 |
26 | """
27 |
28 | action_dim = 1 # control the wheel motors together
29 | # 5 sensors (left motor angle, right motor angle, pitch, roll, distance)
30 | state_dim = 5
31 |
32 | observation_ranges = {
33 | "left_motor_angles": [0, 360],
34 | "right_motor_angles": [0, 360],
35 | "roll_angle": [-90, 90],
36 | "pitch_angle": [-90, 90],
37 | "distance": [0, 2000],
38 | }
39 |
40 | observation_key = "observation"
41 |
42 | def __init__(
43 | self,
44 | max_episode_steps: int = 10,
45 | min_distance: float = 40,
46 | sleep_time: float = 0.2,
47 | verbose: bool = False,
48 | pretrain: bool = False,
49 | ):
50 | self.sleep_time = sleep_time
51 | self.min_distance = min_distance
52 | self.max_episode_steps = max_episode_steps
53 | self._batch_size = torch.Size([1])
54 |
55 | # Define action spec
56 | self.action_spec = BoundedTensorSpec(
57 | low=-1,
58 | high=1,
59 | shape=(1, self.action_dim),
60 | )
61 |
62 | # Define observation spec
63 | bounds = torch.tensor(
64 | [
65 | self.observation_ranges["left_motor_angles"],
66 | self.observation_ranges["right_motor_angles"],
67 | self.observation_ranges["roll_angle"],
68 | self.observation_ranges["pitch_angle"],
69 | self.observation_ranges["distance"],
70 | ]
71 | )
72 |
73 | low_bounds = bounds[:, 0].unsqueeze(0)
74 | high_bounds = bounds[:, 1].unsqueeze(0)
75 |
76 | observation_spec = BoundedTensorSpec(
77 | low=low_bounds,
78 | high=high_bounds,
79 | )
80 | self.observation_spec = CompositeSpec(
81 | {self.observation_key: observation_spec}, shape=(1,)
82 | )
83 | self.verbose = verbose
84 | super().__init__(
85 | action_dim=self.action_dim,
86 | state_dim=self.state_dim,
87 | verbose=verbose,
88 | use_hub=1 - pretrain,
89 | )
90 |
91 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
92 | """
93 | Reset the environment and return the initial state.
94 |
95 | Returns:
96 | TensorDictBase: The initial state of the environment.
97 | """
98 |         # TODO: avoid sending this placeholder action just to receive the first state
99 | self.episode_step_iter = 0
100 | if tensordict is not None:
101 | action = tensordict.get("action").cpu().numpy().squeeze(0)
102 | else:
103 | action = np.zeros(self.action_dim)
104 | self.send_to_hub(action)
105 | time.sleep(self.sleep_time)
106 | observation = self.read_from_hub()
107 | self.distance = observation[:, -1]
108 | return TensorDict(
109 | {
110 | self.observation_key: torch.tensor(observation, dtype=torch.float32),
111 | "distance": torch.tensor([self.distance]).float(),
112 | },
113 | batch_size=[1],
114 | )
115 |
116 |     def reward(self, next_observation: np.ndarray) -> Tuple[float, bool]:
117 | """Reward function of RunAwayEnv.
118 |
119 |         Goal: increase the distance measured by the ultrasonic sensor,
120 |         i.e. get away from the wall as fast as possible.
121 |
122 | """
123 | done = False
124 |
125 | current_distance = next_observation[:, -1]
126 |         if current_distance <= self.min_distance:  # too close to the wall: end the episode
127 | done = True
128 | reward = 0.0
129 | elif current_distance < self.distance:
130 | reward = -1.0
131 | elif current_distance > self.distance:
132 | reward = 1.0
133 | else:
134 | reward = 0.0
135 | if self.distance >= 2000:
136 | done = True
137 | self.distance = current_distance
138 | return reward, done
139 |
140 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
141 |         """Send the action to the hub, read the next state, and compute reward and done."""
142 | # Send action to hub to receive next state
143 | self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze(0))
144 |         # wait some time for the sensors to read, then receive the next state
145 |         time.sleep(self.sleep_time)
146 | 
147 |         next_observation = self.read_from_hub()
148 |
149 | # calc reward and done
150 | reward, done = self.reward(
151 | next_observation=next_observation,
152 | )
153 |
154 | next_tensordict = TensorDict(
155 | {
156 | self.observation_key: torch.tensor(
157 | next_observation, dtype=torch.float32
158 | ),
159 | "reward": torch.tensor([reward]).float(),
160 | "done": torch.tensor([done]).bool(),
161 | "distance": torch.tensor([self.distance]).float(),
162 | },
163 | batch_size=[1],
164 | )
165 |
166 | # increment episode step counter
167 | self.episode_step_iter += 1
168 | if self.episode_step_iter >= self.max_episode_steps:
169 | next_tensordict.set("done", torch.tensor([True]))
170 | return next_tensordict
171 |
--------------------------------------------------------------------------------
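The reward rule in `RunAwayEnv_v0.reward` is simple enough to verify by hand; the following standalone sketch (illustrative only, not part of the package) restates the same cases:

```python
def runaway_reward(prev_distance: float, current_distance: float,
                   min_distance: float = 40.0):
    """+1 if the robot moved away from the wall, -1 if it moved closer, 0 otherwise;
    the episode ends if it gets too close or the previous reading hit the sensor maximum."""
    if current_distance <= min_distance:
        return 0.0, True
    if current_distance < prev_distance:
        reward = -1.0
    elif current_distance > prev_distance:
        reward = 1.0
    else:
        reward = 0.0
    return reward, prev_distance >= 2000

# Moving from 500 mm to 650 mm away from the wall is rewarded.
assert runaway_reward(500, 650) == (1.0, False)
# Dropping below the minimum distance terminates the episode without reward.
assert runaway_reward(100, 35) == (0.0, True)
```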
/bricksrl/environments/runaway_v0/client.py:
--------------------------------------------------------------------------------
1 | import ustruct
2 | from micropython import kbd_intr
3 | from pybricks.hubs import InventorHub
4 | from pybricks.parameters import Direction, Port
5 | from pybricks.pupdevices import Motor, UltrasonicSensor
6 | from pybricks.robotics import DriveBase
7 | from pybricks.tools import wait
8 | from uselect import poll
9 | from usys import stdin, stdout
10 |
11 | kbd_intr(-1)
12 |
13 |
14 | def normalize_angle(angle):
15 | # Normalize angle to be within 0 and 360
16 | while angle <= 0:
17 | angle += 360
18 | while angle > 360:
19 | angle -= 360
20 | return angle
21 |
22 |
23 | def transform_range(value, old_min, old_max, new_min, new_max):
24 | """
25 | Transform a value from one range to another.
26 |
27 | Parameters:
28 | value (float): The value to transform.
29 | old_min (float): The minimum value of the old range.
30 | old_max (float): The maximum value of the old range.
31 | new_min (float): The minimum value of the new range.
32 | new_max (float): The maximum value of the new range.
33 |
34 | Returns:
35 | float: The transformed value.
36 | """
37 | # Compute the scale factor between the old and new ranges
38 | scale = (new_max - new_min) / (old_max - old_min)
39 | # Apply the transformation
40 | return new_min + (value - old_min) * scale
41 |
42 |
43 | kbd_intr(-1)
44 | hub = InventorHub()
45 |
46 | # Initialize the drive base.
47 | left_motor = Motor(Port.E, Direction.COUNTERCLOCKWISE)
48 | right_motor = Motor(Port.A)
49 | drive_base = DriveBase(left_motor, right_motor, wheel_diameter=56, axle_track=130)
50 | # Initialize the distance sensor.
51 | sensor = UltrasonicSensor(Port.C)
52 |
53 | keyboard = poll()
54 | keyboard.register(stdin)
55 |
56 | while True:
57 |
58 | # Optional: Check available input.
59 | while not keyboard.poll(0):
60 | wait(1)
61 |
62 | # Read action values for the motors
63 | action_value = ustruct.unpack("!f", stdin.buffer.read(4))[0]
64 | action = transform_range(action_value, -1, 1, -100, 100)
65 |
66 | drive_base.straight(action, wait=True)
67 |
68 | # Read sensors to get current state of the robot
69 | (left, right) = (left_motor.angle(), right_motor.angle())
70 | (pitch, roll) = hub.imu.tilt()
71 | dist = sensor.distance()
72 |
73 | # Send current state back to environment
74 | out_msg = ustruct.pack(
75 | "!fffff", normalize_angle(left), normalize_angle(right), pitch, roll, dist
76 | )
77 | stdout.buffer.write(out_msg)
78 |
--------------------------------------------------------------------------------
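Note how the message formats on both sides have to match: the environment packs a single action float (`"!f"`, 4 bytes) and the hub answers with five floats (`"!fffff"`, 20 bytes). A host-side sketch of this round trip using the standard `struct` module (the hub-side code above uses MicroPython's `ustruct` equivalent); the numbers are dummy values:

```python
import struct

# Encode one action value exactly as the environment sends it to the hub.
action = 0.5
payload = struct.pack("!f", action)
assert len(payload) == 4

# Decode the five-float state message written back by the hub:
# (left angle, right angle, pitch, roll, distance).
state_msg = struct.pack("!fffff", 90.0, 180.0, 1.0, -2.0, 350.0)
left, right, pitch, roll, dist = struct.unpack("!fffff", state_msg)
assert len(state_msg) == 20 and dist == 350.0
```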
/bricksrl/environments/spinning_v0/SpinningEnv.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Tuple
3 |
4 | import numpy as np
5 | import torch
6 | from bricksrl.environments.base.base_env import BaseEnv
7 | from tensordict import TensorDict, TensorDictBase
8 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec
9 |
10 |
11 | class SpinningEnv_v0(BaseEnv):
12 | """
13 | SpinningEnv_v0 is a custom gym environment for a spinning robot.
14 | The robot has to learn to spin in a circle around its own axis given a random goal direction (left or right, 0 or 1).
15 |
16 | Args:
17 | max_episode_steps (int): The maximum number of steps per episode. Defaults to 50.
18 | sleep_time (float): The time to wait between sending actions and receiving the next state. Defaults to 0.2.
19 | verbose (bool): Whether to print verbose information during the environment's execution. Defaults to False.
20 |
21 | """
22 |
23 | action_dim = 2 # to control the wheel motors independently
24 |     state_dim = 5  # 5 sensor values from the hub (left, right, pitch, roll, rotation_velocity); the goal direction (left or right) is appended in the environment
25 |
26 | observation_ranges = {
27 | "left_motor_angle": [0, 360],
28 | "right_motor_angle": [0, 360],
29 | "pitch_angle": [-90, 90],
30 | "roll_angle": [-90, 90],
31 | "rotation_velocity": [-100, 100],
32 | "direction": [0, 1],
33 | }
34 |
35 | observation_key = "observation"
36 |
37 | def __init__(
38 | self,
39 | max_episode_steps: int = 50,
40 | sleep_time: float = 0.2,
41 | verbose: bool = False,
42 | pretrain: bool = False,
43 | ):
44 | self.sleep_time = sleep_time
45 | self._batch_size = torch.Size([1])
46 | self.max_episode_steps = max_episode_steps
47 |
48 | # Define action spec
49 | self.action_spec = BoundedTensorSpec(
50 | low=-1,
51 | high=1,
52 | shape=(1, self.action_dim),
53 | )
54 |
55 | # Define observation spec
56 | bounds = torch.tensor(
57 | [
58 | self.observation_ranges["left_motor_angle"],
59 | self.observation_ranges["right_motor_angle"],
60 | self.observation_ranges["pitch_angle"],
61 | self.observation_ranges["roll_angle"],
62 | self.observation_ranges["rotation_velocity"],
63 | self.observation_ranges["direction"],
64 | ]
65 | )
66 | low_bounds = bounds[:, 0].unsqueeze(0)
67 | high_bounds = bounds[:, 1].unsqueeze(0)
68 |
69 | observation_spec = BoundedTensorSpec(
70 | low=low_bounds,
71 | high=high_bounds,
72 | )
73 | self.observation_spec = CompositeSpec(
74 | {self.observation_key: observation_spec}, shape=(1,)
75 | )
76 |
77 | super().__init__(
78 | action_dim=self.action_dim,
79 | state_dim=self.state_dim,
80 | verbose=verbose,
81 | use_hub=1 - pretrain,
82 | )
83 |
84 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
85 | """
86 | Reset the environment and return the initial state.
87 |
88 | Returns:
89 | TensorDictBase: The initial state of the environment.
90 | """
91 |         # TODO: avoid sending this placeholder action just to receive the first state
92 | self.episode_step_iter = 0
93 | if tensordict is not None:
94 | action = tensordict.get("action").cpu().numpy().squeeze()
95 | else:
96 | action = np.zeros(self.action_dim)
97 | self.send_to_hub(action)
98 | time.sleep(self.sleep_time)
99 |
100 | state = self.read_from_hub()
101 | self.direction = np.random.randint(0, 2) # (0,1) left or right
102 | full_original_state = np.concatenate(
103 | (state, np.array([[self.direction]])), axis=1, dtype=np.float32
104 | )
105 |
106 | return TensorDict(
107 | {
108 | self.observation_key: torch.tensor(full_original_state),
109 | },
110 | batch_size=[1],
111 | )
112 |
113 |     def reward(self, next_observation: np.ndarray) -> Tuple[float, bool]:
114 | """Reward function of Spinning environment.
115 | If the self.direction is 0, the robot is spinning left, otherwise right.
116 | We want to maximise in those cases the angular velocity (last element of the state vector).
117 | If the robot is spinning in the wrong direction, we want to minimize the angular velocity.
118 | """
119 | done = False
120 | velocity = next_observation[:, -2]
121 |
122 | if self.direction == 0:
123 | reward = velocity
124 | else:
125 | reward = -velocity
126 |
127 | return reward, done
128 |
129 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
130 |         """Send the action to the hub, read the next state, append the goal direction, and compute reward and done."""
131 | # Send action to hub to receive next state
132 | self.send_to_hub(tensordict.get("action").cpu().numpy().squeeze())
133 |         # wait some time for the sensors to read, then receive the next state
134 |         time.sleep(self.sleep_time)
135 | next_observation = self.read_from_hub()
136 | full_original_next_observation = np.concatenate(
137 | (next_observation, np.array([[self.direction]])), axis=1, dtype=np.float32
138 | )
139 | # calc reward and done
140 | reward, done = self.reward(full_original_next_observation)
141 |
142 | next_tensordict = TensorDict(
143 | {
144 | self.observation_key: torch.tensor(full_original_next_observation),
145 | "reward": torch.tensor([reward]).float(),
146 | "done": torch.tensor([done]).bool(),
147 | },
148 | batch_size=[1],
149 | )
150 | # increment episode step counter
151 | self.episode_step_iter += 1
152 | if self.episode_step_iter >= self.max_episode_steps:
153 | next_tensordict.set("done", torch.tensor([True]).bool())
154 |
155 | return next_tensordict
156 |
--------------------------------------------------------------------------------
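Since the hub only reports five sensor values, the sampled goal direction is appended on the Python side before the observation is returned; a shape-only sketch of that concatenation (dummy values, for illustration):

```python
import numpy as np

raw_state = np.zeros((1, 5), dtype=np.float32)   # what read_from_hub() returns
direction = np.random.randint(0, 2)              # 0 = spin left, 1 = spin right
observation = np.concatenate(
    (raw_state, np.array([[direction]], dtype=np.float32)), axis=1
)
assert observation.shape == (1, 6)               # matches the 6-entry observation spec
```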
/bricksrl/environments/spinning_v0/client.py:
--------------------------------------------------------------------------------
1 | import ustruct
2 | from micropython import kbd_intr
3 | from pybricks.hubs import InventorHub
4 | from pybricks.parameters import Axis, Direction, Port
5 | from pybricks.pupdevices import Motor
6 | from pybricks.tools import wait
7 | from uselect import poll
8 | from usys import stdin, stdout
9 |
10 | kbd_intr(-1)
11 | hub = InventorHub()
12 |
13 | # Initialize and set the motors
14 | left_motor = Motor(Port.E, Direction.COUNTERCLOCKWISE)
15 | right_motor = Motor(Port.A)
16 |
17 | keyboard = poll()
18 | keyboard.register(stdin)
19 |
20 |
21 | def normalize_angle(angle):
22 | # Normalize angle to be within 0 and 360
23 | while angle <= 0:
24 | angle += 360
25 | while angle > 360:
26 | angle -= 360
27 | return angle
28 |
29 |
30 | def transform_range(value, old_min, old_max, new_min, new_max):
31 | """
32 | Transform a value from one range to another.
33 |
34 | Parameters:
35 | value (float): The value to transform.
36 | old_min (float): The minimum value of the old range.
37 | old_max (float): The maximum value of the old range.
38 | new_min (float): The minimum value of the new range.
39 | new_max (float): The maximum value of the new range.
40 |
41 | Returns:
42 | float: The transformed value.
43 | """
44 | # Compute the scale factor between the old and new ranges
45 | scale = (new_max - new_min) / (old_max - old_min)
46 | # Apply the transformation
47 | return new_min + (value - old_min) * scale
48 |
49 |
50 | while True:
51 |
52 | while not keyboard.poll(0):
53 | wait(1)
54 |
55 | # Read action values for both motors
56 | data = stdin.buffer.read(8) # Reading 8 bytes (two floats)
57 | left_action_value, right_action_value = ustruct.unpack("!ff", data)
58 |
59 | # Apply action to each motor
60 | left_motor.run_angle(
61 | speed=400,
62 | rotation_angle=transform_range(left_action_value, -1, 1, -100, 100),
63 | wait=False,
64 | )
65 | right_motor.run_angle(
66 | speed=400,
67 | rotation_angle=transform_range(right_action_value, -1, 1, -100, 100),
68 | wait=False,
69 | )
70 |
71 | wait(100) # Small delay
72 |
73 | # Read sensors to get current state of the robot
74 | (left, right) = (left_motor.angle(), right_motor.angle())
75 | (pitch, roll) = hub.imu.tilt()
76 | z_angl_vel = hub.imu.angular_velocity(Axis.Z)
77 |
78 | # Send current state back to environment
79 | out_msg = ustruct.pack(
80 | "!fffff", normalize_angle(left), normalize_angle(right), pitch, roll, z_angl_vel
81 | )
82 | stdout.buffer.write(out_msg)
83 |
--------------------------------------------------------------------------------
/bricksrl/environments/walker_v0/WalkerEnv.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Tuple
3 |
4 | import numpy as np
5 |
6 | import torch
7 |
8 | from bricksrl.environments.base.base_env import BaseEnv
9 | from tensordict import TensorDict, TensorDictBase
10 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec
11 |
12 |
13 | class WalkerEnv_v0(BaseEnv):
14 | """
15 | A reinforcement learning environment for the robodog to learn to walk.
16 |
17 |     Specific to the walker_v0 environment is that the reward function is hard-coded to learn a gait routine.
18 | In contrast to the walker_v1 environment, the reward function is not based on the acceleration of the robot.
19 |
20 | Args:
21 |         max_episode_steps (int): The maximum number of steps per episode. Defaults to 50.
22 | sleep_time (float): The time to wait between sending actions and receiving the next state. Defaults to 0.0.
23 | verbose (bool): Whether to print additional information. Defaults to False.
24 |
25 | """
26 |
27 | action_dim = 4 # (lf_value, lb_value, rf_value, rb_value)
28 | # angles are in range [-179, 179]
29 | state_dim = 7 # (lf_angle, rf_angle, lb_angle, rb_angle, pitch, roll, acc_x)
30 |
31 | observation_ranges = {
32 | "lf_angle": [-179, 179],
33 | "rf_angle": [-179, 179],
34 | "lb_angle": [-179, 179],
35 | "rb_angle": [-179, 179],
36 | "pitch": [-50, 50],
37 | "roll": [-50, 50],
38 | "acc_x": [-3000, 3000],
39 | }
40 |
41 | observation_key = "observation"
42 |
43 | def __init__(
44 | self,
45 | max_episode_steps: int = 50,
46 | sleep_time: float = 0.0,
47 | verbose: bool = False,
48 | pretrain: bool = False,
49 | ):
50 | self.sleep_time = sleep_time
51 | self._batch_size = torch.Size([1])
52 | self.max_episode_steps = max_episode_steps
53 |
54 | # Define action spec
55 | self.action_spec = BoundedTensorSpec(
56 | low=-1,
57 | high=1,
58 | shape=(1, self.action_dim),
59 | )
60 |
61 | # Define observation spec
62 | bounds = torch.tensor(
63 | [
64 | self.observation_ranges["lf_angle"],
65 | self.observation_ranges["rf_angle"],
66 | self.observation_ranges["lb_angle"],
67 | self.observation_ranges["rb_angle"],
68 | self.observation_ranges["pitch"],
69 | self.observation_ranges["roll"],
70 | self.observation_ranges["acc_x"],
71 | ]
72 | )
73 | # Reshape bounds to (1, 7)
74 | low_bounds = bounds[:, 0].unsqueeze(0)
75 | high_bounds = bounds[:, 1].unsqueeze(0)
76 |
77 | observation_spec = BoundedTensorSpec(
78 | low=low_bounds,
79 | high=high_bounds,
80 | shape=(1, self.state_dim),
81 | )
82 |
83 | self.observation_spec = CompositeSpec(
84 | {self.observation_key: observation_spec}, shape=(1,)
85 | )
86 | super().__init__(
87 | action_dim=self.action_dim,
88 | state_dim=self.state_dim,
89 | verbose=verbose,
90 | use_hub=1 - pretrain,
91 | )
92 |
93 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
94 | """
95 | Reset the environment and return the initial state.
96 |
97 | Returns:
98 | TensorDictBase: The initial state of the environment.
99 | """
100 |         # TODO: avoid sending this placeholder action just to receive the first state
101 | self.episode_step_iter = 0
102 | if tensordict is not None:
103 | action = tensordict.get("action").cpu().numpy().squeeze()
104 | else:
105 | action = np.zeros(self.action_dim)
106 | self.send_to_hub(action)
107 | time.sleep(self.sleep_time)
108 | observation = self.read_from_hub()
109 |
110 | return TensorDict(
111 | {
112 | self.observation_key: torch.tensor(observation, dtype=torch.float32),
113 | },
114 | batch_size=[1],
115 | )
116 |
117 | def reward(
118 | self,
119 | action: np.ndarray,
120 | next_state: np.ndarray,
121 | ) -> Tuple[float, bool]:
122 | """Reward function of walker.
123 |
124 | Args:
125 | action (np.ndarray): The action taken.
126 | next_state (np.ndarray): The next state.
127 |
128 | Returns:
129 | Tuple[float, bool]: The reward received and a boolean indicating whether the episode is done.
130 | """
131 |
132 | done = False
133 |         # if pitch or roll leave the safe range (|angle| > 100 here), the episode ends
134 | pitch, roll = next_state[:, -3], next_state[:, -2]
135 | if np.abs(pitch) > 100 or np.abs(roll) > 100:
136 | done = True
137 | reward = 0
138 | return reward, done
139 |
140 | (
141 | lf_angle,
142 | rf_angle,
143 | lb_angle,
144 | rb_angle,
145 | pitch,
146 | roll,
147 | acc_x,
148 | ) = next_state.squeeze()
149 |
150 |         # we want the actions to be large and negative
151 |         # actions are in [-1, 1] over 4 dims -> sum is in [-4, 4] -> divide by 4 to get [-1, 1], then scale down by 10
152 |         action_reward = -np.sum(action) / 4 / 10
153 |         # Disabled: we don't force the actions to be similar, otherwise we can't compensate for noise in the system
154 |         # (this term would push actions towards e.g. [-0.75, -0.75, -0.75, -0.75])
155 | # action_std_reward = -np.std(action)
156 |
157 | # we want lf_angle and rb_angle to be synchronized and rf_angle and lb_angle to be synchronized
158 | # divide by 180 to get in range [-1, 0]
159 | lf_rb_diff_reward = -angular_difference(lf_angle, rb_angle) / 180
160 | rf_lb_diff_reward = -angular_difference(rf_angle, lb_angle) / 180
161 |
162 | # we want lf_rb and rf_lb to be 180° apart
163 | # divide by 180 to get in range [-1, 0]
164 | lf_rf_180_reward = -(180 - angular_difference(lf_angle, rf_angle)) / 180
165 | lb_rb_180_reward = -(180 - angular_difference(lb_angle, rb_angle)) / 180
166 |
167 | if self.verbose:
168 | print("action_reward", action_reward)
169 | # print("action_std_reward", action_std_reward)
170 | print("lf_rb_diff_reward", lf_rb_diff_reward)
171 | print("rf_lb_diff_reward", rf_lb_diff_reward)
172 | print("lf_rf_180_reward", lf_rf_180_reward)
173 |
174 | reward = (
175 | action_reward
176 | # + action_std_reward
177 | + lf_rb_diff_reward
178 | + rf_lb_diff_reward
179 | + lf_rf_180_reward
180 | + lb_rb_180_reward
181 | )
182 |
183 | return reward.item(), done
184 |
185 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
186 |         """Send the action to the hub, read the next state, and compute reward and done."""
187 | # Send action to hub to receive next state
188 | action = tensordict.get("action").cpu().numpy().squeeze()
189 | self.send_to_hub(action)
190 |         # wait some time for the sensors to read, then receive the next state
191 |         time.sleep(self.sleep_time)
192 | next_observation = self.read_from_hub()
193 |
194 | # calc reward and done
195 | reward, done = self.reward(
196 | action=action,
197 | next_state=next_observation,
198 | )
199 | next_tensordict = TensorDict(
200 | {
201 | self.observation_key: torch.tensor(
202 | next_observation, dtype=torch.float32
203 | ),
204 | "reward": torch.tensor([reward]).float(),
205 | "done": torch.tensor([done]).bool(),
206 | },
207 | batch_size=[1],
208 | )
209 |
210 | # increment episode step counter
211 | self.episode_step_iter += 1
212 | if self.episode_step_iter >= self.max_episode_steps:
213 | next_tensordict.set("done", torch.tensor([True]))
214 | return next_tensordict
215 |
216 |
217 | def angular_difference(angle1, angle2):
218 | # Calculate the difference in angles, wrapped between -180 and 180
219 | difference = (angle2 - angle1 + 180) % 360 - 180
220 | return abs(difference) # Return the absolute value of the difference
221 |
--------------------------------------------------------------------------------
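The `angular_difference` helper wraps the raw difference into [-180, 180] before taking its absolute value, so leg angles near the ±180° seam are compared correctly; a few quick checks (illustrative only):

```python
def angular_difference(angle1, angle2):
    # Same formula as above: wrap into [-180, 180], then take the magnitude.
    difference = (angle2 - angle1 + 180) % 360 - 180
    return abs(difference)

assert angular_difference(10, 30) == 20      # plain difference
assert angular_difference(170, -170) == 20   # wraps across the seam instead of returning 340
assert angular_difference(0, 180) == 180     # maximum possible separation
```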
/bricksrl/environments/walker_v0/WalkerEnvSim.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import numpy as np
4 |
5 | import torch
6 |
7 | from bricksrl.environments.base.base_env import BaseSimEnv
8 | from tensordict import TensorDict, TensorDictBase
9 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec
10 |
11 |
12 | class WalkerEnvSim_v0(BaseSimEnv):
13 |     """Simulated Walker-v0 environment: actions update internal leg angles (plus noise); no hub connection is required."""
14 |
15 | action_dim = 4 # (lf_value, lb_value, rf_value, rb_value)
16 | # angles are in range [-179, 179]
17 | state_dim = 7 # (lf_angle, rf_angle, lb_angle, rb_angle, pitch, roll, acc_x)
18 |
19 | observation_ranges = {
20 | "lf_angle": [-179, 179],
21 | "rf_angle": [-179, 179],
22 | "lb_angle": [-179, 179],
23 | "rb_angle": [-179, 179],
24 | "pitch": [-75, 75],
25 | "roll": [-75, 75],
26 | "acc_x": [-3000, 3000],
27 | }
28 |
29 | observation_key = "observation"
30 |
31 | def __init__(
32 | self,
33 | max_episode_steps: int = 50,
34 | noise: float = 0.1,
35 | low_action_angle: int = -100,
36 | high_action_angle: int = 0,
37 | verbose: bool = False,
38 | ):
39 | self._batch_size = torch.Size([1])
40 | self.max_episode_steps = max_episode_steps
41 | self.noise = noise
42 | self.low_action_angle = low_action_angle
43 | self.high_action_angle = high_action_angle
44 | self.current_leg_angles = None
45 |
46 | # Define action spec
47 | self.action_spec = BoundedTensorSpec(
48 | low=-1,
49 | high=1,
50 | shape=(1, self.action_dim),
51 | )
52 |
53 | # Define observation spec
54 | bounds = torch.tensor(
55 | [
56 | self.observation_ranges["lf_angle"],
57 | self.observation_ranges["rf_angle"],
58 | self.observation_ranges["lb_angle"],
59 | self.observation_ranges["rb_angle"],
60 | self.observation_ranges["pitch"],
61 | self.observation_ranges["roll"],
62 | self.observation_ranges["acc_x"],
63 | ]
64 | )
65 | # Reshape bounds to (1, 7)
66 | low_bounds = bounds[:, 0].unsqueeze(0)
67 | high_bounds = bounds[:, 1].unsqueeze(0)
68 | observation_spec = BoundedTensorSpec(
69 | low=low_bounds,
70 | high=high_bounds,
71 | )
72 |
73 | self.observation_spec = CompositeSpec(
74 | {self.observation_key: observation_spec}, shape=(1,)
75 | )
76 | super().__init__(
77 | action_dim=self.action_dim,
78 | state_dim=self.state_dim,
79 | verbose=verbose,
80 | use_hub=False,
81 | )
82 |
83 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
84 | """
85 | Reset the environment and return the initial state.
86 |
87 | Returns:
88 | TensorDictBase: The initial state of the environment.
89 | """
90 |         # Unlike the real environment, no action needs to be sent to a hub before reading the first state
91 | self.episode_step_iter = 0
92 |
93 | observation = self.observation_spec[self.observation_key].rand()
94 | self.current_leg_angles = observation[0, :4]
95 | return TensorDict(
96 | {
97 | self.observation_key: observation,
98 | },
99 | batch_size=[1],
100 | )
101 |
102 | def reward(
103 | self,
104 | action: np.ndarray,
105 | next_state: np.ndarray,
106 | ) -> Tuple[float, bool]:
107 | """Reward function of walker.
108 |
109 | Args:
110 | action (np.ndarray): The action taken.
111 | next_state (np.ndarray): The next state.
112 |
113 | Returns:
114 | Tuple[float, bool]: The reward received and a boolean indicating whether the episode is done.
115 | """
116 |
117 | done = False
118 |         # if pitch or roll leave the safe range (|angle| > 100 here), the episode ends
119 | pitch, roll = next_state[:, -3], next_state[:, -2]
120 | if np.abs(pitch) > 100 or np.abs(roll) > 100:
121 | done = True
122 | reward = 0
123 | return reward, done
124 |
125 | (
126 | lf_angle,
127 | rf_angle,
128 | lb_angle,
129 | rb_angle,
130 | pitch,
131 | roll,
132 | acc_x,
133 | ) = next_state.squeeze()
134 |
135 |         # we want the actions to be large and negative
136 |         # actions are in [-1, 1] over 4 dims -> sum is in [-4, 4] -> divide by 4 to get [-1, 1], then scale down by 10
137 |         action_reward = -np.sum(action) / 4 / 10
138 |         # Disabled: we don't force the actions to be similar, otherwise we can't compensate for noise in the system
139 |         # (this term would push actions towards e.g. [-0.75, -0.75, -0.75, -0.75])
140 | # action_std_reward = -np.std(action)
141 |
142 | # we want lf_angle and rb_angle to be synchronized and rf_angle and lb_angle to be synchronized
143 | # divide by 180 to get in range [-1, 0]
144 | lf_rb_diff_reward = -angular_difference(lf_angle, rb_angle) / 180
145 | rf_lb_diff_reward = -angular_difference(rf_angle, lb_angle) / 180
146 |
147 | # we want lf_rb and rf_lb to be 180° apart
148 | # divide by 180 to get in range [-1, 0]
149 | lf_rf_180_reward = -(180 - angular_difference(lf_angle, rf_angle)) / 180
150 | lb_rb_180_reward = -(180 - angular_difference(lb_angle, rb_angle)) / 180
151 |
152 | if self.verbose:
153 | print("action_reward", action_reward)
154 | # print("action_std_reward", action_std_reward)
155 | print("lf_rb_diff_reward", lf_rb_diff_reward)
156 | print("rf_lb_diff_reward", rf_lb_diff_reward)
157 | print("lf_rf_180_reward", lf_rf_180_reward)
158 |
159 | reward = (
160 | action_reward
161 | # + action_std_reward
162 | + lf_rb_diff_reward
163 | + rf_lb_diff_reward
164 | + lf_rf_180_reward
165 | + lb_rb_180_reward
166 | )
167 |
168 | return reward.item(), done
169 |
170 | @staticmethod
171 | def transform_range(value, old_min, old_max, new_min, new_max):
172 | """
173 | Transform a value from one range to another.
174 |
175 | Parameters:
176 | value (float): The value to transform.
177 | old_min (float): The minimum value of the old range.
178 | old_max (float): The maximum value of the old range.
179 | new_min (float): The minimum value of the new range.
180 | new_max (float): The maximum value of the new range.
181 |
182 | Returns:
183 | float: The transformed value.
184 | """
185 | # Compute the scale factor between the old and new ranges
186 | scale = (new_max - new_min) / (old_max - old_min)
187 | # Apply the transformation
188 | return new_min + (value - old_min) * scale
189 |
190 | @staticmethod
191 | def normalize_angle(angle, low_angle=-180, high_angle=179, original_one_round=360):
192 | # Normalize angle to be within -179 to 179 degrees
193 | while angle <= low_angle:
194 | angle += original_one_round
195 | while angle > high_angle:
196 | angle -= original_one_round
197 | return angle
198 |
199 | def apply_action(self, action: np.ndarray) -> np.ndarray:
200 |
201 | noise = np.random.normal(0, self.noise, size=4)
202 | action += noise
203 |
204 | lf_value, lb_value, rf_value, rb_value = action
205 | # transform action range for motors
206 | lf_action = self.transform_range(
207 | lf_value, -1, 1, self.low_action_angle, self.high_action_angle
208 | )
209 | lb_action = self.transform_range(
210 | lb_value, -1, 1, self.low_action_angle, self.high_action_angle
211 | )
212 |         rf_action = self.transform_range(
213 | rf_value, -1, 1, self.low_action_angle, self.high_action_angle
214 | )
215 | rb_action = self.transform_range(
216 | rb_value, -1, 1, self.low_action_angle, self.high_action_angle
217 | )
218 |
219 | (
220 | lf_angle,
221 | rf_angle,
222 | lb_angle,
223 | rb_angle,
224 | ) = self.current_leg_angles.squeeze()
225 |
226 | new_lf_angle = self.normalize_angle(lf_angle + lf_action)
227 | new_lb_angle = self.normalize_angle(lb_angle + lb_action)
228 |         new_rf_angle = self.normalize_angle(rf_angle + rf_action)
229 | new_rb_angle = self.normalize_angle(rb_angle + rb_action)
230 |
231 | self.current_leg_angles = np.array(
232 | [
233 | [
234 | new_lf_angle,
235 | new_rf_angle,
236 | new_lb_angle,
237 | new_rb_angle,
238 | ]
239 | ],
240 | dtype=np.float32,
241 | )
242 | return self.current_leg_angles
243 |
244 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
245 |         """Apply the action to the simulated leg angles and compute reward and done."""
246 |         # Get the action from the tensordict (no hub in the simulated environment)
247 | action = tensordict.get("action").cpu().numpy().squeeze()
248 |
249 | # receive the next state
250 | next_observation = self.apply_action(action)
251 |
252 | # add zeros for pitch, roll and acc_x
253 | next_observation = np.concatenate(
254 | (next_observation, np.zeros((1, 3))), axis=1, dtype=np.float32
255 | )
256 |
257 | # calc reward and done
258 | reward, done = self.reward(
259 | action=action,
260 | next_state=next_observation,
261 | )
262 | next_tensordict = TensorDict(
263 | {
264 | self.observation_key: next_observation,
265 | "reward": torch.tensor([reward]).float(),
266 | "done": torch.tensor([done]).bool(),
267 | },
268 | batch_size=[1],
269 | )
270 |
271 | # increment episode step counter
272 | self.episode_step_iter += 1
273 | if self.episode_step_iter >= self.max_episode_steps:
274 | next_tensordict.set("done", torch.tensor([True]))
275 | return next_tensordict
276 |
277 |
278 | def angular_difference(angle1, angle2):
279 | # Calculate the difference in angles, wrapped between -180 and 180
280 | difference = (angle2 - angle1 + 180) % 360 - 180
281 | return abs(difference) # Return the absolute value of the difference
282 |
--------------------------------------------------------------------------------
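Because `WalkerEnvSim_v0` needs no hub connection, it is handy for quick smoke tests. A minimal random-rollout sketch, assuming `bricksrl` and `torchrl` are installed and using torchrl's generic rollout API:

```python
from bricksrl.environments.walker_v0.WalkerEnvSim import WalkerEnvSim_v0

# Roll out a few random actions; no robot or Bluetooth connection is required.
env = WalkerEnvSim_v0(max_episode_steps=10, noise=0.1)
rollout = env.rollout(max_steps=10)            # torchrl samples random actions from action_spec
print(rollout["next", "reward"].squeeze())     # per-step rewards of the random gait
```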
/bricksrl/environments/walker_v0/client.py:
--------------------------------------------------------------------------------
1 | # NOTE: Run this program with the latest
2 | # firmware provided via https://beta.pybricks.com/
3 |
4 | import umath
5 | import ustruct
6 | from micropython import kbd_intr
7 | from pybricks.hubs import InventorHub
8 | from pybricks.parameters import Axis, Direction, Port
9 | from pybricks.pupdevices import Motor, UltrasonicSensor
10 | from pybricks.tools import wait
11 | from uselect import poll
12 |
13 | # Standard MicroPython modules
14 | from usys import stdin, stdout
15 |
16 | kbd_intr(-1)
17 | hub = InventorHub()
18 |
19 | # Initialize and set the motors
20 | lf_motor = Motor(Port.D, Direction.COUNTERCLOCKWISE)
21 | lb_motor = Motor(Port.B, Direction.COUNTERCLOCKWISE)
22 | rf_motor = Motor(Port.C)
23 | rb_motor = Motor(Port.A)
24 |
25 | # Init additional sensor
26 | eyes = UltrasonicSensor(Port.E)
27 |
28 | # Setup poll
29 | keyboard = poll()
30 | keyboard.register(stdin)
31 |
32 |
33 | def normalize_angle(angle):
34 | # Normalize angle to be within -179 to 179 degrees
35 | while angle <= -180:
36 | angle += 360
37 | while angle > 179:
38 | angle -= 360
39 | return angle
40 |
41 |
42 | def transform_range(value, old_min, old_max, new_min, new_max):
43 | """
44 | Transform a value from one range to another.
45 |
46 | Parameters:
47 | value (float): The value to transform.
48 | old_min (float): The minimum value of the old range.
49 | old_max (float): The maximum value of the old range.
50 | new_min (float): The minimum value of the new range.
51 | new_max (float): The maximum value of the new range.
52 |
53 | Returns:
54 | float: The transformed value.
55 | """
56 | # Compute the scale factor between the old and new ranges
57 | scale = (new_max - new_min) / (old_max - old_min)
58 | # Apply the transformation
59 | return new_min + (value - old_min) * scale
60 |
61 |
62 | # Setting default values and ranges
63 | low_angle = -100 # 270
64 | high_angle = 0
65 | speed = 600
66 |
67 | while True:
68 |
69 | while not keyboard.poll(0):
70 | wait(1)
71 |
72 | # Read action values for the motors
73 | data = stdin.buffer.read(16) # Reading 16 bytes (4 floats)
74 | lf_value, lb_value, rf_value, rb_value = ustruct.unpack("!ffff", data)
75 |
76 |     # Apply actions. The motor angle range is [-180, 179] and action outputs are in [-1, 1], so we transform the actions first.
77 | lb_motor.run_angle(
78 | speed=speed,
79 | rotation_angle=transform_range(lb_value, -1, 1, low_angle, high_angle),
80 | wait=False,
81 | )
82 | lf_motor.run_angle(
83 | speed=speed,
84 | rotation_angle=transform_range(lf_value, -1, 1, low_angle, high_angle),
85 | wait=False,
86 | )
87 | rb_motor.run_angle(
88 | speed=speed,
89 | rotation_angle=transform_range(rb_value, -1, 1, low_angle, high_angle),
90 | wait=False,
91 | )
92 | rf_motor.run_angle(
93 | speed=speed,
94 | rotation_angle=transform_range(rf_value, -1, 1, low_angle, high_angle),
95 | wait=False,
96 | )
97 |
98 |     # Small delay to let the motors reach their target angles
99 | wait(250) # 250
100 |
101 | # Read sensors to get current state of the robot
102 | a_x = hub.imu.acceleration(Axis.X)
103 | (lf_angle, rf_angle) = (lf_motor.angle(), rf_motor.angle())
104 | (lb_angle, rb_angle) = (lb_motor.angle(), rb_motor.angle())
105 | (pitch, roll) = hub.imu.tilt()
106 | dist = eyes.distance()
107 |
108 | if umath.fabs(pitch) > 90 or umath.fabs(roll) > 120 or dist <= 40:
109 | hub.display.text(text="Help", on=500, off=50)
110 |
111 | # Send current state back to environment
112 | out_msg = ustruct.pack(
113 | "!fffffff",
114 | normalize_angle(lf_angle),
115 | normalize_angle(rf_angle),
116 | normalize_angle(lb_angle),
117 | normalize_angle(rb_angle),
118 | pitch,
119 | roll,
120 | a_x,
121 | )
122 | stdout.buffer.write(out_msg)
123 |
--------------------------------------------------------------------------------
/conf/README.md:
--------------------------------------------------------------------------------
1 | # Configuration Details for BricksRL Experiments
2 |
3 | ## Overview
4 | This directory contains all the necessary configuration files to tailor your experiments using BricksRL. Configurations are managed using [Hydra](https://hydra.cc/), a powerful tool for configuring complex applications that allows for easy modification of parameters directly from the command line.
5 |
6 | ## Configuration Files
7 | - **config.yaml**: The base configuration for all experiments including what agent and environment to run.
8 | - **env/**: Contains environment-specific configurations.
9 | - **runaway-v0.yaml**: Settings for the *RunAway-v0* environment for the 2wheeler robot.
10 | - **spinning-v0.yaml**: Settings for the *Spinning-v0* environment for the 2wheeler robot.
11 | - **walker-v0.yaml**: Settings for the *Walker-v0* environment for the walker robot.
12 | - **walker_sim-v0.yaml**: Settings for the *WalkerSim-v0* environment for the walker robot.
13 | - **roboarm-v0.yaml**: Settings for the *RoboArm-v0* environment for the roboarm robot.
14 | - **roboarm_sim-v0.yaml**: Settings for the *RoboArmSim-v0* environment for the roboarm robot.
15 | - **roboarm_mixed-v0.yaml**: Settings for the *RoboArmMixed-v0* environment for the roboarm robot.
16 | - **agent/**: Contains agent-specific configurations.
17 | - **sac.yaml**: Configuration for the SAC agent.
18 | - **td3.yaml**: Configuration for the TD3 agent.
19 | - **droq.yaml**: Configuration for the DroQ agent.
20 |
21 | ## Using Hydra for Configuration Overrides
22 | Hydra allows you to override any configuration parameter directly from the terminal when you run your experiments. This makes it easy to test different configurations without altering your configuration files.
23 |
24 | ### Example Usage
25 | To run an experiment with the walker environment using the SAC agent and specify the number of episodes directly from the command line, you can use the following command:
26 |
27 | ```bash
28 | python experiments/walker/train.py episodes=200 agent=sac
29 | ```
30 | This command temporarily overrides the episodes and agent parameters for this specific run without needing to change the configuration files.
31 |
32 | You can further override agent- or environment-specific parameters, for example:
33 |
34 | ```bash
35 | python experiments/walker/train.py agent=sac agent.batch_size=32
36 | ```
37 |
38 | or
39 |
40 | ```bash
41 | python experiments/walker/train.py env.max_episode_steps=200 env.frame_stack=4
42 | ```
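43 | 
44 | Hydra's multirun mode (`-m` / `--multirun`) can also be used to sweep over several values in one go, for example to compare agents (adjust the parameters to your setup):
45 | 
46 | ```bash
47 | python experiments/walker/train.py -m agent=sac,td3 episodes=100
48 | ```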
--------------------------------------------------------------------------------
/conf/agent/bc.yaml:
--------------------------------------------------------------------------------
1 | name: bc
2 | lr: 3e-4
3 | batch_size: 256
4 | num_updates: 1
5 | prefill_episodes: 0
6 |
7 |
8 | policy_type: deterministic # stochastic or deterministic
9 | num_cells: 256
10 | dropout: 0.01
11 | normalization: LayerNorm
12 |
--------------------------------------------------------------------------------
/conf/agent/cql.yaml:
--------------------------------------------------------------------------------
1 | name: cql
2 | lr: 3e-4
3 | batch_size: 256
4 | num_updates: 1
5 | prefill_episodes: 10
6 |
7 | bc_steps: 1000
8 |
9 | # CQL specific
10 | num_cells: 256
11 | gamma: 0.99
12 | soft_update_eps: 0.995
13 | loss_function: l2
14 | temperature: 1.0
15 | min_q_weight: 1.0
16 | max_q_backup: False
17 | deterministic_backup: False
18 | num_random: 10
19 | with_lagrange: True
20 | lagrange_thresh: 5.0 # tau
21 |
22 | normalization: None
23 | dropout: 0.0
24 |
25 | prb: 0
26 | buffer_size: 1000000
27 | pretrain: False
28 | reset_params: False
--------------------------------------------------------------------------------
/conf/agent/droq.yaml:
--------------------------------------------------------------------------------
1 | name: sac
2 | lr: 3e-4
3 | batch_size: 256
4 | num_updates: 20
5 | prefill_episodes: 10
6 |
7 | num_cells: 256
8 | gamma: 0.99
9 | soft_update_eps: 0.995
10 | alpha_init: 1
11 | fixed_alpha: False
12 | loss_function: l2
13 |
14 | normalization: LayerNorm
15 | dropout: 0.01
16 |
17 | prb: 0
18 | buffer_size: 1000000
19 | reset_params: False
20 |
--------------------------------------------------------------------------------
/conf/agent/iql.yaml:
--------------------------------------------------------------------------------
1 | name: iql
2 | lr: 3e-4
3 | batch_size: 256
4 | num_updates: 1
5 | prefill_episodes: 0
6 |
7 | num_cells: 256
8 | gamma: 0.99
9 | soft_update_eps: 0.995
10 | loss_function: l2
11 | temperature: 1.0
12 | expectile: 0.5
13 |
14 | normalization: None
15 | dropout: 0.0
16 |
17 | prb: 0
18 | buffer_size: 1000000
19 | pretrain: False
20 | reset_params: False
--------------------------------------------------------------------------------
/conf/agent/random.yaml:
--------------------------------------------------------------------------------
1 | name: random
2 | # not used for random agent
3 | batch_size: 256
4 | num_updates: 2500
5 | prefill_episodes: 0
--------------------------------------------------------------------------------
/conf/agent/sac.yaml:
--------------------------------------------------------------------------------
1 | name: sac
2 | lr: 3e-4
3 | batch_size: 256
4 | num_updates: 1
5 | prefill_episodes: 10
6 |
7 | num_cells: 256
8 | gamma: 0.99
9 | soft_update_eps: 0.995
10 | alpha_init: 1
11 | fixed_alpha: False
12 | loss_function: l2
13 |
14 | normalization: None
15 | dropout: 0.0
16 |
17 | prb: 0
18 | buffer_size: 1000000
19 | reset_params: False
--------------------------------------------------------------------------------
/conf/agent/td3.yaml:
--------------------------------------------------------------------------------
1 | name: td3
2 | lr: 3e-4
3 | batch_size: 256
4 | num_updates: 1
5 | prefill_episodes: 10
6 |
7 | num_cells: 256
8 | gamma: 0.99
9 | soft_update_eps: 0.995
10 | loss_function: smooth_l1
11 | exploration_noise: 0.1 # 0.01
12 |
13 | normalization: None
14 | dropout: 0.0
15 |
16 | prb: 0
17 | buffer_size: 1000000
18 | reset_params: False
19 | use_bc: False
20 | alpha: 1.0
--------------------------------------------------------------------------------
/conf/config.yaml:
--------------------------------------------------------------------------------
1 | # Base Config to run all examples
2 |
3 | run_name: ""
4 | verbose: 0
5 |
6 | device: "cuda"
7 | episodes: 250
8 |
9 | defaults:
10 | - _self_
11 | # random, sac, td3, droq
12 | - agent: sac
13 | - env: walker_sim-v0
--------------------------------------------------------------------------------
/conf/env/roboarm-v0.yaml:
--------------------------------------------------------------------------------
1 | name: "roboarm-v0"
2 | max_episode_steps: 100
3 | # env specific params
4 | verbose: 0
5 | # env wrapper
6 | frame_stack: 1
7 | action_filter: 1
8 | sleep_time: 0.0
9 | reward_signal: dense
10 |
--------------------------------------------------------------------------------
/conf/env/roboarm_mixed-v0.yaml:
--------------------------------------------------------------------------------
1 | name: "roboarm_mixed-v0"
2 | max_episode_steps: 30
3 | # env specific params
4 | verbose: 0
5 | # env wrapper
6 | frame_stack: 1
7 | action_filter: 1
8 | sleep_time: 0.0
9 | reward_signal: dense
10 | camera_id: 2
11 | goal_radius: 25
12 |
--------------------------------------------------------------------------------
/conf/env/roboarm_sim-v0.yaml:
--------------------------------------------------------------------------------
1 | name: "roboarm_sim-v0"
2 | max_episode_steps: 100
3 | # env specific params
4 | verbose: 0
5 | # env wrapper
6 | frame_stack: 1
7 | action_filter: 1
8 | noise: 0.05
9 | reward_signal: dense
10 |
--------------------------------------------------------------------------------
/conf/env/runaway-v0.yaml:
--------------------------------------------------------------------------------
1 | name: "runaway-v0"
2 | max_episode_steps: 20
3 | # env specific params
4 | min_distance: 40.
5 | verbose: 0
6 | # env wrapper
7 | frame_stack: 1
8 | action_filter: 1
9 |
10 |
--------------------------------------------------------------------------------
/conf/env/spinning-v0.yaml:
--------------------------------------------------------------------------------
1 | name: "spinning-v0"
2 | max_episode_steps: 50
3 | verbose: 0
4 | # env wrapper
5 | frame_stack: 1
6 | action_filter: 1
7 | sleep_time: 0.0
8 |
9 |
--------------------------------------------------------------------------------
/conf/env/walker-v0.yaml:
--------------------------------------------------------------------------------
1 | name: "walker-v0"
2 | max_episode_steps: 100
3 | # env specific params
4 | verbose: 0
5 | # env wrapper
6 | frame_stack: 1
7 | action_filter: 1
8 | sleep_time: 0.0
9 |
10 |
--------------------------------------------------------------------------------
/conf/env/walker_sim-v0.yaml:
--------------------------------------------------------------------------------
1 | name: "walker_sim-v0"
2 | max_episode_steps: 100
3 | # env specific params
4 | noise: 0.1
5 | low_action_angle: -100
6 | high_action_angle: 0
7 | verbose: 0
8 | # env wrapper
9 | frame_stack: 1
10 | action_filter: 1
11 |
12 |
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
1 | # Examples
2 |
3 | ## TorchRL SOTA Example
4 |
5 | In the [torchrl_sac](./torchrl_sac/) folder you will find a training script to train LEGO robots with BricksRL, similar to the state-of-the-art implementations in [TorchRL](https://github.com/pytorch/rl/tree/main/sota-implementations). This lets you plug and play with any TorchRL sota-implementation or make custom adaptations.
6 |
7 | [Example results](https://wandb.ai/sebastian-dittert/bricksrl_torchrl_sac_example?nw=nwusersebastiandittert)
8 |
9 | ### TorchRL sota-example diff
10 |
11 | The only change made to the TorchRL sota-implementations is the environment creation (`env_maker`) function:
12 |
13 | ```python
14 | # Environment import from BricksRL
15 | from bricksrl.environments.walker_v0.WalkerEnvSim import WalkerEnvSim_v0
16 |
17 | # ====================================================================
18 | # Make BricksRL Environment
19 | # -----------------
20 |
21 |
22 | def env_maker(cfg, device="cpu", from_pixels=False):
23 | # We use the WalkerEnvSim_v0 environment from BricksRL as an example
24 |     # because it is easy to test: it does not require a robot at hand or a connection to the hub.
25 | # Users can replace this with any other environment from BricksRL or custom environments.
26 | env = WalkerEnvSim_v0(max_episode_steps=cfg.env.max_episode_steps)
27 | observation_keys = [key for key in env.observation_spec.keys()]
28 |
29 | transforms = []
30 | if cfg.env.frame_stack > 1:
31 | transforms.append(
32 | CatFrames(
33 | N=cfg.env.frame_stack,
34 | in_keys=observation_keys,
35 | out_key=observation_keys,
36 | )
37 | )
38 | normalize_keys = [key for key in observation_keys if key != "pixels"]
39 | obs_ranges = np.array(list(env.observation_ranges.values()))
40 | obs_mean = obs_ranges.mean(axis=-1)
41 | obs_std = obs_ranges.std(axis=-1)
42 | transforms.append(
43 | ObservationNorm(
44 | in_keys=normalize_keys, loc=obs_mean, scale=obs_std, standard_normal=True
45 | )
46 | )
47 | transforms.append(DeviceCastTransform(device))
48 | return TransformedEnv(env, Compose(*transforms))
49 |
50 | ```
51 |
52 |
53 |
54 |
55 | ## Custom Environment
56 | A template to create your own custom environments can be found [here](./custom_env.py). With an environment created like this you can update the [TorchRL example](./torchrl_sac) to train your own TorchRL agent on your custom environment.
57 |
58 | > **Attention!** For each custom environment, you need a custom client script that must be loaded on the HUB!
59 |
60 | ## High Level Examples
61 | In the [example notebook](./example_notebook.ipynb) we provide high-level training examples to train a **SAC agent** in the **RoboArmSim-v0** environment and a **TD3 agent** in the **WalkerSim-v0** environment.
62 | The examples are based on the experiments for our paper. Stand-alone examples similar to the [TorchRL sota-implementations](https://github.com/pytorch/rl/tree/main/sota-implementations) can be found [here](./torchrl_sac).
--------------------------------------------------------------------------------
/examples/custom_env.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import numpy as np
4 |
5 | import torch
6 |
7 | from bricksrl.environments.base.base_env import BaseEnv
8 | from tensordict import TensorDict, TensorDictBase
9 | from torchrl.data.tensor_specs import BoundedTensorSpec, CompositeSpec
10 |
11 |
12 | class CustomEnv(BaseEnv):
13 | """
14 | Environment template for creating your own custom environment for BricksRL.
15 |
16 | Args:
17 |         max_episode_steps (int): The maximum number of steps per episode. Defaults to 50.
18 | verbose (bool): Whether to print additional information. Defaults to False.
19 |
20 | """
21 |
22 | def __init__(
23 | self,
24 | max_episode_steps: int = 50,
25 | verbose: bool = False,
26 | ):
27 | self._batch_size = torch.Size([1])
28 | self.max_episode_steps = max_episode_steps
29 |
30 | # Define action spec
31 | self.action_spec = BoundedTensorSpec(
32 | low=-1,
33 | high=1,
34 | shape=(1, self.action_dim),
35 | )
36 |
37 | # Define observation spec
38 |
39 | observation_spec = BoundedTensorSpec(
40 | low=-1,
41 | high=1,
42 | shape=(1, self.state_dim),
43 | )
44 |
45 | self.observation_spec = CompositeSpec(
46 | {self.observation_key: observation_spec}, shape=(1,)
47 | )
48 | super().__init__(
49 | action_dim=self.action_dim,
50 | state_dim=self.state_dim,
51 | verbose=verbose,
52 | )
53 |
54 | def _reset(self, tensordict: TensorDictBase, **kwargs) -> TensorDictBase:
55 | """
56 | Reset the environment and return the initial state.
57 |
58 | Returns:
59 | TensorDictBase: The initial state of the environment.
60 | """
61 |         # TODO: avoid sending this placeholder action just to receive the first state
62 | self.episode_step_iter = 0
63 | if tensordict is not None:
64 | action = tensordict.get("action").cpu().numpy().squeeze()
65 | else:
66 | action = np.zeros(self.action_dim)
67 | self.send_to_hub(action)
68 | # Get current observation
69 | observation = self.read_from_hub()
70 |
71 | return TensorDict(
72 | {
73 | self.observation_key: torch.tensor(observation, dtype=torch.float32),
74 | },
75 | batch_size=[1],
76 | )
77 |
78 | def reward(
79 | self,
80 | action: np.ndarray,
81 | next_state: np.ndarray,
82 | ) -> Tuple[float, bool]:
83 | """Your custom reward function"""
84 | return 1.0, False
85 |
86 | def _step(self, tensordict: TensorDictBase) -> TensorDictBase:
87 | """Custom step function"""
88 | # Send action to hub to receive next state
89 | action = tensordict.get("action").cpu().numpy().squeeze()
90 | self.send_to_hub(action)
91 | # receive the next state
92 | next_observation = self.read_from_hub()
93 |
94 | # calc reward and done
95 | reward, done = self.reward(
96 | action=action,
97 | next_state=next_observation,
98 | )
99 | next_tensordict = TensorDict(
100 | {
101 | self.observation_key: torch.tensor(
102 | next_observation, dtype=torch.float32
103 | ),
104 | "reward": torch.tensor([reward]).float(),
105 | "done": torch.tensor([done]).bool(),
106 | },
107 | batch_size=[1],
108 | )
109 |
110 | # increment episode step counter
111 | self.episode_step_iter += 1
112 | if self.episode_step_iter >= self.max_episode_steps:
113 | next_tensordict.set("done", torch.tensor([True]))
114 | return next_tensordict
115 |
--------------------------------------------------------------------------------
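The template above references `self.action_dim`, `self.state_dim`, and `self.observation_key` without defining them, so a concrete environment has to declare them as class attributes. A minimal illustrative subclass (the import path, dimensions, and reward are placeholders, not part of the template):

```python
import numpy as np

from custom_env import CustomEnv  # the template defined in examples/custom_env.py


class MyRobotEnv(CustomEnv):
    # Placeholder dimensions: two motors to control, three sensor values to observe.
    action_dim = 2
    state_dim = 3
    observation_key = "observation"

    def reward(self, action: np.ndarray, next_state: np.ndarray):
        # Example objective: keep the first sensor reading close to zero.
        return float(-abs(next_state[:, 0].item())), False
```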
/examples/torchrl_sac/config.yaml:
--------------------------------------------------------------------------------
1 | # environment and task
2 | env:
3 | max_episode_steps: 100
4 | seed: 41
5 | frame_stack: 1
6 |
7 | # collector
8 | collector:
9 | total_frames: 10_000
10 | init_random_frames: 1000
11 | frames_per_batch: 1000
12 | init_env_steps: 1000
13 | device: cpu
14 | env_per_collector: 1
15 | reset_at_each_iter: False
16 |
17 | # replay buffer
18 | replay_buffer:
19 | size: 1000000
20 | prb: 0 # use prioritized experience replay
21 | scratch_dir: null
22 |
23 | # optim
24 | optim:
25 | utd_ratio: 1.0
26 | gamma: 0.99
27 | loss_function: l2
28 | lr: 3.0e-4
29 | weight_decay: 0.0
30 | batch_size: 256
31 | target_update_polyak: 0.995
32 | alpha_init: 1.0
33 | adam_eps: 1.0e-8
34 |
35 | # network
36 | network:
37 | hidden_sizes: [256, 256]
38 | activation: relu
39 | default_policy_scale: 1.0
40 | scale_lb: 0.1
41 | device:
42 |
43 | # logging
44 | logger:
45 | backend: wandb
46 | project_name: bricksrl_torchrl_sac_example
47 | group_name: null
48 | exp_name: Walkersim-v0_SAC
49 | mode: online
50 | eval_iter: 1000
51 | video: False
--------------------------------------------------------------------------------
/examples/torchrl_sac/train.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 | """SAC Example.
6 |
7 | This is a simple self-contained example of a SAC training script.
8 |
9 | It supports state environments like MuJoCo.
10 |
11 | The helper functions are coded in the utils.py associated with this script.
12 | """
13 | import time
14 |
15 | import hydra
16 |
17 | import numpy as np
18 | import torch
19 | import torch.cuda
20 | import tqdm
21 | from tensordict import TensorDict
22 | from torchrl._utils import logger as torchrl_logger
23 | from torchrl.envs.utils import ExplorationType, set_exploration_type
24 |
25 | from torchrl.record.loggers import generate_exp_name, get_logger
26 | from utils import (
27 | dump_video,
28 | log_metrics,
29 | make_collector,
30 | make_environment,
31 | make_loss_module,
32 | make_replay_buffer,
33 | make_sac_agent,
34 | make_sac_optimizer,
35 | )
36 |
37 |
38 | @hydra.main(version_base="1.1", config_path="", config_name="config")
39 | def main(cfg: "DictConfig"): # noqa: F821
40 | device = cfg.network.device
41 | if device in ("", None):
42 | if torch.cuda.is_available():
43 | device = torch.device("cuda:0")
44 | else:
45 | device = torch.device("cpu")
46 | device = torch.device(device)
47 |
48 | # Create logger
49 | exp_name = generate_exp_name("SAC", cfg.logger.exp_name)
50 | logger = None
51 | if cfg.logger.backend:
52 | logger = get_logger(
53 | logger_type=cfg.logger.backend,
54 | logger_name="sac_logging",
55 | experiment_name=exp_name,
56 | wandb_kwargs={
57 | "mode": cfg.logger.mode,
58 | "config": dict(cfg),
59 | "project": cfg.logger.project_name,
60 | "group": cfg.logger.group_name,
61 | },
62 | )
63 |
64 | torch.manual_seed(cfg.env.seed)
65 | np.random.seed(cfg.env.seed)
66 |
67 | # Create environments
68 | train_env, eval_env = make_environment(cfg, logger=logger)
69 |
70 | # Create agent
71 | model, exploration_policy = make_sac_agent(cfg, train_env, eval_env, device)
72 |
73 | # Create SAC loss
74 | loss_module, target_net_updater = make_loss_module(cfg, model)
75 |
76 | # Create off-policy collector
77 | collector = make_collector(cfg, train_env, exploration_policy)
78 |
79 | # Create replay buffer
80 | replay_buffer = make_replay_buffer(
81 | batch_size=cfg.optim.batch_size,
82 | prb=cfg.replay_buffer.prb,
83 | buffer_size=cfg.replay_buffer.size,
84 | scratch_dir=cfg.replay_buffer.scratch_dir,
85 | device="cpu",
86 | )
87 |
88 | # Create optimizers
89 | (
90 | optimizer_actor,
91 | optimizer_critic,
92 | optimizer_alpha,
93 | ) = make_sac_optimizer(cfg, loss_module)
94 |
95 | # Main loop
96 | start_time = time.time()
97 | collected_frames = 0
98 | pbar = tqdm.tqdm(total=cfg.collector.total_frames)
99 |
100 | init_random_frames = cfg.collector.init_random_frames
101 | num_updates = int(
102 | cfg.collector.env_per_collector
103 | * cfg.collector.frames_per_batch
104 | * cfg.optim.utd_ratio
105 | )
106 | prb = cfg.replay_buffer.prb
107 | eval_iter = cfg.logger.eval_iter
108 | frames_per_batch = cfg.collector.frames_per_batch
109 | eval_rollout_steps = cfg.env.max_episode_steps
110 |
111 | sampling_start = time.time()
112 | for i, tensordict in enumerate(collector):
113 | sampling_time = time.time() - sampling_start
114 |
115 | # Update weights of the inference policy
116 | collector.update_policy_weights_()
117 |
118 | pbar.update(tensordict.numel())
119 |
120 | tensordict = tensordict.reshape(-1)
121 | current_frames = tensordict.numel()
122 | # Add to replay buffer
123 | replay_buffer.extend(tensordict.cpu())
124 | collected_frames += current_frames
125 |
126 | # Optimization steps
127 | training_start = time.time()
128 | if collected_frames >= init_random_frames:
129 | losses = TensorDict({}, batch_size=[num_updates])
130 | for i in range(num_updates):
131 | # Sample from replay buffer
132 | sampled_tensordict = replay_buffer.sample()
133 | if sampled_tensordict.device != device:
134 | sampled_tensordict = sampled_tensordict.to(
135 | device, non_blocking=True
136 | )
137 | else:
138 | sampled_tensordict = sampled_tensordict.clone()
139 |
140 | # Compute loss
141 | loss_td = loss_module(sampled_tensordict)
142 |
143 | actor_loss = loss_td["loss_actor"]
144 | q_loss = loss_td["loss_qvalue"]
145 | alpha_loss = loss_td["loss_alpha"]
146 |
147 | # Update actor
148 | optimizer_actor.zero_grad()
149 | actor_loss.backward()
150 | optimizer_actor.step()
151 |
152 | # Update critic
153 | optimizer_critic.zero_grad()
154 | q_loss.backward()
155 | optimizer_critic.step()
156 |
157 | # Update alpha
158 | optimizer_alpha.zero_grad()
159 | alpha_loss.backward()
160 | optimizer_alpha.step()
161 |
162 | losses[i] = loss_td.select(
163 | "loss_actor", "loss_qvalue", "loss_alpha"
164 | ).detach()
165 |
166 | # Update qnet_target params
167 | target_net_updater.step()
168 |
169 | # Update priority
170 | if prb:
171 | replay_buffer.update_priority(sampled_tensordict)
172 |
173 | training_time = time.time() - training_start
174 | episode_end = (
175 | tensordict["next", "done"]
176 | if tensordict["next", "done"].any()
177 | else tensordict["next", "truncated"]
178 | )
179 | episode_rewards = tensordict["next", "episode_reward"][episode_end]
180 |
181 | # Logging
182 | metrics_to_log = {}
183 | if len(episode_rewards) > 0:
184 | episode_length = tensordict["next", "step_count"][episode_end]
185 | metrics_to_log["train/reward"] = episode_rewards.mean().item()
186 | metrics_to_log["train/episode_length"] = episode_length.sum().item() / len(
187 | episode_length
188 | )
189 | if collected_frames >= init_random_frames:
190 | metrics_to_log["train/q_loss"] = losses.get("loss_qvalue").mean().item()
191 | metrics_to_log["train/actor_loss"] = losses.get("loss_actor").mean().item()
192 | metrics_to_log["train/alpha_loss"] = losses.get("loss_alpha").mean().item()
193 | metrics_to_log["train/alpha"] = loss_td["alpha"].item()
194 | metrics_to_log["train/entropy"] = loss_td["entropy"].item()
195 | metrics_to_log["train/sampling_time"] = sampling_time
196 | metrics_to_log["train/training_time"] = training_time
197 |
198 | # Evaluation
199 | if abs(collected_frames % eval_iter) < frames_per_batch:
200 | with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
201 | eval_start = time.time()
202 | eval_rollout = eval_env.rollout(
203 | eval_rollout_steps,
204 | model[0],
205 | auto_cast_to_device=True,
206 | break_when_any_done=True,
207 | )
208 | eval_env.apply(dump_video)
209 | eval_time = time.time() - eval_start
210 | eval_reward = eval_rollout["next", "reward"].sum(-2).mean().item()
211 | metrics_to_log["eval/reward"] = eval_reward
212 | metrics_to_log["eval/time"] = eval_time
213 | if logger is not None:
214 | log_metrics(logger, metrics_to_log, collected_frames)
215 | sampling_start = time.time()
216 |
217 | collector.shutdown()
218 | if not eval_env.is_closed:
219 | eval_env.close()
220 | if not train_env.is_closed:
221 | train_env.close()
222 | end_time = time.time()
223 | execution_time = end_time - start_time
224 | torchrl_logger.info(f"Training took {execution_time:.2f} seconds to finish")
225 |
226 |
227 | if __name__ == "__main__":
228 | main()
229 |
--------------------------------------------------------------------------------
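
Note (illustrative, not part of the repository): the loop above derives its gradient-step budget from the collector and optimizer settings, i.e. num_updates = env_per_collector * frames_per_batch * utd_ratio. A tiny arithmetic sketch with placeholder values (not taken from conf/):

    # Placeholder values for illustration only.
    env_per_collector = 1
    frames_per_batch = 200
    utd_ratio = 1.0

    # Mirrors the num_updates computation in the training loop above:
    num_updates = int(env_per_collector * frames_per_batch * utd_ratio)
    print(num_updates)  # 200 -> one gradient step per collected frame when utd_ratio == 1.0
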
/experiments/2wheeler/eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 |
5 | import hydra
6 | import numpy as np
7 | import wandb
8 | from omegaconf import DictConfig, OmegaConf
9 | from torchrl.envs.utils import step_mdp
10 | from tqdm import tqdm
11 |
12 | # Add the project root to PYTHONPATH for config
13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
14 | if project_root not in sys.path:
15 | sys.path.insert(0, project_root)
16 |
17 | from bricksrl.environments import make_env
18 | from experiments.helper.agents import get_agent
19 | from experiments.helper.utils import login, logout, setup_check
20 |
21 |
22 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
23 | def run(cfg: DictConfig) -> None:
24 | print(OmegaConf.to_yaml(cfg))
25 |
26 | # make environment.
27 | setup_check(robot="2wheeler", config=cfg)
28 | env, action_space, state_space = make_env(cfg)
29 |
30 | # make agent
31 | agent, project_name = get_agent(action_space, state_space, cfg)
32 | login(agent)
33 | agent.eval()
34 |
35 | # initialize wandb
36 | wandb.init(project=project_name + "_eval")
37 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
38 |
39 | eval_episodes = cfg.episodes
40 | quit = False
41 | _ = input("Press Enter to start evaluation...")
42 | try:
43 | for e in tqdm(range(eval_episodes), desc="Evaluation"):
44 | td = env.reset()
45 | done = td.get("done", False)
46 | truncated = td.get("truncated", False)
47 | ep_return = 0
48 | ep_steps = 0
49 | total_step_times = []
50 | actions = []
51 | print("Start new evaluation...", flush=True)
52 | while not done and not truncated:
53 | ep_steps += 1
54 | step_start_time = time.time()
55 | td = agent.get_eval_action(td)
56 | actions.append(td.get("action").cpu().numpy())
57 | td = env.step(td)
58 | agent.add_experience(td)
59 | total_agent_step_time = time.time() - step_start_time
60 | total_step_times.append(total_agent_step_time)
61 | done = td.get(("next", "done"), False)
62 | ep_return += td.get(("next", "reward"), 0)
63 | if done:
64 | if cfg.env.name == "runaway-v0":
65 | inpt = input(
66 | "Please reset the robot to the starting position and press Enter to continue or q to quit:"
67 | )
68 | if inpt == "q":
69 | quit = True
70 | break
71 | td = step_mdp(td)
72 |
73 | if quit:
74 | break
75 |
76 | # Metrics Logging
77 | log_dict = {
78 | "epoch": e,
79 | "reward": ep_return,
80 | "steps": ep_steps,
81 | "total_step_time": np.mean(total_step_times),
82 | "buffer_size": agent.replay_buffer.__len__(),
83 | "done": done.float(),
84 | "mean_action": np.mean(actions),
85 | }
86 | if cfg.env.name == "runaway-v0":
87 | log_dict.update({"distance": td.get("distance")})
88 |
89 | wandb.log(log_dict)
90 |
91 | except KeyboardInterrupt:
92 | print("Evaluation interrupted by user.")
93 | logout(agent)
94 | env.close()
95 |
96 |
97 | if __name__ == "__main__":
98 | run()
99 |
--------------------------------------------------------------------------------
/experiments/2wheeler/pretrain.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import hydra
5 | import wandb
6 | from omegaconf import DictConfig, OmegaConf
7 | from tqdm import tqdm
8 |
9 | # Add the project root to PYTHONPATH for config
10 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
11 | if project_root not in sys.path:
12 | sys.path.insert(0, project_root)
13 |
14 | from bricksrl.environments import make_env
15 | from experiments.helper.agents import get_agent
16 | from experiments.helper.utils import login, logout, setup_check, tensordict2dict
17 |
18 |
19 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
20 | def run(cfg: DictConfig) -> None:
21 | print(OmegaConf.to_yaml(cfg))
22 |
23 | # make environment.
24 | setup_check(robot="2wheeler", config=cfg)
25 | env, action_space, state_space = make_env(cfg, pretrain=True)
26 |
27 | # make agent
28 | agent, project_name = get_agent(action_space, state_space, cfg)
29 | login(agent)
30 |
31 | # initialize wandb
32 | wandb.init(project=project_name)
33 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
34 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None
35 |
36 | batch_size = cfg.agent.batch_size
37 | num_updates = cfg.agent.num_updates
38 | train_episodes = cfg.episodes
39 | print("Start training...")
40 | try:
41 | for e in tqdm(range(train_episodes), desc="Training"):
42 |
43 | loss_info = agent.train(batch_size=batch_size, num_updates=num_updates)
44 |
45 | # Metrics Logging
46 | log_dict = {
47 | "epoch": e,
48 | "buffer_size": agent.replay_buffer.__len__(),
49 | }
50 | log_dict.update(tensordict2dict(loss_info))
51 | wandb.log(log_dict)
52 |
53 | except KeyboardInterrupt:
54 | print("Training interrupted by user.")
55 |
56 | logout(agent)
57 | env.close()
58 |
59 |
60 | if __name__ == "__main__":
61 | run()
62 |
--------------------------------------------------------------------------------
/experiments/2wheeler/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 |
5 | import hydra
6 | import numpy as np
7 | import wandb
8 | from omegaconf import DictConfig, OmegaConf
9 | from torchrl.envs.utils import step_mdp
10 | from tqdm import tqdm
11 |
12 | # Add the project root to PYTHONPATH for config
13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
14 | if project_root not in sys.path:
15 | sys.path.insert(0, project_root)
16 |
17 | from bricksrl.environments import make_env
18 | from experiments.helper.agents import get_agent
19 | from experiments.helper.utils import (
20 | login,
21 | logout,
22 | prefill_buffer,
23 | setup_check,
24 | tensordict2dict,
25 | )
26 |
27 |
28 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
29 | def run(cfg: DictConfig) -> None:
30 | print(OmegaConf.to_yaml(cfg))
31 |
32 | # make environment.
33 | setup_check(robot="2wheeler", config=cfg)
34 | env, action_space, state_space = make_env(cfg)
35 |
36 | # make agent
37 | agent, project_name = get_agent(action_space, state_space, cfg)
38 | login(agent)
39 |
40 | # initialize wandb
41 | wandb.init(project=project_name)
42 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
43 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None
44 |
45 | # prefill buffer with random actions
46 | prefill_buffer(
47 | env=env,
48 | agent=agent,
49 | num_episodes=cfg.agent.prefill_episodes,
50 | stop_on_done=True if cfg.env.name == "runaway-v0" else False,
51 | )
52 |
53 | batch_size = cfg.agent.batch_size
54 | num_updates = cfg.agent.num_updates
55 | train_episodes = cfg.episodes
56 | print("Start training...")
57 | quit = False
58 | try:
59 | for e in tqdm(range(train_episodes), desc="Training"):
60 | td = env.reset()
61 | done = td.get("done", False)
62 | truncated = td.get("truncated", False)
63 | ep_return = 0
64 | ep_steps = 0
65 | total_step_times = []
66 | actions = []
67 | print("Start new data collection...", flush=True)
68 | while not done and not truncated:
69 | ep_steps += 1
70 | step_start_time = time.time()
71 | td = agent.get_action(td)
72 | actions.append(td.get("action").cpu().numpy())
73 | td = env.step(td)
74 | agent.add_experience(td)
75 | total_agent_step_time = time.time() - step_start_time
76 | total_step_times.append(total_agent_step_time)
77 | done = td.get(("next", "done"), False)
78 | ep_return += td.get(("next", "reward"), 0)
79 | if done:
80 | if cfg.env.name == "runaway-v0":
81 | inpt = input(
82 | "Please reset the robot to the starting position and press Enter to continue or q to quit:"
83 | )
84 | if inpt == "q":
85 | quit = True
86 | break
87 | td = step_mdp(td)
88 | loss_info = agent.train(
89 | batch_size=batch_size, num_updates=num_updates * ep_steps
90 | )
91 | if quit:
92 | break
93 |
94 | # Metrics Logging
95 | log_dict = {
96 | "epoch": e,
97 | "reward": ep_return,
98 | "steps": ep_steps,
99 | "total_step_time": np.mean(total_step_times),
100 | "buffer_size": agent.replay_buffer.__len__(),
101 | "done": done.float(),
102 | "mean_action": np.mean(actions),
103 | }
104 | if cfg.env.name == "runaway-v0":
105 | log_dict.update({"distance": td.get("distance")})
106 |
107 | log_dict.update(tensordict2dict(loss_info))
108 | wandb.log(log_dict)
109 |
110 | except KeyboardInterrupt:
111 | print("Training interrupted by user.")
112 |
113 | logout(agent)
114 | env.close()
115 |
116 |
117 | if __name__ == "__main__":
118 | run()
119 |
--------------------------------------------------------------------------------
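
Note (illustrative, not part of the repository): the training and evaluation loops above call step_mdp(td) at the end of each step so that the entries under "next" (the new observation, done flag, etc.) become the root entries fed to the policy on the next iteration. A minimal self-contained sketch, assuming only tensordict/torchrl as already imported by these scripts:

    import torch
    from tensordict import TensorDict
    from torchrl.envs.utils import step_mdp

    td = TensorDict(
        {
            "observation": torch.zeros(3),
            "action": torch.zeros(2),
            "next": TensorDict(
                {
                    "observation": torch.ones(3),
                    "reward": torch.tensor([1.0]),
                    "done": torch.tensor([False]),
                },
                batch_size=[],
            ),
        },
        batch_size=[],
    )

    td = step_mdp(td)  # promotes the "next" entries; drops "action" and "reward" by default
    print(td["observation"])  # tensor([1., 1., 1.]) -- the post-step observation
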
/experiments/helper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BricksRL/bricksrl/bc250aeaa3b9ab9d718601fced38325f3621c8a3/experiments/helper/__init__.py
--------------------------------------------------------------------------------
/experiments/helper/agents/__init__.py:
--------------------------------------------------------------------------------
1 | from experiments.helper.agents.behavior_cloning import BehavioralCloningAgent
2 | from experiments.helper.agents.cql import CQLAgent
3 | from experiments.helper.agents.iql import IQLAgent
4 | from experiments.helper.agents.random import RandomAgent
5 | from experiments.helper.agents.sac import SACAgent
6 | from experiments.helper.agents.td3 import TD3Agent
7 |
8 | all_agents = ["td3", "sac", "iql", "cql", "bc", "random"]
9 |
10 |
11 | def get_agent(action_spec, state_spec, cfg):
12 | if cfg.agent.name == "td3":
13 | agent = TD3Agent(
14 | action_spec=action_spec,
15 | state_spec=state_spec,
16 | agent_config=cfg.agent,
17 | device=cfg.device,
18 | )
19 | elif cfg.agent.name == "sac":
20 | agent = SACAgent(
21 | action_spec=action_spec,
22 | state_spec=state_spec,
23 | agent_config=cfg.agent,
24 | device=cfg.device,
25 | )
26 | elif cfg.agent.name == "bc":
27 | agent = BehavioralCloningAgent(
28 | action_spec=action_spec,
29 | state_spec=state_spec,
30 | agent_config=cfg.agent,
31 | device=cfg.device,
32 | )
33 | elif cfg.agent.name == "random":
34 | agent = RandomAgent(
35 | action_spec=action_spec,
36 | state_spec=state_spec,
37 | agent_config=cfg.agent,
38 | device=cfg.device,
39 | )
40 | elif cfg.agent.name == "iql":
41 | agent = IQLAgent(
42 | action_spec=action_spec,
43 | state_spec=state_spec,
44 | agent_config=cfg.agent,
45 | device=cfg.device,
46 | )
47 | elif cfg.agent.name == "cql":
48 | agent = CQLAgent(
49 | action_spec=action_spec,
50 | state_spec=state_spec,
51 | agent_config=cfg.agent,
52 | device=cfg.device,
53 | )
54 | else:
55 | raise NotImplementedError(
56 | f"Agent {cfg.agent.name} not implemented, please choose from {all_agents}"
57 | )
58 |
59 | project_name = f"lego-{cfg.agent.name}-{cfg.env.name}"
60 | print("--- Agent initialized ---", flush=True)
61 |
62 | return agent, project_name
63 |
--------------------------------------------------------------------------------
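
Note (illustrative, not part of the repository): get_agent only needs the action/state specs and a Hydra-style config with agent.name, env.name and device set. A minimal sketch using the random agent and hand-built torchrl specs; the spec shapes and the OmegaConf dict below are assumptions for the example, not values from conf/:

    from omegaconf import OmegaConf
    from torchrl.data import BoundedTensorSpec, CompositeSpec, UnboundedContinuousTensorSpec

    from experiments.helper.agents import get_agent

    # Hand-built stand-in for the Hydra-loaded config.
    cfg = OmegaConf.create(
        {"agent": {"name": "random"}, "env": {"name": "walker_sim-v0"}, "device": "cpu"}
    )
    action_spec = BoundedTensorSpec(-1.0, 1.0, shape=(2,))
    state_spec = CompositeSpec(observation=UnboundedContinuousTensorSpec(shape=(4,)))

    agent, project_name = get_agent(action_spec, state_spec, cfg)
    td = agent.get_action(state_spec.rand())  # adds a uniformly sampled "action" entry
    print(project_name)  # lego-random-walker_sim-v0
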
/experiments/helper/agents/base.py:
--------------------------------------------------------------------------------
1 | import math
2 | from typing import List
3 |
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.init as init
7 | from tensordict import TensorDictBase
8 | from torchrl.data.tensor_specs import TensorSpec
9 | from torchrl.envs.utils import set_exploration_mode
10 |
11 |
12 | class BaseAgent:
13 | """Implements a base agent used to interact with the lego robots.
14 |
15 | Args:
16 | state_spec (TensorSpec): The state specification of the environment.
17 | action_spec (TensorSpec): The action specification of the environment.
18 | agent_name (str): The name of the agent.
19 | device (str): The device to use for computation.
20 |
21 | Attributes:
22 | name (str): The name of the agent.
23 | observation_spec (TensorSpec): The state specification of the environment.
24 | action_spec (TensorSpec): The action specification of the environment.
25 | device (str): The device to use for computation.
26 | observation_keys (List[str]): The keys used to access the observation in the tensor dictionary.
27 | """
28 |
29 | def __init__(
30 | self,
31 | state_spec: TensorSpec,
32 | action_spec: TensorSpec,
33 | agent_name: str,
34 | device: str = "cpu",
35 | ):
36 | self.name = agent_name
37 | self.observation_spec = state_spec
38 | self.action_spec = action_spec
39 | self.device = device
40 | self.observation_keys = [key for key in self.observation_spec.keys()]
41 |
42 |     def init_nets(self, model: List[nn.Module]):
43 | """Initializes the networks with random data.
44 |
45 | Args:
46 | model (list): A list of PyTorch models to initialize.
47 | """
48 | with torch.no_grad(), set_exploration_mode("random"):
49 | td = self.observation_spec.rand()
50 | td = td.to(self.device)
51 | for net in model:
52 | net(td)
53 | del td
54 |
55 | def eval(self):
56 | """Sets the agent to evaluation mode."""
57 | raise NotImplementedError
58 |
59 | @staticmethod
60 | def reset_parameter(param):
61 | if param.data.ndimension() == 2: # Weights
62 | init.kaiming_uniform_(param.data, a=math.sqrt(5))
63 | else: # Biases and others
64 | # Adjust based on your specific needs
65 | init.uniform_(param.data, -1, 1)
66 |
67 | def get_action(self, tensordict: TensorDictBase) -> TensorDictBase:
68 | """Returns a sampled action given a tensordict to collect data.
69 |
70 | Args:
71 | tensordict (TensorDictBase): Tensordict containing the current state of the environment.
72 |
73 | Returns:
74 | TensorDictBase: TensorDict containing the sampled action to take in the environment.
75 | """
76 | raise NotImplementedError
77 |
78 | def get_eval_action(self, tensordict: TensorDictBase) -> TensorDictBase:
79 | """Returns an action given a tensordict to evaluate the agent.
80 |
81 | Args:
82 | tensordict (TensorDictBase): Tensordict containing the current state of the environment.
83 |
84 | Returns:
85 | TensorDictBase: TensorDict containing the eval action to take in the environment.
86 | """
87 | raise NotImplementedError
88 |
89 | def train(
90 | self,
91 | ):
92 | """Trains the agent.
93 |
94 | Raises:
95 | NotImplementedError: This method must be implemented by a subclass.
96 | """
97 | raise NotImplementedError
98 |
--------------------------------------------------------------------------------
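
Note (illustrative, not part of the repository): the concrete agents that follow fill in the BaseAgent hooks (get_action, get_eval_action, eval, train). A stripped-down hypothetical subclass, just to show the contract; it is not used anywhere in the repo:

    from tensordict import TensorDictBase

    from experiments.helper.agents.base import BaseAgent


    class ConstantAgent(BaseAgent):
        """Hypothetical agent that always emits the zero action."""

        def __init__(self, state_spec, action_spec, agent_name="constant", device="cpu"):
            super().__init__(state_spec, action_spec, agent_name, device)

        def get_action(self, tensordict: TensorDictBase) -> TensorDictBase:
            tensordict.set("action", self.action_spec.zero())
            return tensordict

        def get_eval_action(self, tensordict: TensorDictBase) -> TensorDictBase:
            return self.get_action(tensordict)

        def eval(self):
            pass  # nothing to switch into evaluation mode

        def train(self, batch_size=64, num_updates=1):
            return {}  # no parameters to update
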
/experiments/helper/agents/behavior_cloning.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensordict as td
3 | import torch
4 |
5 | from experiments.helper.agents.base import BaseAgent
6 | from experiments.helper.networks.networks import (
7 | get_deterministic_actor,
8 | get_stochastic_actor,
9 | )
10 | from tensordict import TensorDictBase
11 | from torch import nn, optim
12 | from torchrl.data import BoundedTensorSpec, TensorDictReplayBuffer
13 |
14 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage
15 | from torchrl.envs import RenameTransform, ToTensorImage
16 | from torchrl.envs.utils import ExplorationType, set_exploration_type
17 |
18 |
19 | def initialize(net, std=0.02):
20 |     for name, param in net.named_parameters():
21 |         if "weight" in name:
22 |             # nn.init.xavier_uniform_(param)
23 |             nn.init.normal_(param, mean=0, std=std)
24 |         elif "bias" in name:
25 |             nn.init.zeros_(param)
26 |
27 |
28 | class BehavioralCloningAgent(BaseAgent):
29 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"):
30 | super(BehavioralCloningAgent, self).__init__(
31 | state_spec, action_spec, agent_config.name, device
32 | )
33 |
34 | if agent_config.policy_type == "deterministic":
35 | self.actor = get_deterministic_actor(state_spec, action_spec, agent_config)
36 | elif agent_config.policy_type == "stochastic":
37 | raise NotImplementedError(
38 | "Stochastic actor training is not implemented yet"
39 | )
40 | # TODO: Implement stochastic actor training
41 | # self.actor = get_stochastic_actor(
42 | # state_spec, action_spec, agent_config
43 | # )
44 | else:
45 | raise ValueError(
46 | "policy_type not recognized, choose deterministic or stochastic"
47 | )
48 | self.actor.to(device)
49 | # initialize networks
50 | self.init_nets([self.actor])
51 |
52 | self.optimizer = optim.Adam(
53 | self.actor.parameters(), lr=agent_config.lr, weight_decay=0.0
54 | )
55 |
56 | # create replay buffer
57 | self.batch_size = agent_config.batch_size
58 | self.replay_buffer = self.create_replay_buffer()
59 |
60 | # general stats
61 | self.collected_transitions = 0
62 | self.do_pretrain = False
63 | self.episodes = 0
64 |
65 | def get_agent_statedict(self):
66 | """Save agent"""
67 | act_statedict = self.actor.state_dict()
68 | return {"actor": act_statedict}
69 |
70 | def load_model(self, path):
71 | """load model"""
72 | try:
73 | statedict = torch.load(path)
74 | self.actor.load_state_dict(statedict["actor"])
75 | print("Model loaded")
76 | except:
77 | raise ValueError("Model not loaded")
78 |
79 | def load_replaybuffer(self, path):
80 | """load replay buffer"""
81 | try:
82 | loaded_data = TensorDictBase.load_memmap(path)
83 | self.replay_buffer.extend(loaded_data)
84 | if self.replay_buffer._batch_size != self.batch_size:
85 |                 print(
86 |                     "Warning: Batch size of the loaded replay buffer differs from the agent's configured batch size! Overwriting it with the agent's batch size."
87 | )
88 | self.replay_buffer._batch_size = self.batch_size
89 | print("Replay Buffer loaded")
90 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n")
91 | except:
92 | raise ValueError("Replay Buffer not loaded")
93 |
94 | def eval(self):
95 | """Sets the agent to evaluation mode."""
96 | self.actor.eval()
97 |
98 | @torch.no_grad()
99 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase:
100 | """Get eval action from actor network"""
101 | with set_exploration_type(ExplorationType.MODE):
102 | out_td = self.actor(td.to(self.device))
103 | return out_td
104 |
105 | def create_replay_buffer(
106 | self,
107 | buffer_size=1000000,
108 | buffer_scratch_dir="./tmp",
109 | device="cpu",
110 | prefetch=3,
111 | ):
112 | """Create replay buffer"""
113 |
114 | replay_buffer = TensorDictReplayBuffer(
115 | pin_memory=False,
116 | prefetch=prefetch,
117 | storage=LazyMemmapStorage(
118 | buffer_size,
119 | scratch_dir=buffer_scratch_dir,
120 | ),
121 | batch_size=self.batch_size,
122 | )
123 | replay_buffer.append_transform(lambda x: x.to(device))
124 | # TODO: check if we have image in observation space if so add this transform
125 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True))
126 |
127 | return replay_buffer
128 |
129 | @torch.no_grad()
130 | def get_action(self, td: TensorDictBase) -> TensorDictBase:
131 | """Get action from actor network"""
132 | with set_exploration_type(ExplorationType.RANDOM):
133 | out_td = self.actor(td.to(self.device))
134 | return out_td
135 |
136 | def add_experience(self, transition: td.TensorDict):
137 | """Add experience to replay buffer"""
138 |
139 | self.replay_buffer.extend(transition)
140 | self.collected_transitions += 1
141 |
142 | def train(self, batch_size=64, num_updates=1):
143 | """Train the agent"""
144 | log_data = {}
145 |
146 | for i in range(num_updates):
147 | batch = self.replay_buffer.sample(batch_size).to(self.device)
148 | orig_action = batch.get("action").clone()
149 |
150 | out_dict = self.actor(batch)
151 | loss = torch.mean((out_dict.get("action") - orig_action) ** 2)
152 | self.optimizer.zero_grad()
153 | loss.backward()
154 | self.optimizer.step()
155 | log_data.update({"loss": loss})
156 | return log_data
157 |
--------------------------------------------------------------------------------
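
Note (illustrative, not part of the repository): BehavioralCloningAgent.train() above is a plain mean-squared-error regression from stored observations to the actions recorded in the replay buffer. The same objective, stripped of the TensorDict plumbing and with a toy stand-in policy network (shapes and sizes are made up for the example):

    import torch
    from torch import nn, optim

    policy = nn.Sequential(nn.Linear(4, 64), nn.Tanh(), nn.Linear(64, 2))
    optimizer = optim.Adam(policy.parameters(), lr=3e-4)

    obs = torch.randn(64, 4)            # stand-in batch of observations
    expert_action = torch.randn(64, 2)  # stand-in recorded actions

    loss = torch.mean((policy(obs) - expert_action) ** 2)  # same loss as train()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
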
/experiments/helper/agents/cql.py:
--------------------------------------------------------------------------------
1 | import tensordict as td
2 | import torch
3 |
4 | from experiments.helper.agents.base import BaseAgent
5 | from experiments.helper.networks.networks import get_critic, get_stochastic_actor
6 | from tensordict import TensorDictBase
7 | from torch import optim
8 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer
9 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage
10 | from torchrl.envs.utils import ExplorationType, set_exploration_type
11 | from torchrl.objectives import SoftUpdate
12 |
13 | from torchrl.objectives.cql import CQLLoss
14 |
15 |
16 | class CQLAgent(BaseAgent):
17 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"):
18 | super(CQLAgent, self).__init__(
19 | state_spec, action_spec, agent_config.name, device
20 | )
21 |
22 | with_lagrange = agent_config.with_lagrange
23 |
24 | self.actor = get_stochastic_actor(state_spec, action_spec, agent_config)
25 | self.critic = get_critic(state_spec, agent_config)
26 |
27 | self.actor.to(device)
28 | self.critic.to(device)
29 |
30 | # initialize networks
31 | self.init_nets([self.actor, self.critic])
32 |
33 | # define loss function
34 | self.loss_module = CQLLoss(
35 | actor_network=self.actor,
36 | qvalue_network=self.critic,
37 | loss_function=agent_config.loss_function,
38 | temperature=agent_config.temperature,
39 | min_q_weight=agent_config.min_q_weight,
40 | max_q_backup=agent_config.max_q_backup,
41 | deterministic_backup=agent_config.deterministic_backup,
42 | num_random=agent_config.num_random,
43 | with_lagrange=agent_config.with_lagrange,
44 | lagrange_thresh=agent_config.lagrange_thresh,
45 | )
46 | # Define Target Network Updater
47 | self.target_net_updater = SoftUpdate(
48 | self.loss_module, eps=agent_config.soft_update_eps
49 | )
50 | self.target_net_updater.init_()
51 |
52 | # Reset weights
53 | self.reset_params = agent_config.reset_params
54 |
55 | # Define Replay Buffer
56 | self.batch_size = agent_config.batch_size
57 | self.replay_buffer = self.create_replay_buffer(
58 | prb=agent_config.prb,
59 | buffer_size=agent_config.buffer_size,
60 | device=device,
61 | )
62 |
63 | # Define Optimizer
64 | critic_params = list(
65 | self.loss_module.qvalue_network_params.flatten_keys().values()
66 | )
67 | actor_params = list(
68 | self.loss_module.actor_network_params.flatten_keys().values()
69 | )
70 | self.optimizer_actor = optim.Adam(
71 | actor_params, lr=agent_config.lr, weight_decay=0.0
72 | )
73 | self.optimizer_critic = optim.Adam(
74 | critic_params, lr=agent_config.lr, weight_decay=0.0
75 | )
76 | self.optimizer_alpha = optim.Adam(
77 | [self.loss_module.log_alpha],
78 | lr=3.0e-4,
79 | )
80 | if with_lagrange:
81 | self.alpha_prime_optim = torch.optim.Adam(
82 | [self.loss_module.log_alpha_prime],
83 | lr=agent_config.lr,
84 | )
85 | else:
86 | self.alpha_prime_optim = None
87 | # general stats
88 | self.collected_transitions = 0
89 | self.total_updates = 0
90 | self.do_pretrain = agent_config.pretrain
91 | self.bc_steps = agent_config.bc_steps
92 |
93 | def get_agent_statedict(self):
94 | """Save agent"""
95 | act_statedict = self.actor.state_dict()
96 | critic_statedict = self.critic.state_dict()
97 | return {"actor": act_statedict, "critic": critic_statedict}
98 |
99 | def load_model(self, path):
100 | """load model"""
101 | try:
102 | statedict = torch.load(path)
103 | self.actor.load_state_dict(statedict["actor"])
104 | self.critic.load_state_dict(statedict["critic"])
105 | print("Model loaded")
106 | except:
107 | raise ValueError("Model not loaded")
108 |
109 | def load_replaybuffer(self, path):
110 | """load replay buffer"""
111 | try:
112 | loaded_data = TensorDictBase.load_memmap(path)
113 | self.replay_buffer.extend(loaded_data)
114 | if self.replay_buffer._batch_size != self.batch_size:
115 |                 print(
116 |                     "Warning: Batch size of the loaded replay buffer differs from the agent's configured batch size! Overwriting it with the agent's batch size."
117 | )
118 | self.replay_buffer._batch_size = self.batch_size
119 | print("Replay Buffer loaded")
120 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n")
121 | except:
122 | raise ValueError("Replay Buffer not loaded")
123 |
124 | def reset_networks(self):
125 | """reset network parameters"""
126 | print("Resetting Networks!")
127 | self.loss_module.actor_network_params.apply(self.reset_parameter)
128 | self.loss_module.target_actor_network_params.apply(self.reset_parameter)
129 | self.loss_module.qvalue_network_params.apply(self.reset_parameter)
130 | self.loss_module.target_qvalue_network_params.apply(self.reset_parameter)
131 |
132 | def eval(self):
133 | """Sets the agent to evaluation mode."""
134 | self.actor.eval()
135 |
136 | def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase:
137 | # TODO not ideal to have this here
138 | td.pop("scale")
139 | td.pop("loc")
140 | td.pop("params")
141 | if "vector_obs_embedding" in td.keys():
142 | td.pop("vector_obs_embedding")
143 | if "image_embedding" in td.keys():
144 | td.pop("image_embedding")
145 |
146 | def create_replay_buffer(
147 | self,
148 | prb=False,
149 | buffer_size=100000,
150 | buffer_scratch_dir=None,
151 | device="cpu",
152 | prefetch=3,
153 | ):
154 | """Create replay buffer"""
155 | # TODO: make this part of base off policy agent
156 | if prb:
157 | replay_buffer = TensorDictPrioritizedReplayBuffer(
158 | alpha=0.7,
159 | beta=0.5,
160 | pin_memory=False,
161 | prefetch=1,
162 | storage=LazyTensorStorage(
163 | buffer_size,
164 | ),
165 | )
166 | else:
167 | replay_buffer = TensorDictReplayBuffer(
168 | pin_memory=False,
169 | prefetch=prefetch,
170 | storage=LazyMemmapStorage(
171 | buffer_size,
172 | scratch_dir=buffer_scratch_dir,
173 | ),
174 | batch_size=self.batch_size,
175 | )
176 | replay_buffer.append_transform(lambda x: x.to(device))
177 | # TODO: check if we have image in observation space if so add this transform
178 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True))
179 | return replay_buffer
180 |
181 | @torch.no_grad()
182 | def get_action(self, td: TensorDictBase) -> TensorDictBase:
183 | """Get action from actor network"""
184 | with set_exploration_type(ExplorationType.RANDOM):
185 | out_td = self.actor(td.to(self.device))
186 | self.td_preprocessing(out_td)
187 | return out_td
188 |
189 | @torch.no_grad()
190 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase:
191 | """Get eval action from actor network"""
192 | with set_exploration_type(ExplorationType.MODE):
193 | out_td = self.actor(td.to(self.device))
194 | self.td_preprocessing(out_td)
195 | return out_td
196 |
197 | def add_experience(self, transition: td.TensorDict):
198 | """Add experience to replay buffer"""
199 | self.replay_buffer.extend(transition)
200 | self.collected_transitions += 1
201 |
202 | def train(self, batch_size=64, num_updates=1):
203 | """Train the agent"""
204 | self.actor.train()
205 | for i in range(num_updates):
206 | self.total_updates += 1
207 | # Sample a batch from the replay buffer
208 | batch = self.replay_buffer.sample(batch_size)
209 | # Compute CQL Loss
210 | loss = self.loss_module(batch)
211 |
212 | # Update alpha
213 | alpha_loss = loss["loss_alpha"]
214 | alpha_prime_loss = loss["loss_alpha_prime"]
215 | self.optimizer_alpha.zero_grad()
216 | alpha_loss.backward()
217 | self.optimizer_alpha.step()
218 |
219 |             # Update Actor Network
220 |             # The official CQL implementation uses a behavior-cloning loss for the first few update steps, as it helps on some tasks
221 | if self.total_updates >= self.bc_steps:
222 | actor_loss = loss["loss_actor"]
223 | else:
224 | actor_loss = loss["loss_actor_bc"]
225 | self.optimizer_actor.zero_grad()
226 | actor_loss.backward()
227 | self.optimizer_actor.step()
228 |
229 | if self.alpha_prime_optim is not None:
230 | self.alpha_prime_optim.zero_grad()
231 | alpha_prime_loss.backward(retain_graph=True)
232 | self.alpha_prime_optim.step()
233 |
234 | # Update Critic Network
235 | q_loss = loss["loss_qvalue"]
236 | cql_loss = loss["loss_cql"]
237 |
238 | q_loss = q_loss + cql_loss
239 | self.optimizer_critic.zero_grad()
240 | q_loss.backward(retain_graph=False)
241 | self.optimizer_critic.step()
242 |
243 | # Update Target Networks
244 | self.target_net_updater.step()
245 | # Update Prioritized Replay Buffer
246 | if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer):
247 | self.replay_buffer.update_priorities(
248 | batch["indices"],
249 | loss["critic_loss"].detach().cpu().numpy(),
250 | )
251 | self.actor.eval()
252 | return loss
253 |
--------------------------------------------------------------------------------
/experiments/helper/agents/iql.py:
--------------------------------------------------------------------------------
1 | import tensordict as td
2 | import torch
3 |
4 | from experiments.helper.agents.base import BaseAgent
5 | from experiments.helper.networks.networks import (
6 | get_critic,
7 | get_stochastic_actor,
8 | get_value_operator,
9 | )
10 | from tensordict import TensorDictBase
11 | from torch import optim
12 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer
13 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage
14 | from torchrl.envs.transforms import ToTensorImage
15 | from torchrl.envs.utils import ExplorationType, set_exploration_type
16 | from torchrl.objectives import SoftUpdate
17 |
18 | from torchrl.objectives.iql import IQLLoss
19 |
20 |
21 | class IQLAgent(BaseAgent):
22 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"):
23 | super(IQLAgent, self).__init__(
24 | state_spec, action_spec, agent_config.name, device
25 | )
26 |
27 | self.actor = get_stochastic_actor(state_spec, action_spec, agent_config)
28 | self.critic = get_critic(state_spec, agent_config)
29 |
30 | self.value = get_value_operator(state_spec, agent_config)
31 |
32 | self.actor.to(device)
33 | self.critic.to(device)
34 | self.value.to(device)
35 |
36 | # initialize networks
37 | self.init_nets([self.actor, self.critic, self.value])
38 |
39 | # define loss function
40 | self.loss_module = IQLLoss(
41 | actor_network=self.actor,
42 | qvalue_network=self.critic,
43 | value_network=self.value,
44 | num_qvalue_nets=2,
45 | temperature=agent_config.temperature,
46 | expectile=agent_config.expectile,
47 | loss_function=agent_config.loss_function,
48 | )
49 | # Define Target Network Updater
50 | self.target_net_updater = SoftUpdate(
51 | self.loss_module, eps=agent_config.soft_update_eps
52 | )
53 | self.target_net_updater.init_()
54 |
55 | # Reset weights
56 | self.reset_params = agent_config.reset_params
57 |
58 | # Define Replay Buffer
59 | self.batch_size = agent_config.batch_size
60 |
61 | self.replay_buffer = self.create_replay_buffer(
62 | prb=agent_config.prb,
63 | buffer_size=agent_config.buffer_size,
64 | device=device,
65 | )
66 |
67 | # Define Optimizer
68 | critic_params = list(
69 | self.loss_module.qvalue_network_params.flatten_keys().values()
70 | )
71 | value_params = list(
72 | self.loss_module.value_network_params.flatten_keys().values()
73 | )
74 | actor_params = list(
75 | self.loss_module.actor_network_params.flatten_keys().values()
76 | )
77 | self.optimizer_actor = optim.Adam(
78 | actor_params, lr=agent_config.lr, weight_decay=0.0
79 | )
80 | self.optimizer_critic = optim.Adam(
81 | critic_params, lr=agent_config.lr, weight_decay=0.0
82 | )
83 | self.optimizer_value = optim.Adam(
84 | value_params, lr=agent_config.lr, weight_decay=0.0
85 | )
86 |
87 | # general stats
88 | self.collected_transitions = 0
89 | self.total_updates = 0
90 | self.do_pretrain = agent_config.pretrain
91 |
92 | def get_agent_statedict(self):
93 | """Save agent"""
94 | act_statedict = self.actor.state_dict()
95 | critic_statedict = self.critic.state_dict()
96 | value_statedict = self.value.state_dict()
97 | return {
98 | "actor": act_statedict,
99 | "critic": critic_statedict,
100 | "value": value_statedict,
101 | }
102 |
103 | def load_model(self, path):
104 | """load model"""
105 |
106 | try:
107 | statedict = torch.load(path)
108 | self.actor.load_state_dict(statedict["actor"])
109 | self.critic.load_state_dict(statedict["critic"])
110 | self.value.load_state_dict(statedict["value"])
111 | print("Model loaded")
112 | except:
113 | raise ValueError("Model not loaded")
114 |
115 | def load_replaybuffer(self, path):
116 | """load replay buffer"""
117 | try:
118 | loaded_data = TensorDictBase.load_memmap(path)
119 | self.replay_buffer.extend(loaded_data)
120 | if self.replay_buffer._batch_size != self.batch_size:
121 |                 print(
122 |                     "Warning: Batch size of the loaded replay buffer differs from the agent's configured batch size! Overwriting it with the agent's batch size."
123 | )
124 | self.replay_buffer._batch_size = self.batch_size
125 | print("Replay Buffer loaded")
126 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n")
127 | except:
128 | raise ValueError("Replay Buffer not loaded")
129 |
130 | def reset_networks(self):
131 | """reset network parameters"""
132 | print("Resetting Networks!")
133 | self.loss_module.actor_network_params.apply(self.reset_parameter)
134 | self.loss_module.target_actor_network_params.apply(self.reset_parameter)
135 | self.loss_module.qvalue_network_params.apply(self.reset_parameter)
136 | self.loss_module.target_qvalue_network_params.apply(self.reset_parameter)
137 | self.loss_module.value_network_params.apply(self.reset_parameter)
138 |
139 | def eval(self):
140 | """Sets the agent to evaluation mode."""
141 | self.actor.eval()
142 |
143 | def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase:
144 | # TODO not ideal to have this here
145 | td.pop("scale")
146 | td.pop("loc")
147 | td.pop("params")
148 | if "vector_obs_embedding" in td.keys():
149 | td.pop("vector_obs_embedding")
150 | if "image_embedding" in td.keys():
151 | td.pop("image_embedding")
152 |
153 | def create_replay_buffer(
154 | self,
155 | prb=False,
156 | buffer_size=100000,
157 | buffer_scratch_dir=None,
158 | device="cpu",
159 | prefetch=3,
160 | ):
161 | """Create replay buffer"""
162 | # TODO: make this part of base off policy agent
163 | if prb:
164 | replay_buffer = TensorDictPrioritizedReplayBuffer(
165 | alpha=0.7,
166 | beta=0.5,
167 | pin_memory=False,
168 | prefetch=1,
169 | storage=LazyTensorStorage(
170 | buffer_size,
171 | device=device,
172 | ),
173 | )
174 | else:
175 | replay_buffer = TensorDictReplayBuffer(
176 | pin_memory=False,
177 | prefetch=prefetch,
178 | storage=LazyMemmapStorage(
179 | buffer_size,
180 | scratch_dir=buffer_scratch_dir,
181 | ),
182 | batch_size=self.batch_size,
183 | )
184 | replay_buffer.append_transform(lambda x: x.to(device))
185 | # TODO: check if we have image in observation space if so add this transform
186 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True))
187 |
188 | return replay_buffer
189 |
190 | @torch.no_grad()
191 | def get_action(self, td: TensorDictBase) -> TensorDictBase:
192 | """Get action from actor network"""
193 | with set_exploration_type(ExplorationType.RANDOM):
194 | out_td = self.actor(td.to(self.device))
195 | self.td_preprocessing(out_td)
196 | return out_td
197 |
198 | @torch.no_grad()
199 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase:
200 | """Get eval action from actor network"""
201 | with set_exploration_type(ExplorationType.MODE):
202 | out_td = self.actor(td.to(self.device))
203 | self.td_preprocessing(out_td)
204 | return out_td
205 |
206 | def add_experience(self, transition: td.TensorDict):
207 | """Add experience to replay buffer"""
208 | self.replay_buffer.extend(transition)
209 | self.collected_transitions += 1
210 |
211 | def pretrain(self, wandb, batch_size=64, num_updates=1):
212 | """Pretrain the agent with simple behavioral cloning"""
213 | # TODO: implement pretrain for testing
214 | # for i in range(num_updates):
215 | # batch = self.replay_buffer.sample(batch_size)
216 | # pred, _ = self.actor(batch["observations"].float())
217 | # loss = torch.mean((pred - batch["actions"]) ** 2)
218 | # self.optimizer.zero_grad()
219 | # loss.backward()
220 | # self.optimizer.step()
221 | # wandb.log({"pretrain/loss": loss.item()})
222 |
223 | def train(self, batch_size=64, num_updates=1):
224 | """Train the agent"""
225 | self.actor.train()
226 | for i in range(num_updates):
227 | self.total_updates += 1
228 | if self.reset_params and self.total_updates % self.reset_params == 0:
229 | self.reset_networks()
230 | # Sample a batch from the replay buffer
231 | batch = self.replay_buffer.sample(batch_size)
232 | # Compute IQL Loss
233 | loss = self.loss_module(batch)
234 |
235 |             # Update Actor Network
236 | self.optimizer_actor.zero_grad()
237 | loss["loss_actor"].backward()
238 | self.optimizer_actor.step()
239 | # Update Critic Network
240 | self.optimizer_critic.zero_grad()
241 | loss["loss_qvalue"].backward()
242 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
243 | self.optimizer_critic.step()
244 | # Update Value Network
245 | self.optimizer_value.zero_grad()
246 | loss["loss_value"].backward()
247 | self.optimizer_value.step()
248 |
249 | # Update Target Networks
250 | self.target_net_updater.step()
251 | # Update Prioritized Replay Buffer
252 | if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer):
253 | self.replay_buffer.update_priorities(
254 | batch["indices"],
255 | loss["critic_loss"].detach().cpu().numpy(),
256 | )
257 | self.actor.eval()
258 | return loss
259 |
--------------------------------------------------------------------------------
/experiments/helper/agents/random.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from experiments.helper.agents.base import BaseAgent
4 | from tensordict import TensorDictBase
5 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer
6 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage
7 |
8 |
9 | class RandomAgent(BaseAgent):
10 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"):
11 | super(RandomAgent, self).__init__(
12 | state_spec, action_spec, agent_config.name, device
13 | )
14 |
15 | self.actor = None
16 | self.replay_buffer = self.create_replay_buffer(
17 | batch_size=256,
18 | prb=False,
19 | buffer_size=1000000,
20 | device=device,
21 | buffer_scratch_dir="/tmp",
22 | )
23 |
24 | def eval(self):
25 | """Sets the agent to evaluation mode."""
26 |
27 | @torch.no_grad()
28 | def get_action(self, tensordict: TensorDictBase):
29 | """Sample random actions from a uniform distribution"""
30 | tensordict.set("action", self.action_spec.rand())
31 | return tensordict
32 |
33 | @torch.no_grad()
34 | def get_eval_action(self, tensordict: TensorDictBase):
35 | """Sample random actions from a uniform distribution"""
36 | tensordict.set("action", self.action_spec.rand())
37 | return tensordict
38 |
39 | def add_experience(self, transition: TensorDictBase):
40 | """Add experience to replay buffer"""
41 | self.replay_buffer.extend(transition)
42 |
43 | def train(self, batch_size=64, num_updates=1):
44 | """Train the agent"""
45 | return {}
46 |
47 | def create_replay_buffer(
48 | self,
49 | batch_size=256,
50 | prb=False,
51 | buffer_size=100000,
52 | buffer_scratch_dir=None,
53 | device="cpu",
54 | prefetch=3,
55 | ):
56 | """Create replay buffer"""
57 | # TODO: make this part of base off policy agent
58 | if prb:
59 | replay_buffer = TensorDictPrioritizedReplayBuffer(
60 | alpha=0.7,
61 | beta=0.5,
62 | pin_memory=False,
63 | prefetch=1,
64 | storage=LazyTensorStorage(
65 | buffer_size,
66 | ),
67 | )
68 | else:
69 | replay_buffer = TensorDictReplayBuffer(
70 | pin_memory=False,
71 | prefetch=prefetch,
72 | storage=LazyMemmapStorage(
73 | buffer_size,
74 | scratch_dir=buffer_scratch_dir,
75 | ),
76 | batch_size=batch_size,
77 | )
78 | return replay_buffer
79 |
--------------------------------------------------------------------------------
/experiments/helper/agents/sac.py:
--------------------------------------------------------------------------------
1 | import tensordict as td
2 | import torch
3 |
4 | from experiments.helper.agents.base import BaseAgent
5 | from experiments.helper.networks.networks import get_critic, get_stochastic_actor
6 | from tensordict import TensorDictBase
7 | from torch import optim
8 | from torchrl.data import TensorDictPrioritizedReplayBuffer, TensorDictReplayBuffer
9 | from torchrl.data.replay_buffers.storages import LazyMemmapStorage, LazyTensorStorage
10 | from torchrl.envs.utils import ExplorationType, set_exploration_type
11 | from torchrl.objectives import SoftUpdate
12 |
13 | from torchrl.objectives.sac import SACLoss
14 |
15 |
16 | class SACAgent(BaseAgent):
17 | def __init__(self, state_spec, action_spec, agent_config, device="cpu"):
18 | super(SACAgent, self).__init__(
19 | state_spec, action_spec, agent_config.name, device
20 | )
21 |
22 | self.actor = get_stochastic_actor(state_spec, action_spec, agent_config)
23 | self.critic = get_critic(state_spec, agent_config)
24 |
25 | self.actor.to(device)
26 | self.critic.to(device)
27 |
28 | # initialize networks
29 | self.init_nets([self.actor, self.critic])
30 |
31 | # define loss function
32 | self.loss_module = SACLoss(
33 | actor_network=self.actor,
34 | qvalue_network=self.critic,
35 | delay_qvalue=True,
36 | value_network=None, # None to use SAC version 2
37 | num_qvalue_nets=2,
38 | fixed_alpha=agent_config.fixed_alpha,
39 | alpha_init=agent_config.alpha_init,
40 | loss_function=agent_config.loss_function,
41 | )
42 | # Define Target Network Updater
43 | self.target_net_updater = SoftUpdate(
44 | self.loss_module, eps=agent_config.soft_update_eps
45 | )
46 | self.target_net_updater.init_()
47 |
48 | # Reset weights
49 | self.reset_params = agent_config.reset_params
50 |
51 | self.batch_size = agent_config.batch_size
52 | # Define Replay Buffer
53 | self.buffer_batch_size = agent_config.batch_size
54 | self.replay_buffer = self.create_replay_buffer(
55 | prb=agent_config.prb,
56 | buffer_size=agent_config.buffer_size,
57 | buffer_scratch_dir="/tmp",
58 | device=device,
59 | )
60 | # Define Optimizer
61 | critic_params = list(
62 | self.loss_module.qvalue_network_params.flatten_keys().values()
63 | )
64 | actor_params = list(
65 | self.loss_module.actor_network_params.flatten_keys().values()
66 | )
67 | self.optimizer_actor = optim.Adam(
68 | actor_params, lr=agent_config.lr, weight_decay=0.0
69 | )
70 | self.optimizer_critic = optim.Adam(
71 | critic_params, lr=agent_config.lr, weight_decay=0.0
72 | )
73 | self.optimizer_alpha = optim.Adam(
74 | [self.loss_module.log_alpha],
75 | lr=3.0e-4,
76 | )
77 |
78 | # general stats
79 | self.collected_transitions = 0
80 | self.total_updates = 0
81 |
82 | def get_agent_statedict(self):
83 | """Save agent"""
84 | act_statedict = self.actor.state_dict()
85 | critic_statedict = self.critic.state_dict()
86 | return {"actor": act_statedict, "critic": critic_statedict}
87 |
88 | def load_model(self, path):
89 | """load model"""
90 | try:
91 | statedict = torch.load(path)
92 | self.actor.load_state_dict(statedict["actor"])
93 | self.critic.load_state_dict(statedict["critic"])
94 | print("Model loaded")
95 | except:
96 | raise ValueError("Model not loaded")
97 |
98 | def load_replaybuffer(self, path):
99 | """load replay buffer"""
100 | try:
101 | loaded_data = TensorDictBase.load_memmap(path)
102 | self.replay_buffer.extend(loaded_data)
103 | if self.replay_buffer._batch_size != self.batch_size:
104 |                 print(
105 |                     "Warning: Batch size of the loaded replay buffer differs from the agent's configured batch size! Overwriting it with the agent's batch size."
106 | )
107 | self.replay_buffer._batch_size = self.batch_size
108 | print("Replay Buffer loaded")
109 | print("Replay Buffer size: ", self.replay_buffer.__len__(), "\n")
110 | except:
111 | raise ValueError("Replay Buffer not loaded")
112 |
113 | def reset_networks(self):
114 | """reset network parameters"""
115 | print("Resetting Networks!")
116 | self.loss_module.actor_network_params.apply(self.reset_parameter)
117 | self.loss_module.target_actor_network_params.apply(self.reset_parameter)
118 | self.loss_module.qvalue_network_params.apply(self.reset_parameter)
119 | self.loss_module.target_qvalue_network_params.apply(self.reset_parameter)
120 |
121 | def eval(self):
122 | """Sets the agent to evaluation mode."""
123 | self.actor.eval()
124 |
125 | def td_preprocessing(self, td: TensorDictBase) -> TensorDictBase:
126 | # TODO not ideal to have this here
127 | td.pop("scale")
128 | td.pop("loc")
129 | td.pop("params")
130 | if "obs_embedding" in td.keys():
131 | td.pop("obs_embedding")
132 | if "pixel_embedding" in td.keys():
133 | td.pop("pixel_embedding")
134 |
135 | def create_replay_buffer(
136 | self,
137 | prb=False,
138 | buffer_size=100000,
139 | buffer_scratch_dir=".",
140 | device="cpu",
141 | prefetch=3,
142 | ):
143 | """Create replay buffer"""
144 | # TODO: make this part of base off policy agent
145 | if prb:
146 | replay_buffer = TensorDictPrioritizedReplayBuffer(
147 | alpha=0.7,
148 | beta=0.5,
149 | pin_memory=False,
150 | prefetch=1,
151 | storage=LazyTensorStorage(
152 | buffer_size,
153 | ),
154 | )
155 | else:
156 | replay_buffer = TensorDictReplayBuffer(
157 | pin_memory=False,
158 | prefetch=prefetch,
159 | storage=LazyMemmapStorage(
160 | buffer_size,
161 | scratch_dir=buffer_scratch_dir,
162 | ),
163 | batch_size=self.batch_size,
164 | )
165 | replay_buffer.append_transform(lambda x: x.to(device))
166 | # TODO: check if we have image in observation space if so add this transform
167 | # replay_buffer.append_transform(ToTensorImage(from_int=True, shape_tolerant=True))
168 |
169 | return replay_buffer
170 |
171 | @torch.no_grad()
172 | def get_action(self, td: TensorDictBase) -> TensorDictBase:
173 | """Get action from actor network"""
174 | with set_exploration_type(ExplorationType.RANDOM):
175 | out_td = self.actor(td.to(self.device))
176 | self.td_preprocessing(out_td)
177 | return out_td
178 |
179 | @torch.no_grad()
180 | def get_eval_action(self, td: TensorDictBase) -> TensorDictBase:
181 | """Get eval action from actor network"""
182 | with set_exploration_type(ExplorationType.MODE):
183 | out_td = self.actor(td.to(self.device))
184 | self.td_preprocessing(out_td)
185 | return out_td
186 |
187 | def add_experience(self, transition: td.TensorDict):
188 | """Add experience to replay buffer"""
189 | self.replay_buffer.extend(transition)
190 | self.collected_transitions += 1
191 |
192 | def train(self, batch_size=64, num_updates=1):
193 | """Train the agent"""
194 | self.actor.train()
195 | for i in range(num_updates):
196 | self.total_updates += 1
197 | if self.reset_params and self.total_updates % self.reset_params == 0:
198 | self.reset_networks()
199 | # Sample a batch from the replay buffer
200 | batch = self.replay_buffer.sample(batch_size)
201 | # Compute SAC Loss
202 | loss = self.loss_module(batch)
203 |
204 |             # Update Actor Network
205 | self.optimizer_actor.zero_grad()
206 | loss["loss_actor"].backward()
207 | self.optimizer_actor.step()
208 | # Update Critic Network
209 | self.optimizer_critic.zero_grad()
210 | loss["loss_qvalue"].backward()
211 | torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
212 | self.optimizer_critic.step()
213 |
214 | # Update alpha
215 | self.optimizer_alpha.zero_grad()
216 | loss["loss_alpha"].backward()
217 | self.optimizer_alpha.step()
218 |
219 | # Update Target Networks
220 | self.target_net_updater.step()
221 | # Update Prioritized Replay Buffer
222 | if isinstance(self.replay_buffer, TensorDictPrioritizedReplayBuffer):
223 | self.replay_buffer.update_priorities(
224 | batch["indices"],
225 | loss["critic_loss"].detach().cpu().numpy(),
226 | )
227 | self.actor.eval()
228 | return loss
229 |
--------------------------------------------------------------------------------
/experiments/helper/networks/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BricksRL/bricksrl/bc250aeaa3b9ab9d718601fced38325f3621c8a3/experiments/helper/networks/__init__.py
--------------------------------------------------------------------------------
/experiments/helper/utils.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import numpy as np
4 | import tensordict as td
5 | import torch
6 | from bricksrl.environments import ALL_2WHEELER_ENVS, ALL_ROBOARM_ENVS, ALL_WALKER_ENVS
7 | from moviepy.editor import concatenate_videoclips, ImageClip
8 | from omegaconf import DictConfig
9 | from tensordict import TensorDict, TensorDictBase
10 | from torchrl.envs.utils import step_mdp
11 | from tqdm import tqdm
12 |
13 |
14 | def setup_check(robot: str, config: DictConfig):
15 | if robot == "2wheeler":
16 | assert (
17 | config.env.name in ALL_2WHEELER_ENVS
18 | ), f"You are trying to run a 2wheeler experiment but are using the env {config.env.name}, select one of {ALL_2WHEELER_ENVS}"
19 | elif robot == "walker":
20 | assert (
21 | config.env.name in ALL_WALKER_ENVS
22 | ), f"You are trying to run a walker experiment but are using the env {config.env.name}, select one of {ALL_WALKER_ENVS}"
23 | elif robot == "roboarm":
24 | assert (
25 | config.env.name in ALL_ROBOARM_ENVS
26 | ), f"You are trying to run a roboarm experiment but are using the env {config.env.name}, select one of {ALL_ROBOARM_ENVS}"
27 |
28 |
29 | def data2numpy(data: list):
30 | """Convert a list of bytes to a numpy array."""
31 | return np.array(data)[None, :]
32 |
33 |
34 | def handle_disconnect(_):
35 | print("Hub was disconnected.")
36 |
37 |
38 | def tensordict2dict(td: TensorDictBase) -> dict:
39 | """Convert a TensorDict to a dictionary."""
40 | return {k: v.item() for k, v in td.items()}
41 |
42 |
43 | def logout(agent):
44 | # TODO save model or training data
45 | x = input("Do you want to save the model? (y/n)")
46 | if x == "y":
47 | save_name = input("Enter the name of the file to save: ")
48 | torch.save(agent.get_agent_statedict(), save_name + ".pth")
49 | x = input("Do you want to save the replay buffer? (y/n)")
50 | if x == "y":
51 | save_name = input("Enter the name of the file to save: ")
52 | # agent.replay_buffer.dump(save_name)
53 | batched_data = agent.replay_buffer.storage._storage[
54 | : agent.replay_buffer.__len__()
55 | ]
56 | batched_data.save(save_name, copy_existing=True)
57 |
58 |
59 | def login(agent):
60 | x = input("Do you want to load the model? (y/n)")
61 | if x == "y":
62 | save_name = input("Enter the name of the file to load: ")
63 | agent.load_model(save_name)
64 | else:
65 | print("Model not loaded!")
66 | x = input("Do you want to load the replay buffer? (y/n)")
67 | if x == "y":
68 | save_name = input("Enter the name of the file to load: ")
69 | agent.load_replaybuffer(save_name)
70 | else:
71 | print("Buffer not loaded!")
72 |
73 |
74 | def prefill_buffer(env, agent, num_episodes=10, stop_on_done=False):
75 | """
76 | Prefills the agent's replay buffer with experiences by running the environment for a specified number of episodes.
77 |
78 | Args:
79 |     - env: environment to collect transitions from (needs reset, step and sample_random_action)
80 |     - agent: Agent object with an add_experience method to add experiences to the replay buffer
81 |     - num_episodes: int, number of episodes to run the environment for
82 |     - stop_on_done: bool, whether to pause for a manual robot reset (or quit) when an episode ends
83 | Returns: None
84 | """
85 | if agent.name in ["sac", "td3"]:
86 | inpt = input("Press Enter to start prefilling episode: ")
87 | for e in tqdm(range(num_episodes), desc="Prefilling buffer"):
88 | print("Prefill episode: ", e)
89 | td = env.reset()
90 | done = False
91 | truncated = False
92 | while not done and not truncated:
93 | td = env.sample_random_action(td)
94 | td = env.step(td)
95 | agent.add_experience(td)
96 | done = td.get(("next", "done"))
97 |
98 | if done and stop_on_done:
99 | inpt = input(
100 | "Please reset the robot to the starting position and press Enter to continue or q to quit:"
101 | )
102 | if inpt == "q":
103 | break
104 | td = step_mdp(td)
105 | print("Prefill done! Buffer size: ", agent.replay_buffer.__len__())
106 |
107 |
108 | def convert_bgr_to_rgb(bgr_image: np.ndarray) -> np.ndarray:
109 | return bgr_image[:, :, ::-1] # Reverses the third dimension (color channels)
110 |
111 |
112 | def create_video_from_images(
113 |     images: List[np.ndarray], video_name: str = "episode_1", fps: int = 20
114 | ):
115 | # Convert each NumPy array image to an ImageClip
116 | clips = [ImageClip(convert_bgr_to_rgb(np_img.squeeze(0))) for np_img in images]
117 |
118 | # Set the duration of each clip to match the desired FPS
119 | # Note: This assumes all images should be displayed for an equal amount of time.
120 | for clip in clips:
121 | clip.duration = 1 / fps
122 |
123 | # Concatenate the ImageClips into a single video
124 | final_clip = concatenate_videoclips(clips, method="compose")
125 |
126 | # Write the result to a video file
127 | final_clip.write_videofile(video_name, fps=fps)
128 |
--------------------------------------------------------------------------------
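
Note (illustrative, not part of the repository): tensordict2dict above is what the training scripts use to turn the scalar loss TensorDict returned by agent.train() into a plain dict for wandb.log. A small sketch with dummy values:

    import torch
    from tensordict import TensorDict

    from experiments.helper.utils import tensordict2dict

    loss_info = TensorDict(
        {"loss_actor": torch.tensor(0.3), "loss_qvalue": torch.tensor(1.2)}, batch_size=[]
    )
    print(tensordict2dict(loss_info))  # roughly {'loss_actor': 0.3, 'loss_qvalue': 1.2}
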
/experiments/roboarm/eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 |
5 | import hydra
6 | import numpy as np
7 | import wandb
8 | from omegaconf import DictConfig, OmegaConf
9 | from torchrl.envs.utils import step_mdp
10 | from tqdm import tqdm
11 |
12 | # Add the project root to PYTHONPATH for config
13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
14 | if project_root not in sys.path:
15 | sys.path.insert(0, project_root)
16 |
17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS
18 | from experiments.helper.agents import get_agent
19 | from experiments.helper.utils import (
20 | create_video_from_images,
21 | login,
22 | logout,
23 | setup_check,
24 | )
25 |
26 |
27 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
28 | def run(cfg: DictConfig) -> None:
29 | print(OmegaConf.to_yaml(cfg))
30 |
31 | # make environment.
32 | setup_check(robot="roboarm", config=cfg)
33 | env, action_space, state_space = make_env(cfg)
34 |
35 | # make agent
36 | agent, project_name = get_agent(action_space, state_space, cfg)
37 | login(agent)
38 | agent.eval()
39 |
40 | # initialize wandb
41 | wandb.init(project=project_name + "_eval")
42 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
43 |
44 | eval_episodes = cfg.episodes
45 | env_name = cfg.env.name
46 | quit = False
47 | _ = input("Press Enter to start evaluation...")
48 | try:
49 | for e in tqdm(range(eval_episodes), desc="Evaluation"):
50 | td = env.reset()
51 | done = td.get("done", False)
52 | truncated = td.get("truncated", False)
53 | ep_return = 0
54 | ep_steps = 0
55 | total_step_times = []
56 | if env_name in VIDEO_LOGGING_ENVS:
57 |                 image_captures = [td.get("original_pixels").numpy()]
58 | print("Start new evaluation...", flush=True)
59 | while not done and not truncated:
60 | ep_steps += 1
61 | step_start_time = time.time()
62 | td = agent.get_eval_action(td)
63 | td = env.step(td)
64 |                 if env_name in VIDEO_LOGGING_ENVS:
65 |                     image_captures.append(
66 |                         td.get(("next", "original_pixels")).cpu().numpy()
67 |                     )
68 |                 # add the transition to the agent's replay buffer once per step
69 |                 agent.add_experience(td)
70 | total_agent_step_time = time.time() - step_start_time
71 | total_step_times.append(total_agent_step_time)
72 | done = td.get(("next", "done"), False)
73 | ep_return += td.get(("next", "reward"), 0)
74 |
75 | if done:
76 | break
77 | td = step_mdp(td)
78 |
79 | if quit:
80 | break
81 |
82 | # Metrics Logging
83 | log_dict = {
84 | "epoch": e,
85 | "reward": ep_return,
86 | "steps": ep_steps,
87 | "total_step_time": np.mean(total_step_times),
88 | "done": done.float(),
89 | }
90 |             if env_name in ("roboarm-v0", "roboarm_sim-v0"):
91 |                 final_error = td.get("error").item()
92 | log_dict.update({"final_error": final_error})
93 |
94 | wandb.log(log_dict)
95 | if env_name in VIDEO_LOGGING_ENVS:
96 | video_name = "episode_{}.mp4".format(e)
97 |                 create_video_from_images(image_captures, video_name, fps=5)
98 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")})
99 |
100 | except KeyboardInterrupt:
101 | print("Evaluation interrupted by user.")
102 |
103 | logout(agent)
104 | env.close()
105 |
106 |
107 | if __name__ == "__main__":
108 | run()
109 |
--------------------------------------------------------------------------------
/experiments/roboarm/pretrain.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import hydra
5 | import wandb
6 | from omegaconf import DictConfig, OmegaConf
7 | from tqdm import tqdm
8 |
9 | # Add the project root to PYTHONPATH for config
10 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
11 | if project_root not in sys.path:
12 | sys.path.insert(0, project_root)
13 |
14 | from bricksrl.environments import make_env
15 | from experiments.helper.agents import get_agent
16 | from experiments.helper.utils import login, logout, setup_check, tensordict2dict
17 |
18 |
19 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
20 | def run(cfg: DictConfig) -> None:
21 | print(OmegaConf.to_yaml(cfg))
22 |
23 | # make environment.
24 | setup_check(robot="roboarm", config=cfg)
25 | env, action_space, state_space = make_env(cfg, pretrain=True)
26 |
27 | # make agent
28 | agent, project_name = get_agent(action_space, state_space, cfg)
29 | login(agent)
30 |
31 | # initialize wandb
32 | wandb.init(project=project_name)
33 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
34 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None
35 |
36 | batch_size = cfg.agent.batch_size
37 | num_updates = cfg.agent.num_updates
38 | train_episodes = cfg.episodes
39 | print("Start training...")
40 | try:
41 | for e in tqdm(range(train_episodes), desc="Training"):
42 |
43 | loss_info = agent.train(batch_size=batch_size, num_updates=num_updates)
44 |
45 | # Metrics Logging
46 | log_dict = {
47 | "epoch": e,
48 |             "buffer_size": len(agent.replay_buffer),
49 | }
50 | log_dict.update(tensordict2dict(loss_info))
51 | wandb.log(log_dict)
52 |
53 | except KeyboardInterrupt:
54 | print("Training interrupted by user.")
55 |
56 | logout(agent)
57 | env.close()
58 |
59 |
60 | if __name__ == "__main__":
61 | run()
62 |
--------------------------------------------------------------------------------
/experiments/roboarm/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 |
5 | import hydra
6 | import numpy as np
7 | import wandb
8 | from omegaconf import DictConfig, OmegaConf
9 | from torchrl.envs.utils import step_mdp
10 | from tqdm import tqdm
11 |
12 | # Add the project root to PYTHONPATH for config
13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
14 | if project_root not in sys.path:
15 | sys.path.insert(0, project_root)
16 |
17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS
18 | from experiments.helper.agents import get_agent
19 | from experiments.helper.utils import (
20 | create_video_from_images,
21 | login,
22 | logout,
23 | prefill_buffer,
24 | setup_check,
25 | tensordict2dict,
26 | )
27 |
28 |
29 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
30 | def run(cfg: DictConfig) -> None:
31 | print(OmegaConf.to_yaml(cfg))
32 |
33 | # make environment.
34 | setup_check(robot="roboarm", config=cfg)
35 | env, action_space, state_space = make_env(cfg)
36 |
37 | # make agent
38 | agent, project_name = get_agent(action_space, state_space, cfg)
39 | login(agent)
40 |
41 | # initialize wandb
42 | wandb.init(project=project_name)
43 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
44 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None
45 |
46 | # prefill buffer with random actions
47 | prefill_buffer(
48 | env=env,
49 | agent=agent,
50 | num_episodes=cfg.agent.prefill_episodes,
51 | )
52 |
53 | batch_size = cfg.agent.batch_size
54 | num_updates = cfg.agent.num_updates
55 | env_name = cfg.env.name
56 | train_episodes = cfg.episodes
57 | max_episode_steps = cfg.env.max_episode_steps
58 |
59 | print("Start training...")
60 | quit = False
61 | try:
62 | for e in tqdm(range(train_episodes), desc="Training"):
63 | td = env.reset()
64 | done = td.get("done", False)
65 | truncated = td.get("truncated", False)
66 | ep_return = 0
67 | ep_steps = 0
68 | total_step_times = []
69 | if env_name in VIDEO_LOGGING_ENVS:
70 |                 image_captures = [td.get("original_pixels").numpy()]
71 | print("Start new data collection...", flush=True)
72 | while not done and not truncated:
73 | ep_steps += 1
74 | step_start_time = time.time()
75 | td = agent.get_action(td)
76 | td = env.step(td)
77 | if env_name in VIDEO_LOGGING_ENVS:
78 |                     image_captures.append(
79 | td.get(("next", "original_pixels")).cpu().numpy()
80 | )
81 | agent.add_experience(td)
82 | total_agent_step_time = time.time() - step_start_time
83 | total_step_times.append(total_agent_step_time)
84 | done = td.get(("next", "done"), False)
85 | ep_return += td.get(("next", "reward"), 0)
86 |
87 | td = step_mdp(td)
88 | if done:
89 | break
90 |
91 | loss_info = agent.train(
92 | batch_size=batch_size, num_updates=num_updates * ep_steps
93 | )
94 |
95 | if quit:
96 | break
97 |
98 | # Metrics Logging
99 | log_dict = {
100 | "epoch": e,
101 | "reward": ep_return,
102 | "steps": ep_steps,
103 | "total_step_time": np.mean(total_step_times),
104 |                 "buffer_size": len(agent.replay_buffer),
105 | "done": done.float(),
106 | }
107 |             if env_name in ("roboarm-v0", "roboarm_sim-v0"):
108 |                 final_error = td.get("error").item()
109 | log_dict.update({"final_error": final_error})
110 | log_dict.update(tensordict2dict(loss_info))
111 | wandb.log(log_dict)
112 | if env_name in VIDEO_LOGGING_ENVS and done and ep_steps < max_episode_steps:
113 | video_name = "episode_{}.mp4".format(e)
114 |                 create_video_from_images(image_captures, video_name, fps=5)
115 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")})
116 |
117 | except KeyboardInterrupt:
118 | print("Training interrupted by user.")
119 |
120 | logout(agent)
121 | env.close()
122 |
123 |
124 | if __name__ == "__main__":
125 | run()
126 |
--------------------------------------------------------------------------------
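Since run() is a Hydra entry point, the roboarm scripts accept command-line overrides for the config groups and keys they read (cfg.agent, cfg.env, cfg.episodes); a hypothetical invocation such as python experiments/roboarm/train.py agent=sac env=roboarm_sim-v0 episodes=50 would train SAC on the simulated arm for 50 episodes.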
/experiments/walker/eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 |
5 | import hydra
6 | import numpy as np
7 | import wandb
8 | from omegaconf import DictConfig, OmegaConf
9 | from torchrl.envs.utils import step_mdp
10 | from tqdm import tqdm
11 |
12 | # Add the project root to PYTHONPATH for config
13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
14 | if project_root not in sys.path:
15 | sys.path.insert(0, project_root)
16 |
17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS
18 | from experiments.helper.agents import get_agent
19 | from experiments.helper.utils import (
20 | create_video_from_images,
21 | login,
22 | logout,
23 | setup_check,
24 | )
25 |
26 |
27 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
28 | def run(cfg: DictConfig) -> None:
29 | print(OmegaConf.to_yaml(cfg))
30 |
31 | # make environment.
32 | setup_check(robot="walker", config=cfg)
33 | env, action_space, state_space = make_env(cfg)
34 |
35 | # make agent
36 | agent, project_name = get_agent(action_space, state_space, cfg)
37 | login(agent)
38 | agent.eval()
39 |
40 | # initialize wandb
41 | wandb.init(project=project_name + "_eval")
42 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
43 |
44 | eval_episodes = cfg.episodes
45 | env_name = cfg.env.name
46 | quit = False
47 | _ = input("Press Enter to start evaluation...")
48 | try:
49 | for e in tqdm(range(eval_episodes), desc="Evaluation"):
50 | td = env.reset()
51 | done = td.get("done", False)
52 | truncated = td.get("truncated", False)
53 | ep_return = 0
54 | ep_steps = 0
55 | total_step_times = []
56 | actions = []
57 | if env_name in VIDEO_LOGGING_ENVS:
58 |                 image_captures = [td.get("original_image").numpy()]
59 | # so we can reset the robot in the camera view
60 | input("Press Enter to start episode...")
61 | print("Start new evaluation...", flush=True)
62 | while not done and not truncated:
63 | ep_steps += 1
64 | step_start_time = time.time()
65 | td = agent.get_eval_action(td)
66 | actions.append(td.get("action").cpu().numpy())
67 | td = env.step(td)
68 | agent.add_experience(td)
69 | total_agent_step_time = time.time() - step_start_time
70 | total_step_times.append(total_agent_step_time)
71 | done = td.get(("next", "done"), False)
72 | ep_return += td.get(("next", "reward"), 0)
73 | if env_name in VIDEO_LOGGING_ENVS:
74 |                     image_captures.append(
75 | td.get(("next", "original_image")).cpu().numpy()
76 | )
77 |
78 | if done:
79 | break
80 | td = step_mdp(td)
81 |
82 | if quit:
83 | break
84 |
85 | # Metrics Logging
86 | log_dict = {
87 | "epoch": e,
88 | "reward": ep_return,
89 | "steps": ep_steps,
90 | "total_step_time": np.mean(total_step_times),
91 |                 "buffer_size": len(agent.replay_buffer),
92 | "done": done.float(),
93 | }
94 |
95 | wandb.log(log_dict)
96 | if env_name in VIDEO_LOGGING_ENVS:
97 | video_name = "episode_{}.mp4".format(e)
98 |                 create_video_from_images(image_captures, video_name, fps=5)
99 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")})
100 |
101 | except KeyboardInterrupt:
102 | print("Evaluation interrupted by user.")
103 |
104 | logout(agent)
105 | env.close()
106 |
107 |
108 | if __name__ == "__main__":
109 | run()
110 |
--------------------------------------------------------------------------------
/experiments/walker/pretrain.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | import hydra
5 | import wandb
6 | from omegaconf import DictConfig, OmegaConf
7 | from tqdm import tqdm
8 |
9 | # Add the project root to PYTHONPATH for config
10 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
11 | if project_root not in sys.path:
12 | sys.path.insert(0, project_root)
13 |
14 | from bricksrl.environments import make_env
15 | from experiments.helper.agents import get_agent
16 | from experiments.helper.utils import login, logout, setup_check, tensordict2dict
17 |
18 |
19 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
20 | def run(cfg: DictConfig) -> None:
21 | print(OmegaConf.to_yaml(cfg))
22 |
23 | # make environment.
24 | setup_check(robot="walker", config=cfg)
25 | env, action_space, state_space = make_env(cfg, pretrain=True)
26 |
27 | # make agent
28 | agent, project_name = get_agent(action_space, state_space, cfg)
29 | login(agent)
30 |
31 | # initialize wandb
32 | wandb.init(project=project_name)
33 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
34 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None
35 |
36 | batch_size = cfg.agent.batch_size
37 | num_updates = cfg.agent.num_updates
38 | train_episodes = cfg.episodes
39 | print("Start training...")
40 | try:
41 | for e in tqdm(range(train_episodes), desc="Training"):
42 |
43 | loss_info = agent.train(batch_size=batch_size, num_updates=num_updates)
44 |
45 | # Metrics Logging
46 | log_dict = {
47 | "epoch": e,
48 |             "buffer_size": len(agent.replay_buffer),
49 | }
50 | log_dict.update(tensordict2dict(loss_info))
51 | wandb.log(log_dict)
52 |
53 | except KeyboardInterrupt:
54 | print("Training interrupted by user.")
55 |
56 | logout(agent)
57 | env.close()
58 |
59 |
60 | if __name__ == "__main__":
61 | run()
62 |
--------------------------------------------------------------------------------
/experiments/walker/train.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import time
4 |
5 | import hydra
6 | import numpy as np
7 | import wandb
8 | from omegaconf import DictConfig, OmegaConf
9 | from torchrl.envs.utils import step_mdp
10 | from tqdm import tqdm
11 |
12 | # Add the project root to PYTHONPATH for config
13 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../"))
14 | if project_root not in sys.path:
15 | sys.path.insert(0, project_root)
16 |
17 | from bricksrl.environments import make_env, VIDEO_LOGGING_ENVS
18 | from experiments.helper.agents import get_agent
19 | from experiments.helper.utils import (
20 | create_video_from_images,
21 | login,
22 | logout,
23 | prefill_buffer,
24 | setup_check,
25 | tensordict2dict,
26 | )
27 |
28 |
29 | @hydra.main(version_base=None, config_path=project_root + "/conf", config_name="config")
30 | def run(cfg: DictConfig) -> None:
31 | print(OmegaConf.to_yaml(cfg))
32 |
33 | # make environment.
34 | setup_check(robot="walker", config=cfg)
35 | env, action_space, state_space = make_env(cfg)
36 |
37 | # make agent
38 | agent, project_name = get_agent(action_space, state_space, cfg)
39 | login(agent)
40 |
41 | # initialize wandb
42 | wandb.init(project=project_name)
43 | wandb.config = OmegaConf.to_container(cfg, resolve=True, throw_on_missing=True)
44 | wandb.watch(agent.actor, log_freq=1) if agent.actor else None
45 |
46 | # prefill buffer with random actions
47 | prefill_buffer(env=env, agent=agent, num_episodes=cfg.agent.prefill_episodes)
48 |
49 | batch_size = cfg.agent.batch_size
50 | num_updates = cfg.agent.num_updates
51 | env_name = cfg.env.name
52 | train_episodes = cfg.episodes
53 | print("Start training...")
54 | quit = False
55 | try:
56 | for e in tqdm(range(train_episodes), desc="Training"):
57 | td = env.reset()
58 | done = td.get("done", False)
59 | truncated = td.get("truncated", False)
60 | ep_return = 0
61 | ep_steps = 0
62 | total_step_times = []
63 | agent_actions = []
64 | if env_name in VIDEO_LOGGING_ENVS:
65 |                 image_captures = [td.get("original_image").numpy()]
66 | # so we can reset the robot in the camera view
67 | input("Press Enter to start episode...")
68 |
69 | print("Start new data collection...", flush=True)
70 | while not done and not truncated:
71 | ep_steps += 1
72 | step_start_time = time.time()
73 |                 td = agent.get_action(td)
74 |                 action = td.get("action").cpu().numpy()
75 |                 agent_actions.append(action)
76 |                 td = env.step(td)
77 |                 agent.add_experience(td)
78 |                 done = td.get(("next", "done"), False)
79 |                 ep_return += td.get(("next", "reward"), 0)
80 |                 if env_name in VIDEO_LOGGING_ENVS:
81 |                     image_captures.append(
82 |                         td.get(("next", "original_image")).cpu().numpy()
83 |                     )
84 |                 total_agent_step_time = time.time() - step_start_time
85 |                 total_step_times.append(total_agent_step_time)
86 |                 if done:
87 |                     break
88 |                 td = step_mdp(td)
89 |             loss_info = agent.train(
90 |                 batch_size=batch_size, num_updates=num_updates * ep_steps
91 |             )
92 |
93 | if quit:
94 | break
95 |
96 | # Metrics Logging
97 | log_dict = {
98 | "epoch": e,
99 | "reward": ep_return,
100 | "steps": ep_steps,
101 | "total_step_time": np.mean(total_step_times),
102 |                 "buffer_size": len(agent.replay_buffer),
103 | "action": wandb.Histogram(action),
104 | "done": done,
105 | "action_mean": wandb.Histogram(np.mean(agent_actions, axis=0)),
106 | }
107 | log_dict.update(tensordict2dict(loss_info))
108 | wandb.log(log_dict)
109 | if env_name in VIDEO_LOGGING_ENVS:
110 | video_name = "episode_{}.mp4".format(e)
111 |                 create_video_from_images(image_captures, video_name, fps=5)
112 | wandb.log({"video": wandb.Video(video_name, fps=5, format="mp4")})
113 |
114 | except KeyboardInterrupt:
115 | print("Training interrupted by user.")
116 |
117 | logout(agent)
118 | env.close()
119 |
120 |
121 | if __name__ == "__main__":
122 | run()
123 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=64", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | setup(
4 | name="bricksrl",
5 | version="0.1.0",
6 | packages=find_packages(),
7 | install_requires=[
8 | "pybricksdev",
9 | "tensordict==0.5.0",
10 | "torchrl==0.5.0",
11 | "hydra-core==1.3.2",
12 | "wandb==0.16.1",
13 | "opencv-python==4.9.0.80",
14 | "moviepy==1.0.3",
15 | "tqdm==4.66.1",
16 | "numpy==1.24.1",
17 | "pynput",
18 | ],
19 | extras_require={
20 | "dev": [
21 | "pytest==8.0.2",
22 | "ufmt",
23 | "pre-commit",
24 | ],
25 | },
26 | author="Sebastian Dittert",
27 | description="BricksRL: A Platform for Democratizing Robotics and Reinforcement Learning Research and Education with LEGO",
28 | url="https://github.com/BricksRL/bricksrl",
29 | classifiers=[
30 | "Programming Language :: Python :: 3",
31 | "License :: OSI Approved :: MIT License",
32 | "Operating System :: OS Independent",
33 | ],
34 | python_requires=">=3.8",
35 | )
36 |
--------------------------------------------------------------------------------
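Given the extras_require block above, a development setup would typically be installed with pip install -e ".[dev]", which pulls in pytest, ufmt, and pre-commit alongside the runtime dependencies; the tests below can then be run with pytest from the repository root.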
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BricksRL/bricksrl/bc250aeaa3b9ab9d718601fced38325f3621c8a3/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_agents.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import torch
3 | from bricksrl.environments.dummy.mixed_obs_dummy import MixedObsDummyEnv
4 | from bricksrl.environments.dummy.vec_obs_dummy import VecGoalObsDummyEnv, VecObsDummyEnv
5 | from experiments.helper.agents import get_agent
6 | from hydra import compose, initialize
7 | from torchrl.envs import Compose, ToTensorImage, TransformedEnv
8 | from torchrl.envs.utils import step_mdp
9 |
10 |
11 | def collection_round(env, agent, max_steps=1000):
12 | td = env.reset()
13 | for _ in range(max_steps):
14 | td = agent.get_action(td)
15 | td = env.step(td)
16 | agent.add_experience(td)
17 | td = step_mdp(td)
18 |
19 |
20 | def get_env(env, img_shape=(64, 64, 3)):
21 | if env == "mixed":
22 | env = MixedObsDummyEnv(img_shape=img_shape)
23 | env = TransformedEnv(
24 | env, Compose(ToTensorImage(in_keys=["pixels"], from_int=True))
25 | )
26 | elif env == "vec":
27 | env = VecObsDummyEnv()
28 | elif env == "vec_goal":
29 | env = VecGoalObsDummyEnv()
30 | else:
31 | raise ValueError("Invalid environment")
32 | return env
33 |
34 |
35 | @pytest.mark.parametrize(
36 | "env",
37 | ["mixed", "vec", "vec_goal"],
38 | )
39 | @pytest.mark.parametrize(
40 | "device",
41 | ["cpu", "cuda"],
42 | )
43 | def test_random_agent(env, device):
44 |     # Resolve the requested device, falling back to CPU when CUDA is unavailable
45 |     if torch.cuda.is_available() and device == "cuda":
46 |         device = "cuda"
47 |     else:
48 |         device = "cpu"
49 |
50 |     # Compose the config once with the agent and device overrides
51 |     with initialize(config_path="../conf"):
52 |         cfg = compose(
53 |             config_name="config", overrides=["device=" + device, "agent=random"]
54 |         )
55 | # Test data collection
56 | env = get_env(env)
57 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg)
58 | collection_round(env, agent, max_steps=10)
59 |
60 |
61 | @pytest.mark.parametrize(
62 | "env",
63 | ["mixed", "vec", "vec_goal"],
64 | )
65 | @pytest.mark.parametrize(
66 | "device",
67 | ["cpu", "cuda"],
68 | )
69 | def test_sac_agent(env, device):
70 | if torch.cuda.is_available() and device == "cuda":
71 | device = "cuda"
72 | else:
73 | device = "cpu"
74 | with initialize(config_path="../conf"):
75 | cfg = compose(config_name="config", overrides=["agent=sac", "device=" + device])
76 |
77 | # Test data collection
78 | env = get_env(env)
79 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg)
80 | collection_round(env, agent, max_steps=10)
81 | # Test training
82 | agent.train(batch_size=1, num_updates=1)
83 |
84 | # Test evaluation
85 | td = env.reset()
86 | td1 = agent.get_action(td)
87 | td2 = agent.get_action(td)
88 |
89 | assert not torch.allclose(td1["action"], td2["action"])
90 |
91 | agent.eval()
92 | td = env.reset()
93 | eval_td1 = agent.get_eval_action(td)
94 | eval_td2 = agent.get_eval_action(td)
95 |
96 | assert torch.allclose(eval_td1["action"], eval_td2["action"])
97 |
98 |
99 | @pytest.mark.parametrize(
100 | "env",
101 | ["mixed", "vec", "vec_goal"],
102 | )
103 | @pytest.mark.parametrize(
104 | "device",
105 | ["cpu", "cuda"],
106 | )
107 | def test_td3_agent(env, device):
108 | if torch.cuda.is_available() and device == "cuda":
109 | device = "cuda"
110 | else:
111 | device = "cpu"
112 | with initialize(config_path="../conf"):
113 | cfg = compose(config_name="config", overrides=["agent=td3", "device=" + device])
114 |
115 | # Test data collection
116 | env = get_env(env)
117 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg)
118 | collection_round(env, agent, max_steps=10)
119 |
120 | # Test training
121 | agent.train(batch_size=1, num_updates=1)
122 |
123 | # Test evaluation
124 | td = env.reset()
125 | td1 = agent.get_action(td)
126 | td2 = agent.get_action(td)
127 |
128 | assert not torch.allclose(td1["action"], td2["action"])
129 |
130 | agent.eval()
131 | td = env.reset()
132 | eval_td1 = agent.get_eval_action(td)
133 | eval_td2 = agent.get_eval_action(td)
134 |
135 | assert torch.allclose(eval_td1["action"], eval_td2["action"])
136 |
137 |
138 | @pytest.mark.parametrize(
139 | "env",
140 | ["mixed", "vec", "vec_goal"],
141 | )
142 | @pytest.mark.parametrize(
143 | "device",
144 | ["cpu", "cuda"],
145 | )
146 | def test_drq_agent(env, device):
147 | if torch.cuda.is_available() and device == "cuda":
148 | device = "cuda"
149 | else:
150 | device = "cpu"
151 | with initialize(config_path="../conf"):
152 | cfg = compose(
153 | config_name="config", overrides=["agent=droq", "device=" + device]
154 | )
155 |
156 | # Test data collection
157 | env = get_env(env)
158 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg)
159 | collection_round(env, agent, max_steps=10)
160 | # Test training
161 | agent.train(batch_size=1, num_updates=1)
162 |
163 | # Test evaluation
164 | td = env.reset()
165 | td1 = agent.get_action(td)
166 | td2 = agent.get_action(td)
167 |
168 | assert not torch.allclose(td1["action"], td2["action"])
169 |
170 | agent.eval()
171 | td = env.reset()
172 | eval_td1 = agent.get_eval_action(td)
173 | eval_td2 = agent.get_eval_action(td)
174 |
175 | assert torch.allclose(eval_td1["action"], eval_td2["action"])
176 |
177 |
178 | @pytest.mark.parametrize(
179 | "env",
180 | ["mixed", "vec", "vec_goal"],
181 | )
182 | @pytest.mark.parametrize(
183 | "device",
184 | ["cpu", "cuda"],
185 | )
186 | def test_iql_agent(env, device):
187 | if torch.cuda.is_available() and device == "cuda":
188 | device = "cuda"
189 | else:
190 | device = "cpu"
191 | with initialize(config_path="../conf"):
192 | cfg = compose(config_name="config", overrides=["agent=iql", "device=" + device])
193 |
194 | # Test data collection
195 | env = get_env(env)
196 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg)
197 | collection_round(env, agent, max_steps=10)
198 | # Test training
199 | agent.train(batch_size=1, num_updates=1)
200 |
201 | # Test evaluation
202 | td = env.reset()
203 | td1 = agent.get_action(td)
204 | td2 = agent.get_action(td)
205 |
206 | assert not torch.allclose(td1["action"], td2["action"])
207 |
208 | agent.eval()
209 | td = env.reset()
210 | eval_td1 = agent.get_eval_action(td)
211 | eval_td2 = agent.get_eval_action(td)
212 |
213 | assert torch.allclose(eval_td1["action"], eval_td2["action"])
214 |
215 |
216 | @pytest.mark.parametrize(
217 | "env",
218 | ["mixed", "vec", "vec_goal"],
219 | )
220 | @pytest.mark.parametrize(
221 | "device",
222 | ["cpu", "cuda"],
223 | )
224 | def test_cql_agent(env, device):
225 | if torch.cuda.is_available() and device == "cuda":
226 | device = "cuda"
227 | else:
228 | device = "cpu"
229 | with initialize(config_path="../conf"):
230 | cfg = compose(config_name="config", overrides=["agent=cql", "device=" + device])
231 |
232 | # Test data collection
233 | env = get_env(env)
234 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg)
235 | collection_round(env, agent, max_steps=10)
236 | # Test training
237 | agent.train(batch_size=1, num_updates=1)
238 |
239 | # Test evaluation
240 | td = env.reset()
241 | td1 = agent.get_action(td)
242 | td2 = agent.get_action(td)
243 |
244 | assert not torch.allclose(td1["action"], td2["action"])
245 |
246 | agent.eval()
247 | td = env.reset()
248 | eval_td1 = agent.get_eval_action(td)
249 | eval_td2 = agent.get_eval_action(td)
250 |
251 | assert torch.allclose(eval_td1["action"], eval_td2["action"])
252 |
253 |
254 | @pytest.mark.parametrize(
255 | "env",
256 | ["mixed", "vec", "vec_goal"],
257 | )
258 | @pytest.mark.parametrize(
259 | "device",
260 | ["cpu", "cuda"],
261 | )
262 | def test_bc_agent(env, device):
263 | if torch.cuda.is_available() and device == "cuda":
264 | device = "cuda"
265 | else:
266 | device = "cpu"
267 | with initialize(config_path="../conf"):
268 | cfg = compose(config_name="config", overrides=["agent=bc", "device=" + device])
269 |
270 | # Test data collection
271 | env = get_env(env)
272 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg)
273 | collection_round(env, agent, max_steps=10)
274 | # Test training
275 | agent.train(batch_size=1, num_updates=1)
276 |
277 | # Test evaluation
278 | agent.eval()
279 | td = env.reset()
280 | eval_td1 = agent.get_eval_action(td)
281 | eval_td2 = agent.get_eval_action(td)
282 |
283 | assert torch.allclose(eval_td1["action"], eval_td2["action"])
284 |
285 |
286 | @pytest.mark.parametrize(
287 | "env",
288 | ["mixed"],
289 | )
290 | @pytest.mark.parametrize(
291 | "img_shape",
292 | [(64, 64, 3), (128, 128, 3)],
293 | )
294 | @pytest.mark.parametrize(
295 | "device",
296 | ["cpu", "cuda"],
297 | )
298 | def test_mixed_obs_size_agent(env, device, img_shape):
299 | if torch.cuda.is_available() and device == "cuda":
300 | device = "cuda"
301 | else:
302 | device = "cpu"
303 | with initialize(config_path="../conf"):
304 | cfg = compose(config_name="config", overrides=["agent=td3", "device=" + device])
305 |
306 | # Test data collection
307 | env = get_env(env, img_shape)
308 | agent, _ = get_agent(env.action_spec, env.observation_spec, cfg)
309 | collection_round(env, agent, max_steps=10)
310 |
311 | # Test training
312 | agent.train(batch_size=1, num_updates=1)
313 |
314 | # Test evaluation
315 | td = env.reset()
316 | td1 = agent.get_action(td)
317 | td2 = agent.get_action(td)
318 |
319 | assert not torch.allclose(td1["action"], td2["action"])
320 |
321 | agent.eval()
322 | td = env.reset()
323 | eval_td1 = agent.get_eval_action(td)
324 | eval_td2 = agent.get_eval_action(td)
325 |
326 | assert torch.allclose(eval_td1["action"], eval_td2["action"])
327 |
--------------------------------------------------------------------------------
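The tests above are parametrized over observation type ("mixed", "vec", "vec_goal") and device; the cuda cases fall back to CPU when no GPU is available, so the suite also runs on CPU-only machines. A subset can be selected with pytest's -k filter, e.g. pytest tests/test_agents.py -k "sac and cpu".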
/tests/test_env_sim.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import torch
3 | from bricksrl.environments import make_env
4 | from experiments.helper.agents import get_agent
5 | from hydra import compose, initialize
6 |
7 | from tests.test_agents import collection_round
8 |
9 |
10 | @pytest.mark.parametrize(
11 | "agent",
12 | ["sac", "td3", "random"],
13 | )
14 | @pytest.mark.parametrize("env", ["walker_sim-v0", "roboarm_sim-v0"])
15 | @pytest.mark.parametrize(
16 | "device",
17 | ["cpu", "cuda"],
18 | )
19 | def test_sim_env_agents(agent, env, device):
20 | if torch.cuda.is_available() and device == "cuda":
21 | device = "cuda"
22 | else:
23 | device = "cpu"
24 | with initialize(config_path="../conf"):
25 | cfg = compose(
26 | config_name="config",
27 | overrides=["agent=" + agent, "device=" + device, "env=" + env],
28 | )
29 |
30 | # Create environment
31 | env, action_space, state_space = make_env(cfg)
32 | # Create agent
33 | agent, _ = get_agent(action_space, state_space, cfg)
34 | # Test data collection
35 | collection_round(env, agent, max_steps=10)
36 |
--------------------------------------------------------------------------------