├── .vscode ├── extensions.json └── settings.json ├── human_scene_transformer ├── images │ ├── hero.png │ └── architecture.png ├── config │ ├── pedestrians │ │ ├── dataset_params.gin │ │ ├── training_params.gin │ │ ├── model_params.gin │ │ └── metrics.gin │ ├── jrdb_challenge │ │ ├── training_params.gin │ │ ├── model_params.gin │ │ ├── metrics.gin │ │ └── dataset_params.gin │ └── jrdb │ │ ├── training_params.gin │ │ ├── model_params.gin │ │ ├── dataset_params.gin │ │ └── metrics.gin ├── __init__.py ├── model │ ├── README.md │ ├── embedding.py │ ├── head.py │ ├── output_distributions.py │ ├── multimodality_induction.py │ ├── agent_self_alignment.py │ ├── scene_encoder.py │ ├── model_params.py │ ├── preprocess.py │ ├── agent_feature_encoder.py │ ├── agent_encoder.py │ └── model.py ├── metrics │ ├── metrics.py │ ├── pos_nll.py │ ├── mean_angle_error.py │ └── ade.py ├── jrdb │ ├── torch_dataset.py │ ├── dataset_params.py │ ├── eval.py │ └── eval_keypoints.py ├── pedestrians │ ├── dataset_params.py │ └── eval.py ├── training_params.py ├── data │ ├── extract_robot_odometry_from_rosbag.py │ ├── README.md │ ├── math │ │ ├── math_types.py │ │ └── data_types.py │ ├── jrdb_train_detections_to_tracks.py │ ├── jrdb_preprocess_test.py │ ├── jrdb_preprocess_train.py │ └── utils.py ├── is_hidden_generators.py └── train.py ├── requirements.txt ├── .gitignore ├── CHANGELOG.md ├── CONTRIBUTING.md ├── .github └── workflows │ └── pytest_and_autopublish.yml ├── pyproject.toml ├── README.md └── LICENSE /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-python.black-formatter" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /human_scene_transformer/images/hero.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/human-scene-transformer/HEAD/human_scene_transformer/images/hero.png -------------------------------------------------------------------------------- /human_scene_transformer/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google-research/human-scene-transformer/HEAD/human_scene_transformer/images/architecture.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py>=1.4 2 | gin-config>=0.5 3 | numpy>=1.24 4 | open3d>=0.17 5 | pandas>=2.0 6 | scipy>=1.10 7 | tensorflow>=2.13 8 | tensorflow-datasets>=4.9 9 | tensorflow-graphics>=2021.12 10 | tensorflow-probability>=0.21 11 | tf-models-official>=2.5 12 | tqdm>=4.66 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | 4 | # Byte-compiled 5 | _pycache__/ 6 | .cache/ 7 | 8 | # Poetry, setuptools, PyPI distribution artifacts. 
9 | /*.egg-info 10 | .eggs/ 11 | build/ 12 | dist/ 13 | poetry.lock 14 | 15 | # Tests 16 | .pytest_cache/ 17 | 18 | # Type checking 19 | .pytype/ 20 | 21 | # Other 22 | *.DS_Store 23 | 24 | # PyCharm 25 | .idea 26 | -------------------------------------------------------------------------------- /human_scene_transformer/config/pedestrians/dataset_params.gin: -------------------------------------------------------------------------------- 1 | PedestriansDatasetParams.path = '' 2 | PedestriansDatasetParams.dataset = 'eth' 3 | PedestriansDatasetParams.train_config = 'train' # train, trainval 4 | PedestriansDatasetParams.eval_config = 'val' # val, test 5 | 6 | PedestriansDatasetParams.num_history_steps = 7 7 | PedestriansDatasetParams.num_steps = 20 8 | PedestriansDatasetParams.num_agents = 14 9 | PedestriansDatasetParams.timestep = 0.4 -------------------------------------------------------------------------------- /human_scene_transformer/config/jrdb_challenge/training_params.gin: -------------------------------------------------------------------------------- 1 | TrainingParams.batch_size = 64 2 | TrainingParams.shuffle_buffer_size = 10000 3 | TrainingParams.total_train_steps = 2e6 4 | TrainingParams.warmup_steps = 5e4 5 | TrainingParams.peak_learning_rate = 1e-4 6 | #TrainingParams.global_clipnorm = 1. 7 | TrainingParams.batches_per_train_step = 25000 8 | TrainingParams.batches_per_eval_step = 2000 9 | TrainingParams.eval_every_n_step = 1e4 10 | TrainingParams.loss = @MultimodalPositionNLLLoss -------------------------------------------------------------------------------- /human_scene_transformer/config/jrdb/training_params.gin: -------------------------------------------------------------------------------- 1 | TrainingParams.batch_size = 64 2 | TrainingParams.shuffle_buffer_size = 10000 3 | TrainingParams.total_train_steps = 5e6 4 | TrainingParams.warmup_steps = 5e4 5 | TrainingParams.peak_learning_rate = 1e-4 6 | #TrainingParams.global_clipnorm = 1. 7 | TrainingParams.batches_per_train_step = 25000 8 | TrainingParams.batches_per_eval_step = 2000 9 | TrainingParams.eval_every_n_step = 1e4 10 | TrainingParams.loss = @MinNLLPositionMixtureCategoricalCrossentropyLoss -------------------------------------------------------------------------------- /human_scene_transformer/config/pedestrians/training_params.gin: -------------------------------------------------------------------------------- 1 | TrainingParams.batch_size = 16 2 | TrainingParams.shuffle_buffer_size = 10000 3 | TrainingParams.total_train_steps = 1e6 4 | TrainingParams.warmup_steps = 5e4 5 | TrainingParams.peak_learning_rate = 5e-4 6 | #TrainingParams.global_clipnorm = 1. 7 | TrainingParams.batches_per_train_step = 25000 8 | TrainingParams.batches_per_eval_step = 2000 9 | TrainingParams.eval_every_n_step = 1e4 10 | TrainingParams.loss = @MinNLLPositionMixtureCategoricalCrossentropyLoss -------------------------------------------------------------------------------- /human_scene_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """human_scene_transformer API.""" 16 | 17 | # A new PyPI release will be pushed everytime `__version__` is increased 18 | # When changing this, also update the CHANGELOG.md 19 | __version__ = '0.1.0' 20 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.insertFinalNewline": true, 3 | "files.trimFinalNewlines": true, 4 | "files.trimTrailingWhitespace": true, 5 | "files.associations": { 6 | ".pylintrc": "ini" 7 | }, 8 | "python.testing.unittestEnabled": false, 9 | "python.testing.nosetestsEnabled": false, 10 | "python.testing.pytestEnabled": true, 11 | "python.linting.pylintUseMinimalCheckers": false, 12 | "[python]": { 13 | "editor.rulers": [80], 14 | "editor.tabSize": 2, 15 | "editor.defaultFormatter": "ms-python.black-formatter", 16 | "editor.formatOnSave": true, 17 | "editor.detectIndentation": false 18 | }, 19 | "python.formatting.provider": "none", 20 | "black-formatter.path": ["pyink"], 21 | "files.watcherExclude": { 22 | "**/.git/**": true 23 | }, 24 | "files.exclude": { 25 | "**/__pycache__": true, 26 | "**/.pytest_cache": true, 27 | "**/*.egg-info": true 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /human_scene_transformer/config/pedestrians/model_params.gin: -------------------------------------------------------------------------------- 1 | ModelParams.agents_position_key = 'agents/position' 2 | ModelParams.agents_feature_config = { 3 | 'agents/position': @AgentPositionEncoder, 4 | } 5 | ModelParams.hidden_size = 64 6 | ModelParams.feature_embedding_size = 128 7 | ModelParams.transformer_ff_dim = 64 8 | 9 | ModelParams.num_heads = 4 10 | ModelParams.num_modes = 20 11 | ModelParams.scene_encoder = None 12 | ModelParams.attn_architecture = ( 13 | 'self-attention', 14 | 'self-attention', 15 | 'multimodality_induction', 16 | 'self-attention', 17 | 'self-attention-mode', 18 | 'self-attention', 19 | 'self-attention-mode', 20 | ) 21 | ModelParams.mask_style = "has_historic_data" 22 | ModelParams.drop_prob = 0.2 23 | ModelParams.prediction_head = @Prediction2DPositionHeadLayer 24 | 25 | ModelParams.num_history_steps = 7 26 | ModelParams.num_steps = 20 27 | ModelParams.timestep = 0.4 28 | # Must be one of the classes in is_hidden_generators.py. 
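# @BPIsHiddenGenerator hides every agent's future timesteps (plain behavior
# prediction). For prediction conditioned on the 0th (robot) agent's future,
# is_hidden_generators.py also provides @CBPIsHiddenGenerator, e.g.:
# ModelParams.is_hidden_generator = @CBPIsHiddenGenerator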
29 | ModelParams.is_hidden_generator = @BPIsHiddenGenerator -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | 23 | 24 | ## [Unreleased] 25 | 26 | ## [0.1.0] - 2022-09-26 27 | 28 | * Initial release 29 | 30 | [Unreleased]: https://github.com/google-research/human-scene-transformer/compare/v0.1.0...HEAD 31 | [0.1.0]: https://github.com/google-research/human-scene-transformer/releases/tag/v0.1.0 32 | -------------------------------------------------------------------------------- /human_scene_transformer/model/README.md: -------------------------------------------------------------------------------- 1 | # Human Scene Transformer Architecture 2 | 3 | ![Human Scene Transformer Architecture](../images/architecture.png) 4 | 5 | Overview of the HST architecture: From the robot’s sensors we extract the scene context, the historic position tracks of each agent, and vision-based skeletal keypoints/head orientation when feasible. All features are encoded individually before the agent features are combined via cross-attention (XA) using a learned query tensor. The resulting agent-timestep tokens are passed to our Agent Self-Alignment layer, which enables the use of subsequent full self-attention (FSA) layers. Embedded scene context is attended to via cross-attention (XA). After multimodality is induced and further FSA layers are applied, the model outputs the parameters of a Normal distribution for each agent at each prediction timestep. We can represent the full output structure as a Gaussian Mixture Model (formula in bottom right) over all possible futures, where the mixture coefficients w come from the Multimodality Induction. Both cross-attention (XA) and full self-attention layers use the Transformer layer (top right) with different input configurations for Query (Q), Key (K), and Value (V). -------------------------------------------------------------------------------- /human_scene_transformer/config/jrdb_challenge/model_params.gin: -------------------------------------------------------------------------------- 1 | ModelParams.agents_position_key = 'agents/position' 2 | ModelParams.agents_feature_config = { 3 | 'agents/position': @AgentPositionEncoder, 4 | 'agents/keypoints': @AgentKeypointsEncoder, 5 | #'agents/gaze': @Agent2DOrientationEncoder, 6 | } 7 | ModelParams.hidden_size = 128 8 | ModelParams.feature_embedding_size = 128 9 | ModelParams.transformer_ff_dim = 128 10 | 11 | ModelParams.num_heads = 4 12 | ModelParams.num_modes = 4 13 | ModelParams.scene_encoder = @PointCloudEncoderLayer 14 | ModelParams.attn_architecture = ( 15 | 'self-attention', 16 | 'self-attention', 17 | 'cross-attention', 18 | 'multimodality_induction', 19 | 'self-attention', 20 | 'self-attention-mode', 21 | 'self-attention', 22 | 'self-attention-mode', 23 | ) 24 | ModelParams.mask_style = "has_historic_data" 25 | ModelParams.drop_prob = 0.1 26 | ModelParams.prediction_head = @Prediction2DPositionHeadLayer 27 | 28 | ModelParams.num_history_steps = 11 29 | ModelParams.num_steps = 24 30 | ModelParams.timestep = 0.4 31 | # Must be one of the classes in is_hidden_generators.py.
32 | ModelParams.is_hidden_generator = @BPIsHiddenGenerator -------------------------------------------------------------------------------- /human_scene_transformer/config/jrdb/model_params.gin: -------------------------------------------------------------------------------- 1 | ModelParams.agents_position_key = 'agents/position' 2 | ModelParams.agents_feature_config = { 3 | 'agents/position': @AgentPositionEncoder, 4 | 'agents/keypoints': @AgentKeypointsEncoder, 5 | #'agents/gaze': @Agent2DOrientationEncoder, 6 | } 7 | ModelParams.hidden_size = 128 8 | ModelParams.feature_embedding_size = 128 9 | ModelParams.transformer_ff_dim = 128 10 | 11 | ModelParams.num_heads = 4 12 | ModelParams.num_modes = 6 13 | #ModelParams.scene_encoder = @PointCloudEncoderLayer 14 | ModelParams.attn_architecture = ( 15 | 'self-attention', 16 | 'self-attention', 17 | #'cross-attention', 18 | 'multimodality_induction', 19 | 'self-attention', 20 | 'self-attention-mode', 21 | 'self-attention', 22 | 'self-attention-mode', 23 | ) 24 | ModelParams.mask_style = "has_historic_data" 25 | ModelParams.drop_prob = 0.1 26 | ModelParams.prediction_head = @Prediction2DPositionHeadLayer 27 | 28 | ModelParams.num_history_steps = 6 29 | ModelParams.num_steps = 19 30 | ModelParams.timestep = 0.33333333333334 31 | # Must be one of the classes in is_hidden_generators.py. 32 | ModelParams.is_hidden_generator = @BPIsHiddenGenerator -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | The primary purpose of this repository is to provide a record of published work 4 | and we do not expect to receive pull requests. If you'd nonetheless like to 5 | contribute, there are just a few small guidelines you need to follow. 6 | 7 | ## Contributor License Agreement 8 | 9 | Contributions to this project must be accompanied by a Contributor License 10 | Agreement (CLA). You (or your employer) retain the copyright to your 11 | contribution; this simply gives us permission to use and redistribute your 12 | contributions as part of the project. Head over to 13 | https://cla.developers.google.com/ to see your current agreements on file or 14 | to sign a new one. 15 | 16 | You generally only need to submit a CLA once, so if you've already submitted one 17 | (even if it was for a different project), you probably don't need to do it 18 | again. 19 | 20 | ## Code Reviews 21 | 22 | All submissions, including submissions by project members, require review. We 23 | use GitHub pull requests for this purpose. Consult 24 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 25 | information on using pull requests. 26 | 27 | ## Community Guidelines 28 | 29 | This project follows 30 | [Google's Open Source Community Guidelines](https://opensource.google/conduct/). 31 | -------------------------------------------------------------------------------- /human_scene_transformer/metrics/metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Entry point file for metrics.""" 16 | 17 | import gin 18 | 19 | from human_scene_transformer.metrics import ade # pylint: disable=unused-import 20 | from human_scene_transformer.metrics import mean_angle_error # pylint: disable=unused-import 21 | from human_scene_transformer.metrics import pos_nll # pylint: disable=unused-import 22 | 23 | import tensorflow as tf 24 | 25 | 26 | # Create a "stock" subclass from Keras' Mean for gin. 27 | # Somehow, calling gin.external_configurable() did not work. 28 | @gin.configurable 29 | class Mean(tf.keras.metrics.Mean): 30 | 31 | def __init__(self, name="mean", dtype=None): 32 | super().__init__(name=name, dtype=dtype) 33 | -------------------------------------------------------------------------------- /human_scene_transformer/jrdb/torch_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """PyTorch Dataset Wrapper for JRDB Dataset.""" 16 | 17 | from human_scene_transformer.jrdb import dataset_params as jrdb_dataset_params 18 | from human_scene_transformer.jrdb import input_fn 19 | 20 | import tensorflow_datasets as tfds 21 | 22 | import torch 23 | 24 | 25 | class JRDBPredictionDataset(torch.utils.data.IterableDataset): 26 | """PyTorch Dataset Wrapper for JRDB Dataset.""" 27 | 28 | def __init__(self, 29 | dataset_params: jrdb_dataset_params.JRDBDatasetParams, 30 | train: bool = True): 31 | self.dataset_params = dataset_params 32 | self.train = train 33 | 34 | def __iter__(self): 35 | if self.train: 36 | tfds_iter = iter( 37 | tfds.as_numpy(input_fn.get_train_dataset(self.dataset_params))) 38 | else: 39 | tfds_iter = iter( 40 | tfds.as_numpy(input_fn.get_eval_dataset(self.dataset_params))) 41 | 42 | return tfds_iter 43 | -------------------------------------------------------------------------------- /human_scene_transformer/pedestrians/dataset_params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """JRDB dataset parameter class. 16 | 17 | The module contains a dataclasses used to configure the dataset. 18 | """ 19 | 20 | import gin 21 | 22 | 23 | @gin.configurable 24 | class PedestriansDatasetParams(object): 25 | """This object describes parameters of the dataset. 26 | 27 | The default values represent the proxy dataset. 28 | """ 29 | 30 | def __init__( 31 | self, 32 | path=None, 33 | dataset='eth', 34 | train_config='train', 35 | eval_config='val', 36 | num_history_steps=7, 37 | num_steps=20, 38 | num_agents=16, 39 | timestep=0.4, 40 | ): 41 | 42 | self.path = path 43 | self.dataset = dataset 44 | self.train_config = train_config 45 | self.eval_config = eval_config 46 | self.num_history_steps = num_history_steps 47 | self.num_steps = num_steps 48 | self.num_agents = num_agents 49 | self.timestep = timestep 50 | -------------------------------------------------------------------------------- /.github/workflows/pytest_and_autopublish.yml: -------------------------------------------------------------------------------- 1 | name: Unittests & Auto-publish 2 | 3 | # Allow to trigger the workflow manually (e.g. when deps changes) 4 | on: [push, workflow_dispatch] 5 | 6 | jobs: 7 | pytest-job: 8 | runs-on: ubuntu-latest 9 | timeout-minutes: 30 10 | 11 | concurrency: 12 | group: ${{ github.workflow }}-${{ github.ref }} 13 | cancel-in-progress: true 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | 18 | # Install deps 19 | - uses: actions/setup-python@v4 20 | with: 21 | python-version: '3.10' 22 | # Uncomment to cache of pip dependencies (if tests too slow) 23 | # cache: pip 24 | # cache-dependency-path: '**/pyproject.toml' 25 | 26 | - run: pip install --upgrade pip 27 | - run: pip --version 28 | - run: pip install -e .[dev] 29 | - run: pip freeze 30 | 31 | # Run tests (in parallel) 32 | # - name: Run core tests 33 | # run: pytest -vv -n auto 34 | 35 | # # Auto-publish when version is increased 36 | # publish-job: 37 | # # Only try to publish if: 38 | # # * Repo is self (prevents running from forks) 39 | # # * Branch is `main` 40 | # if: | 41 | # github.repository == 'google-research/human-scene-transformer' 42 | # && github.ref == 'refs/heads/main' 43 | # needs: pytest-job # Only publish after tests are successful 44 | # runs-on: ubuntu-latest 45 | # permissions: 46 | # contents: write 47 | # timeout-minutes: 30 48 | 49 | # steps: 50 | # # Publish the package (if local `__version__` > pip version) 51 | # - uses: etils-actions/pypi-auto-publish@v1 52 | # with: 53 | # pypi-token: ${{ secrets.PYPI_API_TOKEN }} 54 | # gh-token: ${{ secrets.GITHUB_TOKEN }} 55 | # parse-changelog: true 56 | -------------------------------------------------------------------------------- /human_scene_transformer/model/embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains embedding layers.""" 16 | 17 | import tensorflow as tf 18 | 19 | 20 | class SinusoidalEmbeddingLayer(tf.keras.layers.Layer): 21 | """Sinusoidal Positional Embedding for xyz and time.""" 22 | 23 | def __init__(self, min_freq=4, max_freq=256, hidden_size=256): 24 | super().__init__() 25 | self.min_freq = float(min_freq) 26 | self.max_freq = float(max_freq) 27 | self.hidden_size = hidden_size 28 | if hidden_size % 2 != 0: 29 | raise ValueError(f'hidden_size ({hidden_size}) must be divisible by 2.') 30 | self.num_freqs_int32 = hidden_size // 2 31 | self.num_freqs = tf.cast(self.num_freqs_int32, dtype=tf.float32) 32 | 33 | def build(self, input_shape): 34 | log_freq_increment = ( 35 | tf.math.log(float(self.max_freq) / float(self.min_freq)) / 36 | tf.maximum(1.0, self.num_freqs - 1)) 37 | # [num_freqs] 38 | self.inv_freqs = self.min_freq * tf.exp( 39 | tf.range(self.num_freqs, dtype=tf.float32) * -log_freq_increment) 40 | 41 | def call(self, input_tensor): 42 | # [..., num_freqs] 43 | input_tensor = tf.repeat( 44 | input_tensor[..., tf.newaxis], self.num_freqs_int32, axis=-1) 45 | # [..., h] 46 | embedded = tf.concat([ 47 | tf.sin(input_tensor * self.inv_freqs), 48 | tf.cos(input_tensor * self.inv_freqs) 49 | ], 50 | axis=-1) 51 | return embedded 52 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | # Project metadata. Available keys are documented at: 3 | # https://packaging.python.org/en/latest/specifications/declaring-project-metadata 4 | name = "human_scene_transformer" 5 | description = "Human Scene Transformer: Trajectory Prediction in Human-Centered Environments."
6 | readme = "README.md" 7 | requires-python = ">=3.10" 8 | license = {file = "LICENSE"} 9 | authors = [ 10 | {name = "Tim Salzmann", email="Tim.Salzmann@tum.de"}, 11 | {name = "Lewis Chiang"}, 12 | {name = "Markus Ryll"}, 13 | {name = "Dorsa Sadigh"}, 14 | {name = "Carolina Parada"}, 15 | {name = "Alex Bewley", email="bewley@google.com"}] 16 | classifiers = [ # List of https://pypi.org/classifiers/ 17 | "License :: OSI Approved :: Apache Software License", 18 | "Intended Audience :: Science/Research", 19 | ] 20 | keywords = [] 21 | 22 | # pip dependencies of the project 23 | # Installed locally with `pip install -e .` 24 | dependencies = [ 25 | "absl-py>=1.4", 26 | "gin-config>=0.5", 27 | "numpy>=1.24", 28 | "open3d>=0.17", 29 | "pandas>=2.0", 30 | "scipy>=1.10", 31 | "tensorflow>=2.13", 32 | "tensorflow-datasets>=4.9", 33 | "tensorflow-graphics>=2021.12", 34 | "tensorflow-probability>=0.21", 35 | "tf-models-official>=2.5", 36 | "tqdm>=4.66" 37 | ] 38 | 39 | # `version` is automatically set by flit to use `human_scene_transformer.__version__` 40 | dynamic = ["version"] 41 | 42 | [project.urls] 43 | homepage = "https://human-scene-transformer.github.io/" 44 | repository = "https://github.com/google-research/human-scene-transformer" 45 | changelog = "https://github.com/google-research/human-scene-transformer/blob/main/CHANGELOG.md" 46 | # documentation = "" 47 | 48 | [project.optional-dependencies] 49 | # Development deps (unittest, linting, formating,...) 50 | # Installed through `pip install -e .[dev]` 51 | dev = [ 52 | "pytest", 53 | "pytest-xdist", 54 | "pylint>=2.6.0", 55 | "pyink", 56 | ] 57 | 58 | [tool.pyink] 59 | # Formatting configuration to follow Google style-guide 60 | line-length = 80 61 | preview = true 62 | pyink-indentation = 2 63 | pyink-use-majority-quotes = true 64 | 65 | [build-system] 66 | # Build system specify which backend is used to build/install the project (flit, 67 | # poetry, setuptools,...). All backends are supported by `pip install` 68 | requires = ["flit_core >=3.8,<4"] 69 | build-backend = "flit_core.buildapi" 70 | 71 | [tool.flit.sdist] 72 | # Flit specific options (files to exclude from the PyPI package). 73 | # If using another build backend (setuptools, poetry), you can remove this 74 | # section. 75 | exclude = [ 76 | # Do not release tests files on PyPI 77 | "**/*_test.py", 78 | ] 79 | -------------------------------------------------------------------------------- /human_scene_transformer/jrdb/dataset_params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """JRDB dataset parameter class. 16 | 17 | The module contains a dataclasses used to configure the dataset. 18 | """ 19 | 20 | import gin 21 | 22 | 23 | @gin.configurable 24 | class JRDBDatasetParams(object): 25 | """This object describes parameters of the dataset. 
26 | 27 | The default values represent the proxy dataset. 28 | """ 29 | 30 | def __init__( 31 | self, 32 | path=None, 33 | train_scenes=None, 34 | eval_scenes=None, 35 | features=None, 36 | train_split=(0., 1.), 37 | eval_split=(0., 1.), 38 | num_history_steps=6, 39 | num_steps=19, 40 | num_agents=8, 41 | timestep=0.33333333333334, 42 | subsample=5, 43 | min_distance_to_robot=7.0, 44 | num_pointcloud_points=512, 45 | ): 46 | """Initialize the DatasetParam object. 47 | 48 | Args: 49 | path: Location of dataset(s). 50 | train_scenes: Scenes to use in the train dataset. 51 | eval_scenes: Scenes to use in the eval dataset. 52 | features: The features to load from dataset file. 53 | train_split: Split to use from the train scenes in the train dataset. 54 | eval_split: Split to use from the eval scenes in the eval dataset. 55 | num_history_steps: The number of history timesteps. 56 | num_steps: The total number of timesteps. 57 | num_agents: The max number of agents in the dataset. 58 | timestep: The interval between timesteps. 59 | subsample: Subsample JRDB dataset by this frequency. 60 | min_distance_to_robot: Filter out agents with larger 61 | minimum distance to robot in selected window. 62 | num_pointcloud_points: Number of sampled pointcloud points. 63 | """ 64 | 65 | self.path = path 66 | self.train_scenes = train_scenes 67 | self.eval_scenes = eval_scenes 68 | self.features = features 69 | self.train_split = train_split 70 | self.eval_split = eval_split 71 | self.num_history_steps = num_history_steps 72 | self.num_steps = num_steps 73 | self.num_agents = num_agents 74 | self.timestep = timestep 75 | self.subsample = subsample 76 | self.min_distance_to_robot = min_distance_to_robot 77 | self.num_pointcloud_points = num_pointcloud_points 78 | -------------------------------------------------------------------------------- /human_scene_transformer/config/jrdb/dataset_params.gin: -------------------------------------------------------------------------------- 1 | TRAIN_SCENES = ['bytes-cafe-2019-02-07_0', 2 | 'clark-center-2019-02-28_0', 3 | 'clark-center-intersection-2019-02-28_0', 4 | 'cubberly-auditorium-2019-04-22_0', 5 | 'gates-159-group-meeting-2019-04-03_0', 6 | 'gates-ai-lab-2019-02-08_0', 7 | 'gates-to-clark-2019-02-28_1', 8 | 'hewlett-packard-intersection-2019-01-24_0', 9 | 'huang-basement-2019-01-25_0', 10 | 'huang-lane-2019-02-12_0', 11 | 'memorial-court-2019-03-16_0', 12 | 'meyer-green-2019-03-16_0', 13 | 'packard-poster-session-2019-03-20_0', 14 | 'packard-poster-session-2019-03-20_1', 15 | 'stlc-111-2019-04-19_0', 16 | 'svl-meeting-gates-2-2019-04-08_0', 17 | 'tressider-2019-03-16_0', 18 | 'tressider-2019-03-16_1', 19 | 'cubberly-auditorium-2019-04-22_1_test', 20 | 'discovery-walk-2019-02-28_0_test', 21 | 'food-trucks-2019-02-12_0_test', 22 | 'gates-ai-lab-2019-04-17_0_test', 23 | 'gates-foyer-2019-01-17_0_test', 24 | 'gates-to-clark-2019-02-28_0_test', 25 | 'hewlett-class-2019-01-23_1_test', 26 | 'huang-2-2019-01-25_1_test', 27 | 'indoor-coupa-cafe-2019-02-06_0_test', 28 | 'lomita-serra-intersection-2019-01-30_0_test', 29 | 'nvidia-aud-2019-01-25_0_test', 30 | 'nvidia-aud-2019-04-18_1_test', 31 | 'outdoor-coupa-cafe-2019-02-06_0_test', 32 | 'quarry-road-2019-02-28_0_test', 33 | 'stlc-111-2019-04-19_1_test', 34 | 'stlc-111-2019-04-19_2_test', 35 | 'tressider-2019-04-26_0_test', 36 | 'tressider-2019-04-26_1_test'] 37 | 38 | TEST_SCENES = ['clark-center-2019-02-28_1', 39 | 'forbes-cafe-2019-01-22_0', 40 | 'gates-basement-elevators-2019-01-17_1', 41 | 'huang-2-2019-01-25_0', 42 | 
'jordan-hall-2019-04-22_0', 43 | 'nvidia-aud-2019-04-18_0', 44 | 'packard-poster-session-2019-03-20_2', 45 | 'svl-meeting-gates-2-2019-04-08_1', 46 | 'tressider-2019-04-26_2', 47 | 'discovery-walk-2019-02-28_1_test', 48 | 'gates-basement-elevators-2019-01-17_0_test', 49 | 'hewlett-class-2019-01-23_0_test', 50 | 'huang-intersection-2019-01-22_0_test', 51 | 'meyer-green-2019-03-16_1_test', 52 | 'nvidia-aud-2019-04-18_2_test', 53 | 'serra-street-2019-01-30_0_test', 54 | 'tressider-2019-03-16_2_test', 55 | 'tressider-2019-04-26_3_test'] 56 | 57 | 58 | JRDBDatasetParams.path = '' 59 | 60 | JRDBDatasetParams.train_scenes = %TRAIN_SCENES 61 | JRDBDatasetParams.eval_scenes = %TEST_SCENES 62 | JRDBDatasetParams.features = [ 63 | 'agents/position', 64 | 'agents/keypoints', 65 | 'robot/position', 66 | #'robot/orientation', 67 | #'scene/pc' 68 | ] 69 | 70 | JRDBDatasetParams.train_split = (0., 1.0) 71 | JRDBDatasetParams.eval_split = (0.0, 1.0) 72 | 73 | 74 | JRDBDatasetParams.num_history_steps = 6 75 | JRDBDatasetParams.num_steps = 19 76 | JRDBDatasetParams.num_agents = 8 77 | JRDBDatasetParams.timestep = 0.33333333333334 78 | 79 | JRDBDatasetParams.subsample = 5 80 | JRDBDatasetParams.num_pointcloud_points = 512 81 | 82 | JRDBDatasetParams.min_distance_to_robot = 7.0 -------------------------------------------------------------------------------- /human_scene_transformer/config/jrdb/metrics.gin: -------------------------------------------------------------------------------- 1 | # All available metrics. 2 | min_ade/metrics.ade.MinADE.cutoff_seconds = None 3 | min_ade1s/metrics.ade.MinADE.cutoff_seconds = 1.0 4 | min_ade2s/metrics.ade.MinADE.cutoff_seconds = 2.0 5 | min_ade3s/metrics.ade.MinADE.cutoff_seconds = 3.0 6 | min_ade4s/metrics.ade.MinADE.cutoff_seconds = 4.0 7 | 8 | ml_ade/metrics.ade.MLADE.cutoff_seconds = None 9 | ml_ade1s/metrics.ade.MLADE.cutoff_seconds = 1.0 10 | ml_ade2s/metrics.ade.MLADE.cutoff_seconds = 2.0 11 | ml_ade3s/metrics.ade.MLADE.cutoff_seconds = 3.0 12 | ml_ade4s/metrics.ade.MLADE.cutoff_seconds = 4.0 13 | 14 | pos_nll/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = None 15 | pos_nll1s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 1.0 16 | pos_nll2s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 2.0 17 | pos_nll3s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 3.0 18 | pos_nll4s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 4.0 19 | 20 | # Training metrics. 21 | get_metrics.train_metrics = { 22 | 'loss': @metrics.Mean, 23 | 'loss_position': @metrics.Mean, 24 | 'loss_orientation': @metrics.Mean, 25 | 26 | 'min_ade': @min_ade/metrics.ade.MinADE, 27 | 'min_ade1s': @min_ade1s/metrics.ade.MinADE, 28 | 'min_ade2s': @min_ade2s/metrics.ade.MinADE, 29 | 'min_ade3s': @min_ade3s/metrics.ade.MinADE, 30 | 'min_ade4s': @min_ade4s/metrics.ade.MinADE, 31 | 32 | 'ml_ade': @ml_ade/metrics.ade.MLADE, 33 | 'ml_ade1s': @ml_ade1s/metrics.ade.MLADE, 34 | 'ml_ade2s': @ml_ade2s/metrics.ade.MLADE, 35 | 'ml_ade3s': @ml_ade3s/metrics.ade.MLADE, 36 | 'ml_ade4s': @ml_ade4s/metrics.ade.MLADE, 37 | 38 | 'pos_nll': @pos_nll/metrics.pos_nll.PositionNegativeLogLikelihood, 39 | 'pos_nll1s': @pos_nll1s/metrics.pos_nll.PositionNegativeLogLikelihood, 40 | 'pos_nll2s': @pos_nll2s/metrics.pos_nll.PositionNegativeLogLikelihood, 41 | 'pos_nll3s': @pos_nll3s/metrics.pos_nll.PositionNegativeLogLikelihood, 42 | 'pos_nll4s': @pos_nll4s/metrics.pos_nll.PositionNegativeLogLikelihood, 43 | } 44 | 45 | # Eval metrics. 
46 | get_metrics.eval_metrics = { 47 | 'loss': @metrics.Mean, 48 | 'loss_position': @metrics.Mean, 49 | 'loss_orientation': @metrics.Mean, 50 | 51 | 'min_ade': @min_ade/metrics.ade.MinADE, 52 | 'min_ade1s': @min_ade1s/metrics.ade.MinADE, 53 | 'min_ade2s': @min_ade2s/metrics.ade.MinADE, 54 | 'min_ade3s': @min_ade3s/metrics.ade.MinADE, 55 | 'min_ade4s': @min_ade4s/metrics.ade.MinADE, 56 | 57 | 'ml_ade': @ml_ade/metrics.ade.MLADE, 58 | 'ml_ade1s': @ml_ade1s/metrics.ade.MLADE, 59 | 'ml_ade2s': @ml_ade2s/metrics.ade.MLADE, 60 | 'ml_ade3s': @ml_ade3s/metrics.ade.MLADE, 61 | 'ml_ade4s': @ml_ade4s/metrics.ade.MLADE, 62 | 63 | 'pos_nll': @pos_nll/metrics.pos_nll.PositionNegativeLogLikelihood, 64 | 'pos_nll1s': @pos_nll1s/metrics.pos_nll.PositionNegativeLogLikelihood, 65 | 'pos_nll2s': @pos_nll2s/metrics.pos_nll.PositionNegativeLogLikelihood, 66 | 'pos_nll3s': @pos_nll3s/metrics.pos_nll.PositionNegativeLogLikelihood, 67 | 'pos_nll4s': @pos_nll4s/metrics.pos_nll.PositionNegativeLogLikelihood, 68 | } -------------------------------------------------------------------------------- /human_scene_transformer/config/jrdb_challenge/metrics.gin: -------------------------------------------------------------------------------- 1 | # All available metrics. 2 | min_ade/metrics.ade.MinADE.cutoff_seconds = None 3 | min_ade1s/metrics.ade.MinADE.cutoff_seconds = 1.0 4 | min_ade2s/metrics.ade.MinADE.cutoff_seconds = 2.0 5 | min_ade3s/metrics.ade.MinADE.cutoff_seconds = 3.0 6 | min_ade4s/metrics.ade.MinADE.cutoff_seconds = 4.0 7 | 8 | ml_ade/metrics.ade.MLADE.cutoff_seconds = None 9 | ml_ade1s/metrics.ade.MLADE.cutoff_seconds = 1.0 10 | ml_ade2s/metrics.ade.MLADE.cutoff_seconds = 2.0 11 | ml_ade3s/metrics.ade.MLADE.cutoff_seconds = 3.0 12 | ml_ade4s/metrics.ade.MLADE.cutoff_seconds = 4.8 13 | 14 | pos_nll/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = None 15 | pos_nll1s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 1.0 16 | pos_nll2s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 2.0 17 | pos_nll3s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 3.0 18 | pos_nll4s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 4.8 19 | 20 | # Training metrics. 21 | get_metrics.train_metrics = { 22 | 'loss': @metrics.Mean, 23 | 'loss_position': @metrics.Mean, 24 | 'loss_orientation': @metrics.Mean, 25 | 26 | 'min_ade': @min_ade/metrics.ade.MinADE, 27 | 'min_ade1s': @min_ade1s/metrics.ade.MinADE, 28 | 'min_ade2s': @min_ade2s/metrics.ade.MinADE, 29 | 'min_ade3s': @min_ade3s/metrics.ade.MinADE, 30 | 'min_ade4s': @min_ade4s/metrics.ade.MinADE, 31 | 32 | 'ml_ade': @ml_ade/metrics.ade.MLADE, 33 | 'ml_ade1s': @ml_ade1s/metrics.ade.MLADE, 34 | 'ml_ade2s': @ml_ade2s/metrics.ade.MLADE, 35 | 'ml_ade3s': @ml_ade3s/metrics.ade.MLADE, 36 | 'ml_ade4s': @ml_ade4s/metrics.ade.MLADE, 37 | 38 | 'pos_nll': @pos_nll/metrics.pos_nll.PositionNegativeLogLikelihood, 39 | 'pos_nll1s': @pos_nll1s/metrics.pos_nll.PositionNegativeLogLikelihood, 40 | 'pos_nll2s': @pos_nll2s/metrics.pos_nll.PositionNegativeLogLikelihood, 41 | 'pos_nll3s': @pos_nll3s/metrics.pos_nll.PositionNegativeLogLikelihood, 42 | 'pos_nll4s': @pos_nll4s/metrics.pos_nll.PositionNegativeLogLikelihood, 43 | } 44 | 45 | # Eval metrics. 
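# Note: ml_ade4s and pos_nll4s above use a 4.8s cutoff rather than 4.0s; 4.8s
# is the full challenge prediction horizon, (24 - 11 - 1) * 0.4s = 4.8s
# (see config/jrdb_challenge/model_params.gin).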
46 | get_metrics.eval_metrics = { 47 | 'loss': @metrics.Mean, 48 | 'loss_position': @metrics.Mean, 49 | 'loss_orientation': @metrics.Mean, 50 | 51 | 'min_ade': @min_ade/metrics.ade.MinADE, 52 | 'min_ade1s': @min_ade1s/metrics.ade.MinADE, 53 | 'min_ade2s': @min_ade2s/metrics.ade.MinADE, 54 | 'min_ade3s': @min_ade3s/metrics.ade.MinADE, 55 | 'min_ade4s': @min_ade4s/metrics.ade.MinADE, 56 | 57 | 'ml_ade': @ml_ade/metrics.ade.MLADE, 58 | 'ml_ade1s': @ml_ade1s/metrics.ade.MLADE, 59 | 'ml_ade2s': @ml_ade2s/metrics.ade.MLADE, 60 | 'ml_ade3s': @ml_ade3s/metrics.ade.MLADE, 61 | 'ml_ade4s': @ml_ade4s/metrics.ade.MLADE, 62 | 63 | 'pos_nll': @pos_nll/metrics.pos_nll.PositionNegativeLogLikelihood, 64 | 'pos_nll1s': @pos_nll1s/metrics.pos_nll.PositionNegativeLogLikelihood, 65 | 'pos_nll2s': @pos_nll2s/metrics.pos_nll.PositionNegativeLogLikelihood, 66 | 'pos_nll3s': @pos_nll3s/metrics.pos_nll.PositionNegativeLogLikelihood, 67 | 'pos_nll4s': @pos_nll4s/metrics.pos_nll.PositionNegativeLogLikelihood, 68 | } -------------------------------------------------------------------------------- /human_scene_transformer/config/pedestrians/metrics.gin: -------------------------------------------------------------------------------- 1 | # All available metrics. 2 | min_ade/metrics.ade.MinADE.cutoff_seconds = None 3 | min_ade1s/metrics.ade.MinADE.cutoff_seconds = 1.0 4 | min_ade2s/metrics.ade.MinADE.cutoff_seconds = 2.0 5 | min_ade3s/metrics.ade.MinADE.cutoff_seconds = 3.0 6 | min_ade5s/metrics.ade.MinADE.cutoff_seconds = 5.0 7 | 8 | ml_ade/metrics.ade.MLADE.cutoff_seconds = None 9 | ml_ade1s/metrics.ade.MLADE.cutoff_seconds = 1.0 10 | ml_ade2s/metrics.ade.MLADE.cutoff_seconds = 2.0 11 | ml_ade3s/metrics.ade.MLADE.cutoff_seconds = 3.0 12 | ml_ade5s/metrics.ade.MLADE.cutoff_seconds = 5.0 13 | 14 | pos_nll/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = None 15 | pos_nll1s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 1.0 16 | pos_nll2s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 2.0 17 | pos_nll3s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 3.0 18 | pos_nll5s/metrics.pos_nll.PositionNegativeLogLikelihood.cutoff_seconds = 5.0 19 | 20 | # Training metrics. 21 | get_metrics.train_metrics = { 22 | 'loss': @metrics.Mean, 23 | 'loss_position': @metrics.Mean, 24 | 'loss_orientation': @metrics.Mean, 25 | 26 | 'min_ade': @min_ade/metrics.ade.MinADE, 27 | 'min_ade1s': @min_ade1s/metrics.ade.MinADE, 28 | 'min_ade2s': @min_ade2s/metrics.ade.MinADE, 29 | 'min_ade3s': @min_ade3s/metrics.ade.MinADE, 30 | #'min_ade5s': @min_ade5s/metrics.ade.MinADE, 31 | 32 | 'ml_ade': @ml_ade/metrics.ade.MLADE, 33 | 'ml_ade1s': @ml_ade1s/metrics.ade.MLADE, 34 | 'ml_ade2s': @ml_ade2s/metrics.ade.MLADE, 35 | 'ml_ade3s': @ml_ade3s/metrics.ade.MLADE, 36 | #'ml_ade5s': @ml_ade5s/metrics.ade.MLADE, 37 | 38 | 'pos_nll': @pos_nll/metrics.pos_nll.PositionNegativeLogLikelihood, 39 | 'pos_nll1s': @pos_nll1s/metrics.pos_nll.PositionNegativeLogLikelihood, 40 | 'pos_nll2s': @pos_nll2s/metrics.pos_nll.PositionNegativeLogLikelihood, 41 | 'pos_nll3s': @pos_nll3s/metrics.pos_nll.PositionNegativeLogLikelihood, 42 | #'pos_nll5s': @pos_nll5s/metrics.pos_nll.PositionNegativeLogLikelihood, 43 | } 44 | 45 | # Eval metrics. 
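# Note: the 5s variants defined above stay commented out in the metric dicts
# because the pedestrians prediction horizon is only (20 - 7 - 1) * 0.4s = 4.8s
# (see config/pedestrians/model_params.gin).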
46 | get_metrics.eval_metrics = { 47 | 'loss': @metrics.Mean, 48 | 'loss_position': @metrics.Mean, 49 | 'loss_orientation': @metrics.Mean, 50 | 51 | 'min_ade': @min_ade/metrics.ade.MinADE, 52 | 'min_ade1s': @min_ade1s/metrics.ade.MinADE, 53 | 'min_ade2s': @min_ade2s/metrics.ade.MinADE, 54 | 'min_ade3s': @min_ade3s/metrics.ade.MinADE, 55 | #'min_ade5s': @min_ade5s/metrics.ade.MinADE, 56 | 57 | 'ml_ade': @ml_ade/metrics.ade.MLADE, 58 | 'ml_ade1s': @ml_ade1s/metrics.ade.MLADE, 59 | 'ml_ade2s': @ml_ade2s/metrics.ade.MLADE, 60 | 'ml_ade3s': @ml_ade3s/metrics.ade.MLADE, 61 | #'ml_ade5s': @ml_ade5s/metrics.ade.MLADE, 62 | 63 | 'pos_nll': @pos_nll/metrics.pos_nll.PositionNegativeLogLikelihood, 64 | 'pos_nll1s': @pos_nll1s/metrics.pos_nll.PositionNegativeLogLikelihood, 65 | 'pos_nll2s': @pos_nll2s/metrics.pos_nll.PositionNegativeLogLikelihood, 66 | 'pos_nll3s': @pos_nll3s/metrics.pos_nll.PositionNegativeLogLikelihood, 67 | #'pos_nll5s': @pos_nll5s/metrics.pos_nll.PositionNegativeLogLikelihood, 68 | } -------------------------------------------------------------------------------- /human_scene_transformer/training_params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Training parameter class. 16 | 17 | The module contains a dataclasses used to configure the training procedure. 18 | """ 19 | 20 | import gin 21 | 22 | from human_scene_transformer import losses 23 | 24 | 25 | @gin.configurable 26 | class TrainingParams(object): 27 | """This object configures the model and training/eval.""" 28 | 29 | def __init__( 30 | self, 31 | batch_size=32, 32 | shuffle_buffer_size=10000, 33 | total_train_steps=5e6, 34 | warmup_steps=5e4, 35 | peak_learning_rate=1e-4, 36 | global_clipnorm=None, 37 | batches_per_train_step=2000, 38 | batches_per_eval_step=2000, 39 | eval_every_n_step=1e4, 40 | loss=losses.MultimodalPositionOrientationNLLLoss, 41 | ): 42 | """Initialize the TrainParam object. 43 | 44 | This is where gin injection occurs. 45 | 46 | Args: 47 | batch_size: The batch size of training and eval. On TPUs, each core will 48 | receive a batch size that equals batch_size/num_cores. 49 | shuffle_buffer_size: The size of the tf.dataset.shuffle() operation. 50 | total_train_steps: The total number of training steps (will automatically 51 | be converted to int). 52 | warmup_steps: The number of warmup steps during training. The learning 53 | rate will increase linearly to the peak at this step (from 1/10 of 54 | peak). Will automatically be converted to int. 55 | peak_learning_rate: The maximum learning rate during training. 56 | global_clipnorm: Gradient norm clipping value. 57 | batches_per_train_step: Number of batches per training step. This is 58 | required to improve the efficiency for Keras custom training loops. Will 59 | automatically be converted to int. 60 | batches_per_eval_step: Number of batches per eval step. 
The total number 61 | of eval examples will be batches_per_eval_step * batch_size. Will 62 | automatically be converted to int. 63 | eval_every_n_step: Number of training steps before evaluating using the 64 | eval set. Will automatically be converted to int. 65 | loss: A class used as loss. 66 | """ 67 | self.batch_size = batch_size 68 | self.shuffle_buffer_size = shuffle_buffer_size 69 | self.total_train_steps = int(total_train_steps) 70 | self.warmup_steps = int(warmup_steps) 71 | self.peak_learning_rate = peak_learning_rate 72 | self.global_clipnorm = global_clipnorm 73 | self.batches_per_train_step = int(batches_per_train_step) 74 | self.batches_per_eval_step = int(batches_per_eval_step) 75 | self.eval_every_n_step = int(eval_every_n_step) 76 | self.loss = loss 77 | -------------------------------------------------------------------------------- /human_scene_transformer/model/head.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains prediction head layers.""" 16 | 17 | import gin 18 | 19 | import tensorflow as tf 20 | 21 | 22 | @gin.register 23 | class PredictionHeadLayer(tf.keras.layers.Layer): 24 | """Converts transformer hidden vectors to model predictions.""" 25 | 26 | def __init__(self, hidden_units=None): 27 | super().__init__() 28 | 29 | self.dense_layers = [] 30 | # Add hidden layers. 31 | if hidden_units is not None: 32 | for units in hidden_units: 33 | self.dense_layers.append( 34 | tf.keras.layers.Dense(units, activation='relu')) 35 | self.dense_layers.append( 36 | tf.keras.layers.EinsumDense( 37 | '...h,hf->...f', 38 | output_shape=11, 39 | bias_axes='f', 40 | activation=None)) 41 | 42 | def call(self, input_batch): 43 | # [b, a, t, h] 44 | hidden_vecs = input_batch['hidden_vecs'] 45 | x = hidden_vecs 46 | # [b, a, t, 7] 47 | for layer in self.dense_layers: 48 | x = layer(x) 49 | pred = x 50 | predictions = { 51 | 'agents/position': pred[..., 0:3], 52 | 'agents/orientation': pred[..., 3:4], 53 | 'agents/position/raw_scale_tril': pred[..., 4:10], 54 | 'agents/orientation/raw_concentration': pred[..., 10:11], 55 | } 56 | if 'mixture_logits' in input_batch: 57 | predictions['mixture_logits'] = input_batch['mixture_logits'] 58 | return predictions 59 | 60 | 61 | @gin.register 62 | class Prediction2DPositionHeadLayer(tf.keras.layers.Layer): 63 | """Converts transformer hidden vectors to model predictions.""" 64 | 65 | def __init__(self, hidden_units=None, num_stages=5): 66 | super().__init__() 67 | 68 | self.dense_layers = [] 69 | # Add hidden layers. 70 | if hidden_units is not None: 71 | for units in hidden_units: 72 | self.dense_layers.append( 73 | tf.keras.layers.Dense(units, activation='relu')) 74 | # Add the final prediction head. 
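    # The 5 output features per agent/timestep/mode are the 2D position mean
    # (indices 0:2) and the 3 raw entries of the lower-triangular Cholesky
    # factor of the 2x2 position covariance (indices 2:5), which
    # output_distributions.to_positive_definite_scale_tril turns into a valid
    # scale_tril.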
75 | self.dense_layers.append( 76 | tf.keras.layers.EinsumDense( 77 | '...h,hf->...f', 78 | output_shape=5, 79 | bias_axes='f', 80 | activation=None)) 81 | 82 | def call(self, input_batch): 83 | # [b, a, t, h] 84 | hidden_vecs = input_batch['hidden_vecs'] 85 | x = hidden_vecs 86 | # [b, a, t, 5] 87 | for layer in self.dense_layers: 88 | x = layer(x) 89 | pred = x 90 | predictions = { 91 | 'agents/position': pred[..., 0:2], 92 | 'agents/position/raw_scale_tril': pred[..., 2:5], 93 | } 94 | if 'mixture_logits' in input_batch: 95 | predictions['mixture_logits'] = input_batch['mixture_logits'] 96 | return predictions 97 | -------------------------------------------------------------------------------- /human_scene_transformer/model/output_distributions.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Output distribution functions to further process the model's raw output.""" 16 | 17 | import tensorflow as tf 18 | import tensorflow_probability as tfp 19 | 20 | 21 | def force_positive(x, eps=1e-6): 22 | return tf.keras.activations.elu(x) + 1. + eps 23 | 24 | 25 | @tf.function 26 | def to_positive_definite_scale_tril(logit_sigma): 27 | tril = tfp.math.fill_triangular(logit_sigma) 28 | scale_tril = tf.linalg.set_diag( 29 | tril, 30 | force_positive(tf.linalg.diag_part(tril))) 31 | return scale_tril 32 | 33 | 34 | @tf.function 35 | def to_concentration(raw_concentration): 36 | return tf.math.reciprocal(force_positive(raw_concentration)) 37 | 38 | 39 | def get_position_distribution(model_output): 40 | """Multivariate Normal distribution over position.""" 41 | p_pos = tfp.distributions.MultivariateNormalTriL( 42 | loc=model_output['agents/position'], 43 | scale_tril=to_positive_definite_scale_tril( 44 | model_output['agents/position/raw_scale_tril'])) 45 | 46 | return p_pos 47 | 48 | 49 | def get_multimodal_position_distribution(model_output): 50 | """Multivariate Normal Mixture distribution over position.""" 51 | p_pos = get_position_distribution(model_output) 52 | 53 | p_pos_mm = tfp.distributions.MixtureSameFamily( 54 | mixture_distribution=tfp.distributions.Categorical( 55 | logits=model_output['mixture_logits']), 56 | components_distribution=p_pos) 57 | 58 | return p_pos_mm 59 | 60 | 61 | def get_orientation_distribution(model_output): 62 | """VonMises distribution over the yaw orientation. 63 | 64 | VonMises has a concentration input -> high values lead to unbounded log 65 | likelihoods. Thus we use the reciprocal of the elu to force vanishing 66 | gradients for high concentrations. 67 | Args: 68 | model_output: Raw model output dictionary. 69 | Required keys: agents/orientation, agents/orientation/raw_concentration 70 | Returns: 71 | VanMises distribution. 
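    Note: to_concentration above computes
    concentration = 1 / (elu(raw_concentration) + 1 + eps), so large raw
    values map to small concentrations.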
72 | """ 73 | # [b, a, t, n, 1] 74 | p_orientation = tfp.distributions.VonMises( 75 | loc=model_output['agents/orientation'][..., 0], 76 | concentration=to_concentration( 77 | model_output['agents/orientation/raw_concentration'][..., 0])) 78 | 79 | return p_orientation 80 | 81 | 82 | def get_multimodal_orientation_distribution(model_output): 83 | """VonMises distribution over position.""" 84 | p_orientation = get_orientation_distribution(model_output) 85 | 86 | p_orientation_mm = tfp.distributions.MixtureSameFamily( 87 | mixture_distribution=tfp.distributions.Categorical( 88 | logits=model_output['mixture_logits']), 89 | components_distribution=p_orientation) 90 | 91 | return p_orientation_mm 92 | -------------------------------------------------------------------------------- /human_scene_transformer/metrics/pos_nll.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Position Negative Log Likelihood Keras metric.""" 16 | 17 | import gin 18 | 19 | from human_scene_transformer.model import output_distributions 20 | 21 | import tensorflow as tf 22 | 23 | 24 | @gin.configurable 25 | class PositionNegativeLogLikelihood(tf.keras.metrics.Metric): 26 | """Position Negative Log Likelihood.""" 27 | 28 | def __init__(self, params, cutoff_seconds=None, at_cutoff=False, 29 | name='PosNLL'): 30 | """Initializes the PositionNegativeLogLikelihood metric. 31 | 32 | Args: 33 | params: ModelParams 34 | cutoff_seconds: Cutoff up to which time the metric should be calculated 35 | in seconds. 36 | at_cutoff: If True metric will be calculated at cutoff timestep. 37 | Otherwise metric is calculated as average up to cutoff_seconds. 38 | name: Metric name. 39 | """ 40 | super().__init__(name=name) 41 | self.agents_position_key = params.agents_position_key 42 | self.cutoff_seconds = cutoff_seconds 43 | self.at_cutoff = at_cutoff 44 | if cutoff_seconds is None: 45 | self.cutoff_idx = None 46 | else: 47 | # +1 due to current time step. 48 | self.cutoff_idx = int( 49 | params.num_history_steps + 50 | cutoff_seconds / params.timestep) + 1 51 | 52 | self.num_predictions = self.add_weight( 53 | name='num_predictions', initializer='zeros') 54 | self.total_deviation = self.add_weight( 55 | name='total_deviation', initializer='zeros') 56 | 57 | def update_state(self, input_batch, predictions): 58 | should_predict = tf.cast(input_batch['should_predict'], tf.float32) 59 | 60 | p_pos = output_distributions.get_multimodal_position_distribution( 61 | predictions) 62 | 63 | target = input_batch[f'{self.agents_position_key}/target'] 64 | target = target[..., :p_pos.event_shape_tensor()[0]] 65 | 66 | # [b, a, t, n, 1] 67 | per_position_nll = -p_pos.log_prob(target)[..., tf.newaxis] 68 | 69 | # Non-observed or past should not contribute to metric. 
70 | nll = tf.math.multiply_no_nan(per_position_nll, should_predict) 71 | # Chop off the un-wanted time part. 72 | # [b, a, cutoff_idx, 1] 73 | if self.at_cutoff and self.cutoff_seconds is not None: 74 | nll = nll[:, :, self.cutoff_idx-1:self.cutoff_idx, :] 75 | num_predictions = tf.reduce_sum( 76 | should_predict[:, :, self.cutoff_idx-1::self.cutoff_idx, :]) 77 | else: 78 | nll = nll[:, :, :self.cutoff_idx, :] 79 | num_predictions = tf.reduce_sum(should_predict[:, :, :self.cutoff_idx, :]) 80 | 81 | # [1] 82 | nll = tf.reduce_sum(nll) 83 | 84 | self.num_predictions.assign_add(num_predictions) 85 | self.total_deviation.assign_add(nll) 86 | 87 | def result(self): 88 | return self.total_deviation / self.num_predictions 89 | 90 | def reset_states(self): 91 | self.num_predictions.assign(0) 92 | self.total_deviation.assign(0.0) 93 | -------------------------------------------------------------------------------- /human_scene_transformer/data/extract_robot_odometry_from_rosbag.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Extracts robot odometry from JRDB raw rosbags.""" 16 | 17 | 18 | import collections 19 | import glob 20 | import json 21 | import os 22 | 23 | from typing import Sequence 24 | from absl import app 25 | from absl import flags 26 | from absl import logging 27 | 28 | import pandas as pd 29 | import rosbag 30 | 31 | 32 | _INPUT_PATH = flags.DEFINE_string( 33 | 'input_path', 34 | None, 35 | 'Input path to folder with rosbags.', 36 | ) 37 | 38 | _TIMESTAMPS_PATH = flags.DEFINE_string( 39 | 'timestamps_path', 40 | None, 41 | 'Input path to folder with timestamps.', 42 | ) 43 | 44 | _OUTPUT_PATH = flags.DEFINE_string( 45 | 'output_path', 46 | None, 47 | 'Output path', 48 | ) 49 | 50 | 51 | def extract_odometry(bag_file): 52 | """Extracts robot odometry from JRDB raw rosbags.""" 53 | bag = rosbag.Bag(bag_file) 54 | 55 | data_list = list() 56 | for _, msg, t in bag.read_messages(topics=['segway/feedback/wheel_odometry']): 57 | data_point = dict() 58 | data_point['timestamp'] = t.to_sec() 59 | data_point['pose'] = dict() 60 | data_point['pose']['position'] = collections.OrderedDict() 61 | data_point['pose']['orientation'] = collections.OrderedDict() 62 | 63 | data_point['pose']['position']['x'] = msg.pose.pose.position.x 64 | data_point['pose']['position']['y'] = msg.pose.pose.position.y 65 | data_point['pose']['position']['z'] = msg.pose.pose.position.z 66 | 67 | data_point['pose']['orientation']['x'] = msg.pose.pose.orientation.x 68 | data_point['pose']['orientation']['y'] = msg.pose.pose.orientation.y 69 | data_point['pose']['orientation']['z'] = msg.pose.pose.orientation.z 70 | data_point['pose']['orientation']['w'] = msg.pose.pose.orientation.w 71 | data_list.append(data_point) 72 | 73 | return data_list 74 | 75 | 76 | def fix_timestamps(data_list, timestamp_file): 77 | """Fix timestamps with 
expected format by JRDB dataset.""" 78 | with open(timestamp_file) as f: 79 | timestamps = json.load(f)['data'] 80 | 81 | df = pd.DataFrame(data_list).set_index('timestamp') 82 | 83 | data_dict = collections.OrderedDict() 84 | for timestamp in timestamps: 85 | ts_id = timestamp['pointclouds'][0]['url'].split('/')[-1] 86 | ts = timestamp['pointclouds'][0]['timestamp'] 87 | data_dict[ts_id] = df.iloc[df.index.get_indexer([ts], method='nearest')[0]][ 88 | 'pose' 89 | ] 90 | 91 | return data_dict 92 | 93 | 94 | def main(argv: Sequence[str]) -> None: 95 | if len(argv) > 1: 96 | raise app.UsageError('Too many command-line arguments.') 97 | 98 | bag_files = glob.glob('*.bag', root_dir=_INPUT_PATH.value) 99 | for bag_file in bag_files[1:2]: 100 | data_list = extract_odometry(os.path.join(_INPUT_PATH.value, bag_file)) 101 | fixed_data_list = fix_timestamps( 102 | data_list, 103 | os.path.join(_TIMESTAMPS_PATH.value, bag_file[:-4], 'frames_pc.json'), 104 | ) 105 | data_dict = {'odometry': fixed_data_list} 106 | with open( 107 | os.path.join(_OUTPUT_PATH.value, bag_file.replace('bag', 'json')), 'w' 108 | ) as f: 109 | json.dump(data_dict, f, indent=2) 110 | 111 | if __name__ == '__main__': 112 | logging.set_verbosity(logging.ERROR) 113 | app.run(main) 114 | -------------------------------------------------------------------------------- /human_scene_transformer/is_hidden_generators.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """is_hidden generator classes for deciding HST operation mode. 16 | 17 | Currently includes BP (predicting all agents), Conditional-BP (predicting 18 | agents conditioned on the 0th agent, usually the ego-robot). 19 | """ 20 | 21 | import gin 22 | import numpy as np 23 | 24 | 25 | @gin.configurable 26 | class IsHiddenGenerator(object): 27 | """Base class generator for the is_hidden tensor.""" 28 | 29 | def __init__(self, num_steps, num_history_steps): 30 | self._num_steps = num_steps 31 | self._num_history_steps = num_history_steps 32 | 33 | def __call__(self, num_agents: int, train_progress: float = 0.0): 34 | """Returns the is_hidden tensor. 35 | 36 | Args: 37 | num_agents: Number of agents in the scene. 38 | train_progress: A float between 0 to 1 representing the overall progress 39 | of training. This float can be current_step / total_training_steps. This 40 | float can be used for training w/ an annealing schedule. 41 | """ 42 | raise NotImplementedError('Calling the base class is prohibited.') 43 | 44 | 45 | @gin.configurable 46 | class BPIsHiddenGenerator(IsHiddenGenerator): 47 | """is_hidden generator for BP. All agents futures are hidden.""" 48 | 49 | def __call__(self, num_agents: int, train_progress: float = 0.0): 50 | """Returns the is_hidden tensor for behavior prediction. 51 | 52 | Always returns 0 (not hidden) for history/current steps and 1 (hidden) 53 | for future steps. 
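
    For example, with the JRDB challenge configuration above (num_steps=24,
    num_history_steps=11) the returned array has shape [1, num_agents, 24, 1]:
    timesteps 0..11 are False (observed history plus the current step) and
    timesteps 12..23 are True (future steps to be predicted).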
54 | 55 | Args: 56 | num_agents: Number of agents in the scene. 57 | train_progress: A float between 0 to 1 representing the overall progress 58 | of training. This float can be current_step / total_training_steps. This 59 | float can be used for training w/ an annealing schedule. 60 | 61 | Returns: 62 | is_hidden: The is_hidden tensor for behavior prediction. 63 | """ 64 | 65 | # [1, a, t, 1]. 66 | is_hidden = np.ones([1, num_agents, self._num_steps, 1], 67 | dtype=bool) 68 | is_hidden[:, :, :self._num_history_steps + 1, :] = False 69 | return is_hidden 70 | 71 | 72 | @gin.configurable 73 | class CBPIsHiddenGenerator(IsHiddenGenerator): 74 | """is_hidden generator for Conditional-BP.""" 75 | 76 | def __call__(self, num_agents: int, train_progress: float = 0.0): 77 | """Returns the is_hidden tensor for conditional behavior prediction. 78 | 79 | Always returns 0 (not hidden) for history/current steps and 1 (hidden) for 80 | future steps except for the 0th agent, which is all 0. This is often used 81 | along with ModelParams.robot_as_0th_agent=True so that the model can predict 82 | how other agents move conditioned on a given robot trajectory. 83 | 84 | Args: 85 | num_agents: Number of agents in the scene. 86 | train_progress: A float between 0 to 1 representing the overall progress 87 | of training. This float can be current_step / total_training_steps. This 88 | float can be used for training w/ an annealing schedule. 89 | 90 | Returns: 91 | is_hidden: The is_hidden tensor for conditional behavior prediction. 92 | """ 93 | 94 | # [1, a, t, 1]. 95 | is_hidden = np.ones([1, num_agents, self._num_steps, 1], 96 | dtype=bool) 97 | is_hidden[:, :, :self._num_history_steps + 1, :] = False 98 | is_hidden[:, 0, :, :] = False 99 | return is_hidden 100 | -------------------------------------------------------------------------------- /human_scene_transformer/pedestrians/eval.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Evaluates Model on Pedestrian dataset.""" 16 | 17 | import os 18 | from typing import Sequence 19 | from absl import app 20 | from absl import flags 21 | from absl import logging 22 | import gin 23 | from human_scene_transformer.model import model as hst_model 24 | from human_scene_transformer.model import model_params 25 | from human_scene_transformer.pedestrians import dataset_params 26 | from human_scene_transformer.pedestrians import input_fn 27 | import numpy as np 28 | import tensorflow as tf 29 | import tqdm 30 | 31 | 32 | _MODEL_PATH = flags.DEFINE_string( 33 | 'model_path', 34 | None, 35 | 'Path to model directory.', 36 | ) 37 | 38 | _CKPT_PATH = flags.DEFINE_string( 39 | 'checkpoint_path', 40 | None, 41 | 'Path to model checkpoint.', 42 | ) 43 | 44 | 45 | def min_ade(pred, gt): 46 | dist = np.linalg.norm(pred - gt[..., tf.newaxis, :], axis=-1) 47 | ade = np.mean(dist, axis=0) 48 | minade = np.min(ade, axis=-1) 49 | return minade 50 | 51 | 52 | def min_fde(pred, gt): 53 | fde = np.linalg.norm(pred[-1] - gt[-1, ..., tf.newaxis, :], axis=-1) 54 | minfde = np.min(fde, axis=-1) 55 | return minfde 56 | 57 | 58 | def evaluation(checkpoint_path): 59 | """Evaluates Model on Pedestrian dataset.""" 60 | d_params = dataset_params.PedestriansDatasetParams( 61 | num_agents=None, 62 | eval_config='test' 63 | ) 64 | 65 | dataset = input_fn.load_dataset( 66 | d_params, 67 | split='test', 68 | augment=False, 69 | shuffle=False, 70 | repeat=False, 71 | deterministic=True, 72 | ) 73 | 74 | model_p = model_params.ModelParams() 75 | 76 | model = hst_model.HumanTrajectorySceneTransformer(model_p) 77 | 78 | _, _ = model(next(iter(dataset.batch(1))), training=False) 79 | 80 | checkpoint_mngr = tf.train.Checkpoint(model=model) 81 | checkpoint_mngr.restore(checkpoint_path).assert_existing_objects_matched() 82 | logging.info('Restored checkpoint: %s', checkpoint_path) 83 | 84 | min_ades = [] 85 | min_fdes = [] 86 | for input_batch in tqdm.tqdm(dataset.batch(1)): 87 | if input_batch['agents/position'].shape[1] == 0: 88 | continue 89 | target = tf.identity(input_batch['agents/position']) 90 | target_np = input_batch['agents/position'].numpy() 91 | target_np[:, :, 8:] = np.nan 92 | input_batch['agents/position'] = tf.convert_to_tensor(target_np) 93 | full_pred, _ = model(input_batch, training=False) 94 | samples = full_pred['agents/position'] 95 | for a in range(input_batch['agents/position'].shape[1]): 96 | if not tf.reduce_any(tf.math.is_nan(target[0, a])): 97 | min_ades.append(min_ade(samples[0, a, 8:], target[0, a, 8:])) 98 | min_fdes.append(min_fde(samples[0, a, 8:], target[0, a, 8:])) 99 | 100 | print(f'MinADE: {np.nanmean(min_ades):.2f}') 101 | print(f'MinFDE: {np.nanmean(min_fdes):.2f}') 102 | 103 | 104 | def main(argv: Sequence[str]) -> None: 105 | if len(argv) > 1: 106 | raise app.UsageError('Too many command-line arguments.') 107 | 108 | gin.parse_config_files_and_bindings( 109 | [os.path.join(_MODEL_PATH.value, 'params', 'operative_config.gin')], 110 | None, 111 | skip_unknown=True) 112 | print('Actual gin config used:') 113 | print(gin.config_str()) 114 | 115 | evaluation(_CKPT_PATH.value) 116 | 117 | 118 | if __name__ == '__main__': 119 | logging.set_verbosity(logging.ERROR) 120 | app.run(main) 121 | -------------------------------------------------------------------------------- /human_scene_transformer/config/jrdb_challenge/dataset_params.gin: -------------------------------------------------------------------------------- 1 | TRAIN_SCENES = ['bytes-cafe-2019-02-07_0', 2 | 
'clark-center-2019-02-28_0', 3 | 'clark-center-intersection-2019-02-28_0', 4 | 'cubberly-auditorium-2019-04-22_0', 5 | 'gates-159-group-meeting-2019-04-03_0', 6 | 'gates-ai-lab-2019-02-08_0', 7 | 'gates-to-clark-2019-02-28_1', 8 | 'hewlett-packard-intersection-2019-01-24_0', 9 | 'huang-basement-2019-01-25_0', 10 | 'huang-lane-2019-02-12_0', 11 | 'memorial-court-2019-03-16_0', 12 | 'meyer-green-2019-03-16_0', 13 | 'packard-poster-session-2019-03-20_0', 14 | 'packard-poster-session-2019-03-20_1', 15 | 'stlc-111-2019-04-19_0', 16 | 'svl-meeting-gates-2-2019-04-08_0', 17 | 'tressider-2019-03-16_0', 18 | 'tressider-2019-03-16_1', 19 | 'cubberly-auditorium-2019-04-22_1_test', 20 | 'discovery-walk-2019-02-28_0_test', 21 | 'food-trucks-2019-02-12_0_test', 22 | 'gates-ai-lab-2019-04-17_0_test', 23 | 'gates-foyer-2019-01-17_0_test', 24 | 'gates-to-clark-2019-02-28_0_test', 25 | 'hewlett-class-2019-01-23_1_test', 26 | 'huang-2-2019-01-25_1_test', 27 | 'indoor-coupa-cafe-2019-02-06_0_test', 28 | 'lomita-serra-intersection-2019-01-30_0_test', 29 | 'nvidia-aud-2019-01-25_0_test', 30 | 'nvidia-aud-2019-04-18_1_test', 31 | 'outdoor-coupa-cafe-2019-02-06_0_test', 32 | 'quarry-road-2019-02-28_0_test', 33 | 'stlc-111-2019-04-19_1_test', 34 | 'stlc-111-2019-04-19_2_test', 35 | 'tressider-2019-04-26_0_test', 36 | 'tressider-2019-04-26_1_test', 37 | 'clark-center-2019-02-28_1', 38 | 'forbes-cafe-2019-01-22_0', 39 | 'gates-basement-elevators-2019-01-17_1', 40 | 'huang-2-2019-01-25_0', 41 | 'jordan-hall-2019-04-22_0', 42 | 'nvidia-aud-2019-04-18_0', 43 | 'packard-poster-session-2019-03-20_2', 44 | 'svl-meeting-gates-2-2019-04-08_1', 45 | 'tressider-2019-04-26_2', 46 | 'discovery-walk-2019-02-28_1_test', 47 | 'gates-basement-elevators-2019-01-17_0_test', 48 | 'hewlett-class-2019-01-23_0_test', 49 | 'huang-intersection-2019-01-22_0_test', 50 | 'meyer-green-2019-03-16_1_test', 51 | 'nvidia-aud-2019-04-18_2_test', 52 | 'serra-street-2019-01-30_0_test', 53 | 'tressider-2019-03-16_2_test', 54 | 'tressider-2019-04-26_3_test'] 55 | 56 | TEST_SCENES = [ 57 | 'cubberly-auditorium-2019-04-22_1_test', 58 | 'discovery-walk-2019-02-28_0_test', 59 | 'discovery-walk-2019-02-28_1_test', 60 | 'food-trucks-2019-02-12_0_test', 61 | 'gates-ai-lab-2019-04-17_0_test', 62 | 'gates-basement-elevators-2019-01-17_0_test', 63 | 'gates-foyer-2019-01-17_0_test', 64 | 'gates-to-clark-2019-02-28_0_test', 65 | 'hewlett-class-2019-01-23_0_test', 66 | 'hewlett-class-2019-01-23_1_test', 67 | 'huang-2-2019-01-25_1_test', 68 | 'huang-intersection-2019-01-22_0_test', 69 | 'indoor-coupa-cafe-2019-02-06_0_test', 70 | 'lomita-serra-intersection-2019-01-30_0_test', 71 | 'meyer-green-2019-03-16_1_test', 72 | 'nvidia-aud-2019-01-25_0_test', 73 | 'nvidia-aud-2019-04-18_1_test', 74 | 'nvidia-aud-2019-04-18_2_test', 75 | 'outdoor-coupa-cafe-2019-02-06_0_test', 76 | 'quarry-road-2019-02-28_0_test', 77 | 'serra-street-2019-01-30_0_test', 78 | 'stlc-111-2019-04-19_1_test', 79 | 'stlc-111-2019-04-19_2_test', 80 | 'tressider-2019-03-16_2_test', 81 | 'tressider-2019-04-26_0_test', 82 | 'tressider-2019-04-26_1_test', 83 | 'tressider-2019-04-26_3_test', 84 | ] 85 | 86 | 87 | JRDBDatasetParams.path = 88 | 89 | JRDBDatasetParams.train_scenes = %TRAIN_SCENES 90 | JRDBDatasetParams.eval_scenes = %TEST_SCENES 91 | JRDBDatasetParams.features = [ 92 | 'agents/position', 93 | 'agents/keypoints', 94 | 'robot/position', 95 | 'robot/orientation', 96 | 'scene/pc' 97 | ] 98 | 99 | JRDBDatasetParams.train_split = (0., 1.0) 100 | JRDBDatasetParams.eval_split = (0., 1.0) 101 | 102 | 
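# With the values below, each example spans 24 steps of 0.4 s: 11 history
# steps, 1 current step, and 12 future steps, i.e. a 4.8 s prediction horizon.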
103 | JRDBDatasetParams.num_history_steps = 11 104 | JRDBDatasetParams.num_steps = 24 105 | JRDBDatasetParams.num_agents = 16 106 | JRDBDatasetParams.timestep = 0.4 107 | 108 | JRDBDatasetParams.subsample = 6 109 | JRDBDatasetParams.num_pointcloud_points = 512 110 | 111 | JRDBDatasetParams.min_distance_to_robot = 50.0 -------------------------------------------------------------------------------- /human_scene_transformer/data/README.md: -------------------------------------------------------------------------------- 1 | *This is not an officially supported Google product.* 2 | 3 | # JRDB for Prediction Dataset Setup 4 | 5 | ## Get the JRDB Dataset 6 | 7 | 1. Go to https://jrdb.erc.monash.edu/#downloads 8 | 2. Create a User or login. 9 | 3. Download and extract [JRDB 2022 Full Train Dataset](https://jrdb.erc.monash.edu/static/downloads/JRDB2022/train_dataset_with_activity/train_dataset_with_activity.zip) to `/train_dataset`. 10 | 4. Download and extract [JRDB 2022 Full Test Dataset](https://jrdb.erc.monash.edu/static/downloads/JRDB2022/test_dataset_without_labels/jrdb22_test.zip) to `/test_dataset`. 11 | 5. Download and extract [Train Detections](https://jrdb.erc.monash.edu/static/downloads/train_detections.zip) from the JRDB 2019 section to `/detections`. 12 | 13 | ## Get the Leaderboard Test Set Tracks 14 | 15 | ### For the JRDB Challenge Dataset 16 | Download and extract this leaderboard [3D tracking result](https://jrdb.erc.monash.edu/leaderboards/download/1762) to `/test_dataset/labels/PiFeNet/`. Such that you have `/test_dataset/labels/PiFeNet/00XX.txt`. 17 | 18 | ### For the Orginal Dataset used in the Paper 19 | Download and extract this leaderboard [3D tracking result](https://jrdb.erc.monash.edu/leaderboards/download/1605) to `/test_dataset/labels/ss3d_mot/`. Such that you have `/test_dataset/labels/ss3d_mot/00XX.txt`. This was the best available leaderboard tracker at the time the method was developed. 20 | 21 | ## Get the Robot Odometry 22 | 23 | Download the compressed Odometry data file [here](https://storage.googleapis.com/gresearch/human_scene_transformer/odometry.zip). 24 | 25 | Extract the files and move them to `/processed/` such that you have `/processed/odoemtry/train`, `/processed/odoemtry/test`. 26 | 27 | Alternatively you can extract the robot odometry from the raw rosbags yourself via `extract_robot_odometry_from_rosbag.py`. 28 | 29 | ## Get the Preprocessed Keypoints 30 | 31 | Download the compressed Keypoints data file [here](https://storage.googleapis.com/gresearch/human_scene_transformer/keypoints.zip). 32 | 33 | Extract the files and move them to `/processed/` such that you have `/processed/labels/labels_3d_keypoints/train/`, `/processed/labels/labels_3d_keypoints/test/`. 
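If you take the alternative route above and extract the robot odometry from the raw rosbags yourself, the invocation below is a sketch based on the flags defined in `extract_robot_odometry_from_rosbag.py`; it expects a ROS Python environment with the `rosbag` package installed, and it looks for `<timestamps_folder>/<sequence_name>/frames_pc.json` for each bag. The placeholder folder names are assumptions, so adapt them to your local layout:

```python extract_robot_odometry_from_rosbag.py --input_path=<raw_rosbag_folder> --timestamps_path=<timestamps_folder> --output_path=<odometry_output_folder>```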
34 | 35 | ## Create Real-World Tracks for Train Data 36 | 37 | Run 38 | 39 | ```python jrdb_train_detections_to_tracks.py --input_path=``` 40 | 41 | ## Dataset Folder 42 | 43 | You should end up with a dataset folder of the following structure 44 | 45 | ``` 46 | - 47 | - train_dataset 48 | - calibration 49 | - detections 50 | - images 51 | - labels 52 | - pointclouds 53 | - test_dataset 54 | - calibration 55 | - images 56 | - labels 57 | - pointclouds 58 | - processed 59 | - labels 60 | - labels_3d_keypoints 61 | - train 62 | - test 63 | - labels_detections_3d 64 | - odoemtry 65 | - train 66 | - test 67 | ``` 68 | 69 | ## Generate the Tensorflow Dataset 70 | ### For the JRDB Challenge Dataset 71 | ```python jrdb_preprocess_train.py --input_path= --output_path= --max_distance_to_robot=50.0``` 72 | 73 | ```python jrdb_preprocess_test.py --input_path= --output_path= --max_distance_to_robot=50.0 --tracking_method=PiFeNet --tracking_confidence_threshold=0.01``` 74 | 75 | Please note that this can take multiple hours due to the processing of the scene's 76 | pointclouds. If you do not need the pointclouds you can speed up the processing 77 | by passing `--process_pointclouds=False` for both. 78 | 79 | ### For the Orginal Dataset used in the Paper 80 | ```python jrdb_preprocess_train.py --input_path= --output_path= --max_distance_to_robot=15.0``` 81 | 82 | ```python jrdb_preprocess_test.py --input_path= --output_path= --max_distance_to_robot=15.0 --tracking_method=ss3d_mot``` 83 | 84 | Please note that this can take multiple hours due to the processing of the scene's 85 | pointclouds. If you do not need the pointclouds you can speed up the processing 86 | by passing `--process_pointclouds=False` for both. -------------------------------------------------------------------------------- /human_scene_transformer/data/math/math_types.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Pytype definitions and conversion utilities. 16 | 17 | (python3). 18 | 19 | Utility functions: 20 | is_scalar - Distinguishes between scalar and vector values. 21 | get_matching_arrays - Gets two arrays with the same shape from scalar or 22 | matrix arguments. 23 | """ 24 | 25 | from collections import abc 26 | from typing import Iterable, Text, Tuple, Union 27 | 28 | import numpy as np 29 | 30 | # --------------------------------------------------------------------------- 31 | # PyType Definitions 32 | # 33 | # A list or vector of floating point values. 
34 | ScalarType = Union[float, int] 35 | Float2Type = Tuple[float, float] 36 | Float3Type = Tuple[float, float, float] 37 | Float4Type = Tuple[float, float, float, float] 38 | Vector2Type = Union[np.ndarray, Iterable[float], Float2Type] 39 | Vector3Type = Union[np.ndarray, Iterable[float], Float3Type] 40 | Vector4Type = Union[np.ndarray, Iterable[float], Float4Type] 41 | VectorType = Union[np.ndarray, Iterable[float], Float2Type, Float3Type, 42 | Float4Type] 43 | # A scalar or a vector of floating point values. 44 | VectorOrValueType = Union[VectorType, float] 45 | Vector3OrValueType = Union[Vector3Type, float] 46 | Vector4OrValueType = Union[Vector4Type, float] 47 | 48 | # Error messages for exceptions. 49 | SHAPE_MISMATCH_MESSAGE = 'lhs and rhs should have the same dimension' 50 | DIMENSION_INVALID_MESSAGE = 'Dimension is undefined or invalid' 51 | DIMENSION_MISMATCH_MESSAGE = 'Dimension is defined, but does not match inputs' 52 | 53 | # --------------------------------------------------------------------------- 54 | # Global Constants 55 | # 56 | # Default values for rtol and atol arguments to np.isclose are taken from 57 | # third_party/py/numpy/core/numeric.py 58 | DEFAULT_RTOL_VALUE_FOR_NP_IS_CLOSE = 1e-5 # Relative tolerance 59 | DEFAULT_ATOL_VALUE_FOR_NP_IS_CLOSE = 1e-8 # Absolute tolerance 60 | 61 | 62 | def is_scalar(scalar_or_array: VectorOrValueType) -> bool: 63 | """Returns true if the argument has a scalar type. 64 | 65 | Args: 66 | scalar_or_array: A scalar floating point value or an array of values. 67 | 68 | Returns: 69 | True if the value is a scalar (single value). 70 | """ 71 | return not isinstance(scalar_or_array, abc.Sized) 72 | 73 | 74 | def get_matching_arrays(lhs: VectorOrValueType, 75 | rhs: VectorOrValueType, 76 | err_msg: Text = '') -> Tuple[np.ndarray, np.ndarray]: 77 | """Converts both inputs to numpy arrays with the same shape. 78 | 79 | If either input is a scalar, it will be converted to a constant array with 80 | the same shape as the other input. If both inputs are scalars, they will 81 | both be converted to arrays containing a single value. 82 | 83 | If they are not scalars, the inputs must have the same shape. 84 | 85 | If they are both scalars, the inputs will be converted to arrays with a 86 | single element. 87 | 88 | Args: 89 | lhs: Left hand side scalar or array argument. 90 | rhs: Right hand side scalar or array argument. 91 | err_msg: String error message added to exception in case of failure. 92 | 93 | Returns: 94 | lhs, rhs: The arguments converted to numpy arrays. 95 | 96 | Raises: 97 | ValueError: If the lhs and rhs arrays not have the same shape. 98 | """ 99 | if is_scalar(rhs): 100 | lhs = np.asarray(lhs) 101 | rhs = np.full_like(lhs, rhs) 102 | else: 103 | rhs = np.asarray(rhs) 104 | if is_scalar(lhs): 105 | lhs = np.full_like(rhs, lhs) 106 | else: 107 | lhs = np.asarray(lhs) 108 | if lhs.shape != rhs.shape: 109 | raise ValueError( 110 | 'lhs and rhs should have the same dimension: %s != %s; %s' % 111 | (lhs.shape, rhs.shape, err_msg)) 112 | return lhs, rhs 113 | -------------------------------------------------------------------------------- /human_scene_transformer/metrics/mean_angle_error.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """The Mean Angle Error Keras Metric class.""" 16 | 17 | import gin 18 | import numpy as np 19 | import tensorflow as tf 20 | 21 | 22 | @gin.configurable 23 | class MeanAngleError(tf.keras.metrics.Metric): 24 | """Mean Angle Error over yaw angle. 25 | 26 | Calculates the mean angular distance over all predicted timesteps. 27 | """ 28 | 29 | def __init__(self, params, cutoff_seconds=None, at_cutoff=False, 30 | name='AngleError'): 31 | """Initializes the MeanAngleError metric. 32 | 33 | Args: 34 | params: ModelParams 35 | cutoff_seconds: Cutoff up to which time the metric should be calculated 36 | in seconds. 37 | at_cutoff: If True metric will be calculated at cutoff timestep. 38 | Otherwise metric is calculated as average up to cutoff_seconds. 39 | name: Metric name. 40 | """ 41 | super().__init__(name=name) 42 | self.agents_orientation_key = params.agents_orientation_key 43 | self.cutoff_seconds = cutoff_seconds 44 | self.at_cutoff = at_cutoff 45 | if cutoff_seconds is None: 46 | self.cutoff_idx = None 47 | else: 48 | # +1 due to current time step. 49 | self.cutoff_idx = int( 50 | params.num_history_steps + 51 | cutoff_seconds / params.timestep) + 1 52 | 53 | self.num_predictions = self.add_weight( 54 | name='num_predictions', initializer='zeros') 55 | self.total_deviation = self.add_weight( 56 | name='total_deviation', initializer='zeros') 57 | 58 | def _reduce(self, mae, input_batch, predictions): 59 | return mae[..., 0, :] 60 | 61 | def update_state(self, input_batch, predictions): 62 | should_predict = tf.cast(input_batch['should_predict'], tf.float32) 63 | # [b, a, t, n, 1] 64 | diff = ( 65 | input_batch[f'{self.agents_orientation_key}/target'][..., tf.newaxis, :] 66 | - predictions['agents/orientation']) 67 | deviation = tf.abs(tf.math.mod((diff + np.pi), (2 * np.pi)) - np.pi) 68 | 69 | deviation = self._reduce(deviation, input_batch, predictions) 70 | 71 | # Non-observed or past should not contribute to ade. 72 | deviation = tf.math.multiply_no_nan(deviation, should_predict) 73 | # Chop off the un-wanted time part. 
74 | # [b, a, cutoff_idx, 1] 75 | if self.at_cutoff and self.cutoff_seconds is not None: 76 | deviation = deviation[:, :, self.cutoff_idx-1:self.cutoff_idx, :] 77 | num_predictions = tf.reduce_sum( 78 | should_predict[:, :, self.cutoff_idx-1::self.cutoff_idx, :]) 79 | else: 80 | deviation = deviation[:, :, :self.cutoff_idx, :] 81 | num_predictions = tf.reduce_sum(should_predict[:, :, :self.cutoff_idx, :]) 82 | # [1] 83 | deviation = tf.reduce_sum(deviation) 84 | 85 | self.num_predictions.assign_add(num_predictions) 86 | self.total_deviation.assign_add(deviation) 87 | 88 | def result(self): 89 | return self.total_deviation / self.num_predictions 90 | 91 | def reset_states(self): 92 | self.num_predictions.assign(0) 93 | self.total_deviation.assign(0.0) 94 | 95 | 96 | @gin.configurable 97 | class MinMeanAngleError(MeanAngleError): 98 | """Takes the minimum over all modes.""" 99 | 100 | def _reduce(self, mae, input_batch, predictions): 101 | return tf.reduce_min(mae, axis=-2) 102 | 103 | 104 | @gin.configurable 105 | class MLMeanAngleError(MeanAngleError): 106 | """Takes the maximum likelihood mode.""" 107 | 108 | def _reduce(self, mae, input_batch, predictions): 109 | # Get index of mixture component with highest probability 110 | ml_indices = tf.math.argmax(predictions['mixture_logits'], axis=-1) 111 | a = mae.shape[1] 112 | t = mae.shape[2] 113 | ml_indices = tf.tile(ml_indices, [1, a, t])[..., tf.newaxis] 114 | 115 | return tf.gather(mae, indices=ml_indices, batch_dims=3, axis=-2)[..., 0, :] 116 | 117 | -------------------------------------------------------------------------------- /human_scene_transformer/model/multimodality_induction.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains muldimodality induction layers.""" 16 | 17 | import tensorflow as tf 18 | 19 | 20 | class MultimodalityInduction(tf.keras.layers.Layer): 21 | """Enables the model to forward and predict multi-mode predictions. 22 | 23 | 1) Features are broadcasted to number of modes and summed with learned mode 24 | tensors. 25 | 2) Mixture Weights are generated by cross-attention over all dimensions 26 | between learned mode tensors and hidden tensors. 
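  For example (shapes follow the call() implementation below, using the
  default hidden_size=256 and num_modes=5): an incoming 'hidden_vecs' of
  shape [b, a, t, 256] is expanded to [b, a, t, 1, 256] and summed with the
  learned mode tensor of shape [1, 1, 1, 5, 256], giving per-mode features
  of shape [b, a, t, 5, 256], while the mixture-weight branch emits
  'mixture_logits' of shape [b, 1, 1, 5].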
27 | """ 28 | 29 | def __init__(self, 30 | num_modes=5, 31 | num_heads=8, 32 | hidden_size=256, 33 | drop_prob=0.1, 34 | ln_eps=1e-6, 35 | ff_dim=128): 36 | super().__init__() 37 | self.num_modes = num_modes 38 | self.hidden_size = hidden_size 39 | if hidden_size % num_heads != 0: 40 | raise ValueError(f'hidden_size ({hidden_size}) must be an integer ' 41 | f'times bigger than num_heads ({num_heads}).') 42 | self.mm_attn_layer = tf.keras.layers.MultiHeadAttention( 43 | num_heads=num_heads, 44 | key_dim=hidden_size // num_heads, 45 | attention_axes=3) 46 | self.mm_attn_ln = tf.keras.layers.LayerNormalization(epsilon=ln_eps) 47 | self.mm_ff_layer1 = tf.keras.layers.EinsumDense( 48 | '...h,hf->...f', output_shape=ff_dim, bias_axes='f', activation='relu') 49 | self.mm_ff_layer2 = tf.keras.layers.EinsumDense( 50 | '...f,fh->...h', 51 | output_shape=hidden_size, 52 | bias_axes='h', 53 | activation=None) 54 | self.mm_ff_ln = tf.keras.layers.LayerNormalization(epsilon=ln_eps) 55 | 56 | self.mw_attn_layer = tf.keras.layers.MultiHeadAttention( 57 | num_heads=num_heads, 58 | key_dim=hidden_size // num_heads, 59 | attention_axes=None) 60 | self.mw_attn_ln = tf.keras.layers.LayerNormalization(epsilon=ln_eps) 61 | self.mw_ff_layer1 = tf.keras.layers.EinsumDense( 62 | '...h,hf->...f', output_shape=ff_dim, bias_axes='f', activation='relu') 63 | self.mw_ff_layer2 = tf.keras.layers.EinsumDense( 64 | '...f,fh->...h', 65 | output_shape=1, # Single logit per mode 66 | bias_axes='h', 67 | activation=None) 68 | self.mw_ff_ln = tf.keras.layers.LayerNormalization(epsilon=ln_eps) 69 | 70 | self.attn_dropout = tf.keras.layers.Dropout(drop_prob) 71 | self.ff_dropout = tf.keras.layers.Dropout(drop_prob) 72 | 73 | # [1, 1, 1, h] 74 | self.learned_add_mm = tf.Variable( 75 | tf.random_uniform_initializer( 76 | minval=-1., 77 | maxval=1.)(shape=[1, 1, 1, self.num_modes, hidden_size]), 78 | trainable=True, 79 | dtype=tf.float32) 80 | 81 | def call(self, input_batch, training=None): 82 | input_batch = input_batch.copy() 83 | 84 | # [b, a, t, 1, h] 85 | hidden_vecs = input_batch['hidden_vecs'][..., tf.newaxis, :] 86 | 87 | # Multi Modes 88 | mm_add = self.mm_attn_ln(self.learned_add_mm + hidden_vecs) 89 | 90 | # Feed-forward layers. 91 | out = self.mm_ff_layer1(mm_add) 92 | out = self.mm_ff_layer2(out) 93 | out = self.ff_dropout(out) 94 | out = self.mm_ff_ln(out + hidden_vecs) 95 | 96 | input_batch['hidden_vecs'] = out 97 | 98 | # Mixture Weights 99 | # [b, 1, 1, n, h] 100 | b = tf.shape(input_batch['hidden_vecs'])[0] 101 | attn_out_mw = self.mw_attn_layer( 102 | query=tf.tile(self.learned_add_mm, [b, 1, 1, 1, 1]), 103 | key=mm_add, 104 | value=mm_add, 105 | return_attention_scores=False) 106 | 107 | attn_out_mw = self.attn_dropout(attn_out_mw, training=training) 108 | 109 | # [b, 1, 1, n, h] 110 | attn_out_mw = self.mw_attn_ln(attn_out_mw) 111 | 112 | # Feed-forward layers. 113 | out_mw = self.mw_ff_layer1(attn_out_mw) 114 | out_mw = self.mw_ff_layer2(out_mw) 115 | out_mw = self.ff_dropout(out_mw, training=training) 116 | 117 | # [b, 1, 1, n] 118 | input_batch['mixture_logits'] = out_mw[..., 0] 119 | return input_batch 120 | -------------------------------------------------------------------------------- /human_scene_transformer/model/agent_self_alignment.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains agent self alignment layers.""" 16 | 17 | import tensorflow as tf 18 | 19 | 20 | class AgentSelfAlignmentLayer(tf.keras.layers.Layer): 21 | """Enables agent to become aware of its temporal identity. 22 | 23 | Agent features are cross-attended with a learned query in temporal dimension. 24 | """ 25 | 26 | def __init__(self, 27 | num_heads=8, 28 | hidden_size=256, 29 | ln_eps=1e-6, 30 | ff_dim=128, 31 | mask_style=None): 32 | super().__init__() 33 | self.hidden_size = hidden_size 34 | self.mask_style = mask_style 35 | if hidden_size % num_heads != 0: 36 | raise ValueError(f'hidden_size ({hidden_size}) must be an integer ' 37 | f'times bigger than num_heads ({num_heads}).') 38 | self.attn_layer = tf.keras.layers.MultiHeadAttention( 39 | num_heads=num_heads, 40 | key_dim=hidden_size // num_heads, 41 | attention_axes=2) 42 | self.attn_ln = tf.keras.layers.LayerNormalization(epsilon=ln_eps) 43 | self.ff_layer1 = tf.keras.layers.EinsumDense( 44 | '...h,hf->...f', output_shape=ff_dim, bias_axes='f', activation='relu') 45 | self.ff_layer2 = tf.keras.layers.EinsumDense( 46 | '...f,fh->...h', 47 | output_shape=hidden_size, 48 | bias_axes='h', 49 | activation=None) 50 | self.ff_ln = tf.keras.layers.LayerNormalization(epsilon=ln_eps) 51 | 52 | # [1, 1, 1, h] 53 | self.learned_query_vec = tf.Variable( 54 | tf.random_uniform_initializer( 55 | minval=-1., maxval=1.)(shape=[1, 1, 1, hidden_size]), 56 | trainable=True, 57 | dtype=tf.float32) 58 | 59 | def build_learned_query(self, input_batch): 60 | """Converts self.learned_query_vec into a learned query vector.""" 61 | # This weird thing is for exporting and loading keras model... 62 | b = tf.shape(input_batch['hidden_vecs'])[0] 63 | a = tf.shape(input_batch['hidden_vecs'])[1] 64 | t = tf.shape(input_batch['hidden_vecs'])[2] 65 | 66 | # [b, a, t, 1, h] 67 | return tf.tile(self.learned_query_vec, [b, a, t, 1]) 68 | 69 | def call(self, input_batch): 70 | input_batch = input_batch.copy() 71 | 72 | # [b, a, t, h] 73 | hidden_vecs = input_batch['hidden_vecs'] 74 | 75 | # Expand the attention mask with new dims so that Keras can broadcast to 76 | # the same shape as the attn_score: [b, num_heads, a, t, a, t]. 77 | # attn_mask shape: [b, 1, 1, 1, a, t,] 78 | # True means the position participate in the attention while all 79 | # False positions are ignored. 
80 | if self.mask_style is None: 81 | attn_mask = None 82 | elif self.mask_style == 'has_historic_data': 83 | has_historic_data = tf.logical_and( 84 | input_batch['has_historic_data'][..., 0], 85 | tf.logical_not(input_batch['is_hidden'][..., 0])) 86 | attn_mask = has_historic_data[:, :, tf.newaxis, tf.newaxis, :] 87 | elif self.mask_style == 'has_data': 88 | has_data_historic = tf.logical_and( 89 | input_batch['has_data'][..., 0], 90 | tf.logical_not(input_batch['is_hidden'][..., 0])) 91 | attn_mask = has_data_historic[:, :, tf.newaxis, tf.newaxis, :,] 92 | else: 93 | raise ValueError(f'Unrecognized mask style: {self.mask_style}. ' 94 | "Must be either None, 'fully_padded' or 'any_padded'.") 95 | 96 | # [b, a, t, 1, h] 97 | learned_query = self.build_learned_query(input_batch) 98 | attn_out, attn_score = self.attn_layer( 99 | query=learned_query, 100 | key=hidden_vecs, 101 | value=hidden_vecs, 102 | attention_mask=attn_mask, 103 | return_attention_scores=True) 104 | 105 | attn_out = self.attn_ln(attn_out + hidden_vecs) 106 | 107 | # Feed-forward layers. 108 | out = self.ff_layer1(attn_out) 109 | out = self.ff_layer2(out) 110 | out = self.ff_ln(out + attn_out) 111 | 112 | input_batch['hidden_vecs'] = out 113 | input_batch[f'attn_scores/{self.name}'] = attn_score 114 | 115 | return input_batch 116 | -------------------------------------------------------------------------------- /human_scene_transformer/metrics/ade.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """The ADE Keras Metric class.""" 16 | 17 | import gin 18 | import tensorflow as tf 19 | 20 | 21 | def distance_error(target: tf.Tensor, pred: tf.Tensor) -> tf.Tensor: 22 | return tf.sqrt( 23 | tf.reduce_sum(tf.square(pred - target), axis=-1, keepdims=True)) 24 | 25 | 26 | @gin.configurable 27 | class ADE(tf.keras.metrics.Metric): 28 | """Average Displacement Error over a n dimensional track. 29 | 30 | Calculates the mean L2 distance over all predicted timesteps. 31 | """ 32 | 33 | def __init__(self, params, cutoff_seconds=None, at_cutoff=False, name='ADE'): 34 | """Initializes the ADE metric. 35 | 36 | Args: 37 | params: ModelParams 38 | cutoff_seconds: Cutoff up to which time the metric should be calculated 39 | in seconds. 40 | at_cutoff: If True metric will be calculated at cutoff timestep. 41 | Otherwise metric is calculated as average up to cutoff_seconds. 42 | name: Metric name. 43 | """ 44 | super().__init__(name=name) 45 | self.agents_position_key = params.agents_position_key 46 | self.cutoff_seconds = cutoff_seconds 47 | self.at_cutoff = at_cutoff 48 | if cutoff_seconds is None: 49 | self.cutoff_idx = None 50 | else: 51 | # +1 due to current time step. 
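      # For example, assuming the model params mirror the JRDB challenge
      # dataset config above (num_history_steps=11, timestep=0.4 s), a
      # cutoff of 4.0 s gives cutoff_idx = int(11 + 4.0 / 0.4) + 1 = 22, so
      # the slices below keep timesteps 0..21, i.e. everything up to 4 s
      # after the current step.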
52 | self.cutoff_idx = int( 53 | params.num_history_steps + 54 | cutoff_seconds / params.timestep) + 1 55 | 56 | self.num_predictions = self.add_weight( 57 | name='num_predictions', initializer='zeros') 58 | self.total_deviation = self.add_weight( 59 | name='total_deviation', initializer='zeros') 60 | 61 | def _reduce(self, ade_with_modes, input_batch, predictions): 62 | """Reduces mode dimension. The base class squeezes a single mode.""" 63 | return tf.squeeze(ade_with_modes, axis=-1) 64 | 65 | def update_state(self, input_batch, predictions): 66 | should_predict = tf.cast(input_batch['should_predict'], tf.float32) 67 | 68 | target = input_batch[f'{self.agents_position_key}/target'] 69 | target = target[..., :predictions['agents/position'].shape[-1]] 70 | # [b, a, t, n, 3] -> [b, a, t, n, 1] 71 | per_position_ade = distance_error( 72 | target[..., tf.newaxis, :], 73 | predictions['agents/position']) 74 | 75 | # Non-observed or past should not contribute to ade. 76 | deviation = tf.math.multiply_no_nan(per_position_ade, 77 | should_predict[..., tf.newaxis, :]) 78 | # Chop off the un-wanted time part. 79 | # [b, a, cutoff_idx, 1] 80 | if self.at_cutoff and self.cutoff_seconds is not None: 81 | deviation = deviation[:, :, self.cutoff_idx-1:self.cutoff_idx, :] 82 | num_predictions = tf.reduce_sum( 83 | should_predict[:, :, self.cutoff_idx-1:self.cutoff_idx, :]) 84 | else: 85 | deviation = deviation[:, :, :self.cutoff_idx, :] 86 | num_predictions = tf.reduce_sum(should_predict[:, :, :self.cutoff_idx, :]) 87 | 88 | # Reduce along time 89 | deviation = tf.reduce_sum(deviation, axis=2) 90 | # Reduce along modes 91 | deviation = self._reduce(deviation, input_batch, predictions) 92 | # [1] 93 | deviation = tf.reduce_sum(deviation) 94 | 95 | self.num_predictions.assign_add(num_predictions) 96 | self.total_deviation.assign_add(deviation) 97 | 98 | def result(self): 99 | return self.total_deviation / self.num_predictions 100 | 101 | def reset_states(self): 102 | self.num_predictions.assign(0) 103 | self.total_deviation.assign(0.0) 104 | 105 | 106 | @gin.configurable 107 | class MinADE(ADE): 108 | """Takes the minimum over all modes.""" 109 | 110 | def _reduce(self, ade_with_modes, input_batch, predictions): 111 | return tf.reduce_min(ade_with_modes, axis=-2) 112 | 113 | 114 | @gin.configurable 115 | class MLADE(ADE): 116 | """Takes the maximum likelihood mode.""" 117 | 118 | def _reduce(self, ade_with_modes, input_batch, predictions): 119 | # Get index of mixture component with highest probability 120 | # [b, a=1, t=1, n] 121 | ml_indices = tf.math.argmax(predictions['mixture_logits'], axis=-1) 122 | a = ade_with_modes.shape[1] 123 | ml_indices = tf.tile( 124 | tf.squeeze(ml_indices, axis=2), [1, a])[..., tf.newaxis] 125 | 126 | return tf.gather( 127 | ade_with_modes, indices=ml_indices, batch_dims=2, axis=-2)[..., 0, :] 128 | -------------------------------------------------------------------------------- /human_scene_transformer/model/scene_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains scene encoder layers.""" 16 | 17 | import gin 18 | 19 | from human_scene_transformer.model import embedding 20 | 21 | import tensorflow as tf 22 | 23 | 24 | @gin.register 25 | class ConvOccupancyGridEncoderLayer(tf.keras.layers.Layer): 26 | """Uses only the frame on the current step for now.""" 27 | 28 | def __init__(self, params): 29 | super().__init__() 30 | self.xyz_embedding_layer = embedding.SinusoidalEmbeddingLayer( 31 | hidden_size=params.hidden_size) 32 | self.current_step_idx = params.num_history_steps 33 | self.num_filters = params.num_conv_filters 34 | self.hidden_size = params.hidden_size 35 | drop_prob = params.drop_prob 36 | self.mlp = tf.keras.layers.EinsumDense( 37 | '...f,fh->...h', 38 | output_shape=self.xyz_embedding_layer.hidden_size, 39 | bias_axes='h', 40 | activation=None) 41 | 42 | # Construct conv and max pooling layers. 43 | self.layers = [] 44 | for i, num_filter in enumerate(self.num_filters): 45 | if i == 0 or i == 1: 46 | strides = 2 47 | else: 48 | strides = 1 49 | conv_layer = tf.keras.layers.Conv2D( 50 | filters=num_filter, 51 | kernel_size=3, 52 | strides=strides, 53 | padding='same', 54 | activation='relu', 55 | data_format='channels_last') 56 | self.layers.append(conv_layer) 57 | dropout = tf.keras.layers.Dropout(drop_prob) 58 | self.layers.append(dropout) 59 | pooling_layer = tf.keras.layers.MaxPool2D( 60 | pool_size=(2, 2), data_format='channels_last') 61 | self.layers.append(pooling_layer) 62 | # Flatten and use MLP to map to [b, h]. 63 | self.layers.append(tf.keras.layers.Flatten()) 64 | self.layers.append( 65 | tf.keras.layers.Dense(units=self.hidden_size, activation='relu')) 66 | self.layers.append(tf.keras.layers.Dropout(drop_prob)) 67 | 68 | self.seq_layers = tf.keras.Sequential(self.layers) 69 | 70 | def _get_origin_embedding(self, input_batch, training=None): 71 | # [b] 72 | x = input_batch['scene/origin'][:, self.current_step_idx, 0] 73 | y = input_batch['scene/origin'][:, self.current_step_idx, 1] 74 | # [b, hidden_size] 75 | origin_embedding_x = self.xyz_embedding_layer(x, training=training) 76 | origin_embedding_y = self.xyz_embedding_layer(y, training=training) 77 | origin_embedding = tf.concat([origin_embedding_x, origin_embedding_y], 78 | axis=-1) 79 | origin_embedding = self.mlp(origin_embedding) 80 | 81 | return origin_embedding 82 | 83 | def _apply_img_layers(self, input_batch, training=None): 84 | # [b, width, height]. 85 | occ_grid = input_batch['scene/grid'][:, self.current_step_idx, :, :, 86 | tf.newaxis] 87 | 88 | occ_grid = self.seq_layers(occ_grid, training=training) 89 | return occ_grid 90 | 91 | def call(self, input_batch, training=None): 92 | input_batch = input_batch.copy() 93 | 94 | # Embed occupation grid origin w/ sinusoidal embedding. 95 | origin_embedding = self._get_origin_embedding(input_batch) 96 | 97 | # Apply layers. 98 | occ_grid = self._apply_img_layers(input_batch, training=training) 99 | 100 | # [b, hidden_size]. Combine origin and grid. 
101 | occ_grid += origin_embedding 102 | input_batch['scene_hidden_vec'] = occ_grid[:, tf.newaxis, tf.newaxis :] 103 | return input_batch 104 | 105 | 106 | @gin.register 107 | class PointCloudEncoderLayer(tf.keras.layers.Layer): 108 | """Retrieves the point cloud at the current timestep.""" 109 | 110 | def __init__(self, params): 111 | super().__init__() 112 | self.current_step_idx = params.num_history_steps 113 | self.embedding_layer = embedding.SinusoidalEmbeddingLayer( 114 | hidden_size=params.feature_embedding_size) 115 | 116 | def call(self, input_batch, training=None): 117 | input_batch = input_batch.copy() 118 | 119 | pc = input_batch['scene/pc'][:, self.current_step_idx, ..., :2] 120 | 121 | pc = tf.where(tf.math.is_nan(pc), 0., pc) 122 | 123 | pc_emb = self.embedding_layer(pc, training=training) 124 | 125 | pc_emb = tf.concat([pc_emb[..., 0, :], pc_emb[..., 1, :]], axis=-1) 126 | 127 | input_batch['scene_hidden_vec'] = pc_emb 128 | 129 | return input_batch 130 | -------------------------------------------------------------------------------- /human_scene_transformer/train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Train human scene transformer.""" 16 | 17 | from typing import Sequence 18 | 19 | from absl import app 20 | from absl import flags 21 | from absl import logging 22 | import gin 23 | 24 | from human_scene_transformer import train_model 25 | from human_scene_transformer import training_params as tp 26 | from human_scene_transformer.jrdb import dataset_params as jrdb_dp 27 | from human_scene_transformer.jrdb import input_fn as jrdb_input_fn 28 | from human_scene_transformer.model import model_params as mp 29 | from human_scene_transformer.pedestrians import dataset_params as pedestrians_dp 30 | from human_scene_transformer.pedestrians import input_fn as pedestrians_input_fn 31 | 32 | import tensorflow as tf 33 | 34 | 35 | _DEVICE = flags.DEFINE_string( 36 | 'device', 'cpu', 'The device identification.' 37 | ) 38 | _REPLACE_DATE_TIME_STRING_BY = flags.DEFINE_string( 39 | 'replace_date_time_string_by', 40 | None, 41 | ( 42 | 'A string to replace the automatic date-time string for the model' 43 | ' foldername.' 44 | ), 45 | ) 46 | _MODEL_BASE_DIR = flags.DEFINE_string( 47 | 'model_base_dir', 48 | '/tmp/model_dir', 49 | ( 50 | 'The base folder for the trained model, contains ckpts, etc. ' 51 | 'Usually this comes from xmanager.' 52 | ), 53 | ) 54 | _TENSORBOARD_DIR = flags.DEFINE_string( 55 | 'tensorboard_dir', 56 | '/tmp/tensorboard', 57 | 'The base folder to save tensorboard summaries.', 58 | ) 59 | _LEARNING_RATE = flags.DEFINE_float( 60 | 'learning_rate', 1e-4, 'The maximum learning rate' 61 | ) 62 | _DATASET = flags.DEFINE_string( 63 | 'dataset', 'EDR', 'Dataset to train and evaluate on.' 
64 | ) 65 | 66 | _GIN_FILE = flags.DEFINE_multi_string( 67 | 'gin_files', 68 | None, 69 | 'A newline separated list of paths to Gin configuration files used to ' 70 | 'configure training, model, and dataset generation') 71 | 72 | _GIN_OVERRIDE = flags.DEFINE_multi_string( 73 | 'gin_overrides', 74 | [], 75 | 'A newline separated list of parameters used to override ones specified in' 76 | ' the gin file.') 77 | 78 | _LOGGING_INTERVAL = 1 79 | 80 | 81 | def main(argv: Sequence[str]) -> None: 82 | if len(argv) > 1: 83 | raise app.UsageError('Too many command-line arguments.') 84 | 85 | gin.parse_config_files_and_bindings(_GIN_FILE.value, _GIN_OVERRIDE.value) 86 | logging.info('Actual gin config used:') 87 | logging.info(gin.config_str()) 88 | 89 | if 'cpu' in _DEVICE.value or 'gpu' in _DEVICE.value: 90 | strategy = tf.distribute.OneDeviceStrategy(_DEVICE.value) 91 | else: # TPU 92 | resolver = tf.distribute.cluster_resolver.TPUClusterResolver( 93 | tpu=_DEVICE.value) 94 | tf.config.experimental_connect_to_cluster(resolver) 95 | tf.tpu.experimental.initialize_tpu_system(resolver) 96 | strategy = tf.distribute.TPUStrategy(resolver) 97 | 98 | train_params = tp.TrainingParams() 99 | 100 | if _DATASET.value == 'JRDB': 101 | dataset_params = jrdb_dp.JRDBDatasetParams() 102 | ds_train = jrdb_input_fn.get_train_dataset(dataset_params) 103 | ds_eval = jrdb_input_fn.get_eval_dataset(dataset_params) 104 | ds_train = ds_train.batch(train_params.batch_size, drop_remainder=True) 105 | ds_eval = ds_eval.batch(train_params.batch_size, drop_remainder=True) 106 | dist_train_dataset = strategy.experimental_distribute_dataset(ds_train) 107 | dist_eval_dataset = strategy.experimental_distribute_dataset(ds_eval) 108 | elif _DATASET.value == 'PEDESTRIANS': 109 | dataset_params = pedestrians_dp.PedestriansDatasetParams() 110 | ds_train = pedestrians_input_fn.get_train_dataset(dataset_params) 111 | ds_eval = pedestrians_input_fn.get_eval_dataset(dataset_params) 112 | ds_train = ds_train.batch(train_params.batch_size, drop_remainder=True) 113 | ds_eval = ds_eval.batch(train_params.batch_size, drop_remainder=True) 114 | dist_train_dataset = strategy.experimental_distribute_dataset(ds_train) 115 | dist_eval_dataset = strategy.experimental_distribute_dataset(ds_eval) 116 | else: 117 | raise ValueError 118 | 119 | model_params = mp.ModelParams() 120 | 121 | with strategy.scope(): 122 | metrics_tuple = train_model.get_metrics(model_params) 123 | 124 | train_model.train_model( 125 | train_params=train_params, 126 | model_params=model_params, 127 | train_ds=dist_train_dataset, 128 | eval_ds=dist_eval_dataset, 129 | metrics=metrics_tuple, 130 | strategy=strategy, 131 | replace_date_time_string_by=_REPLACE_DATE_TIME_STRING_BY.value, 132 | model_base_dir=_MODEL_BASE_DIR.value, 133 | tensorboard_dir=_TENSORBOARD_DIR.value 134 | ) 135 | 136 | 137 | if __name__ == '__main__': 138 | flags.mark_flags_as_required([ 139 | 'model_base_dir', 'gin_files' 140 | ]) 141 | app.run(main) 142 | -------------------------------------------------------------------------------- /human_scene_transformer/model/model_params.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Model parameter classes. 16 | 17 | The module contains a dataclasses used to configure the model. 18 | """ 19 | 20 | import gin 21 | 22 | from human_scene_transformer import is_hidden_generators 23 | from human_scene_transformer.model import agent_feature_encoder 24 | from human_scene_transformer.model import head 25 | from human_scene_transformer.model import scene_encoder as _ 26 | 27 | 28 | @gin.configurable 29 | class ModelParams(object): 30 | """This object configures the model.""" 31 | 32 | def __init__( 33 | self, 34 | agents_feature_config=None, 35 | agents_position_key='agents/position', 36 | agents_orientation_key='agents/orientation', 37 | hidden_size=128, 38 | feature_embedding_size=128, 39 | transformer_ff_dim=64, 40 | ln_eps=1e-6, 41 | num_heads=4, 42 | num_conv_filters=(32, 32, 64, 64, 128), 43 | num_modes=4, 44 | attn_architecture=( 45 | 'self-attention', 46 | 'cross-attention', 47 | 'multimodality_induction', 48 | 'self-attention', 49 | 'self-attention-mode', 50 | 'self-attention', 51 | 'self-attention-mode', 52 | ), 53 | mask_style=None, 54 | scene_encoder=None, 55 | prediction_head=head.PredictionHeadLayer, 56 | prediction_head_hidden_units=None, 57 | drop_prob=0.1, 58 | num_history_steps=12, 59 | num_steps=49, 60 | timestep=1 / 6, 61 | is_hidden_generator=is_hidden_generators.BPIsHiddenGenerator, 62 | ): 63 | """Initialize the TrainParam object. 64 | 65 | This is where gin injection occurs. 66 | 67 | Args: 68 | agents_feature_config: Dict mapping input features to encoders. 69 | agents_position_key: Name of the position key. 70 | agents_orientation_key: Name of the orientation key. 71 | hidden_size: The hidden vector size of the transformer. 72 | feature_embedding_size: The embedding vector size of agent features. 73 | transformer_ff_dim: The hidden layer size of transformer layers. 74 | ln_eps: Epsilon value for layer norm operations. 75 | num_heads: The number of heads in multi-headed attention. 76 | num_conv_filters: The number of filters in the occupancy grid convolution 77 | layers. 78 | num_modes: Number of predicted possible future outcomes. Each agent has 79 | exactly one predicted trajectory per possible outcome. 80 | attn_architecture: A tuple of strings describing the transformer 81 | architecture. Options are: 'self-attention', 'cross-attention' and 82 | 'multimodality_induction'. 83 | mask_style: A string or None describing the mask in self-attention. 84 | 'fully_padded' means timesteps of padded agents will not participate in 85 | the attention computation. 'any_padded' means any padded timesteps will 86 | not participate in the attention computation. 87 | scene_encoder: The scene encoder to use. 88 | prediction_head: Type of prediction head to use: 'full' with all outputs 89 | or '2d_pos' where only the 2D position is predicted. 90 | prediction_head_hidden_units: # A tuple describing the number of hidden 91 | units in the Prediction head. Set to None to disable the hidden layer. 92 | drop_prob: Dropout probability during training. 93 | num_history_steps: Number of history timesteps per data point. 
94 | num_steps: Number of timesteps per data point. 95 | timestep: Delta time of a timestep in seconds. 96 | is_hidden_generator: The class to generating the is_hidden tensor. When 97 | called, the generator class returns a np array representing which 98 | agents and timesteps should be masked from the model. 99 | """ 100 | if agents_feature_config is None: 101 | self.agents_feature_config = { 102 | 'agents/position': agent_feature_encoder.AgentPositionEncoder, 103 | 'agents/orientation': agent_feature_encoder.Agent2DOrientationEncoder, 104 | 'agents/detection_score': agent_feature_encoder.AgentScalarEncoder, 105 | 'agents/detection_stage': agent_feature_encoder.AgentOneHotEncoder 106 | } 107 | else: 108 | self.agents_feature_config = agents_feature_config 109 | self.agents_position_key = agents_position_key 110 | self.agents_orientation_key = agents_orientation_key 111 | self.hidden_size = hidden_size 112 | self.feature_embedding_size = feature_embedding_size 113 | self.transformer_ff_dim = transformer_ff_dim 114 | self.ln_eps = ln_eps 115 | self.num_heads = num_heads 116 | self.num_conv_filters = num_conv_filters 117 | self.num_modes = num_modes 118 | self.attn_architecture = attn_architecture 119 | self.mask_style = mask_style 120 | self.scene_encoder = scene_encoder 121 | self.prediction_head = prediction_head 122 | self.prediction_head_hidden_units = prediction_head_hidden_units 123 | self.drop_prob = drop_prob 124 | self.is_hidden_generator = is_hidden_generator 125 | 126 | self.num_history_steps = num_history_steps 127 | self.num_steps = num_steps 128 | self.timestep = timestep 129 | -------------------------------------------------------------------------------- /human_scene_transformer/jrdb/eval.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | """Evaluates Model on JRDB dataset.""" 16 | 17 | import os 18 | from typing import Sequence 19 | from absl import app 20 | from absl import flags 21 | from absl import logging 22 | import gin 23 | from human_scene_transformer.jrdb import dataset_params 24 | from human_scene_transformer.jrdb import input_fn 25 | from human_scene_transformer.metrics import metrics 26 | from human_scene_transformer.model import model as hst_model 27 | from human_scene_transformer.model import model_params 28 | import tensorflow as tf 29 | import tqdm 30 | 31 | 32 | _MODEL_PATH = flags.DEFINE_string( 33 | 'model_path', 34 | None, 35 | 'Path to model directory.', 36 | ) 37 | 38 | _CKPT_PATH = flags.DEFINE_string( 39 | 'checkpoint_path', 40 | None, 41 | 'Path to model checkpoint.', 42 | ) 43 | 44 | 45 | def evaluation(checkpoint_path): 46 | """Evaluates Model on Pedestrian dataset.""" 47 | d_params = dataset_params.JRDBDatasetParams(num_agents=None) 48 | 49 | dataset = input_fn.load_dataset( 50 | d_params, 51 | d_params.eval_scenes, 52 | augment=False, 53 | shuffle=False, 54 | allow_parallel=False, 55 | evaluation=False, 56 | repeat=False, 57 | keep_subsamples=False, 58 | ) 59 | 60 | model_p = model_params.ModelParams() 61 | 62 | model = hst_model.HumanTrajectorySceneTransformer(model_p) 63 | 64 | _, _ = model(next(iter(dataset.batch(1))), training=False) 65 | 66 | checkpoint_mngr = tf.train.Checkpoint(model=model) 67 | checkpoint_mngr.restore(checkpoint_path).assert_existing_objects_matched() 68 | logging.info('Restored checkpoint: %s', checkpoint_path) 69 | 70 | ade_metric = metrics.ade.MinADE(model_p) 71 | ade_metric_1s = metrics.ade.MinADE( 72 | model_p, cutoff_seconds=1.0, at_cutoff=True 73 | ) 74 | ade_metric_2s = metrics.ade.MinADE( 75 | model_p, cutoff_seconds=2.0, at_cutoff=True 76 | ) 77 | ade_metric_3s = metrics.ade.MinADE( 78 | model_p, cutoff_seconds=3.0, at_cutoff=True 79 | ) 80 | ade_metric_4s = metrics.ade.MinADE( 81 | model_p, cutoff_seconds=4.0, at_cutoff=True 82 | ) 83 | 84 | mlade_metric = metrics.ade.MLADE(model_p) 85 | mlade_metric_1s = metrics.ade.MLADE( 86 | model_p, cutoff_seconds=1.0, at_cutoff=True 87 | ) 88 | mlade_metric_2s = metrics.ade.MLADE( 89 | model_p, cutoff_seconds=2.0, at_cutoff=True 90 | ) 91 | mlade_metric_3s = metrics.ade.MLADE( 92 | model_p, cutoff_seconds=3.0, at_cutoff=True 93 | ) 94 | mlade_metric_4s = metrics.ade.MLADE( 95 | model_p, cutoff_seconds=4.0, at_cutoff=True 96 | ) 97 | 98 | nll_metric = metrics.pos_nll.PositionNegativeLogLikelihood(model_p) 99 | nll_metric_1s = metrics.pos_nll.PositionNegativeLogLikelihood( 100 | model_p, cutoff_seconds=1.0, at_cutoff=True 101 | ) 102 | nll_metric_2s = metrics.pos_nll.PositionNegativeLogLikelihood( 103 | model_p, cutoff_seconds=2.0, at_cutoff=True 104 | ) 105 | nll_metric_3s = metrics.pos_nll.PositionNegativeLogLikelihood( 106 | model_p, cutoff_seconds=3.0, at_cutoff=True 107 | ) 108 | nll_metric_4s = metrics.pos_nll.PositionNegativeLogLikelihood( 109 | model_p, cutoff_seconds=4.0, at_cutoff=True 110 | ) 111 | 112 | for input_batch in tqdm.tqdm(dataset.batch(1)): 113 | full_pred, output_batch = model(input_batch, training=False) 114 | ade_metric.update_state(output_batch, full_pred) 115 | ade_metric_1s.update_state(output_batch, full_pred) 116 | ade_metric_2s.update_state(output_batch, full_pred) 117 | ade_metric_3s.update_state(output_batch, full_pred) 118 | ade_metric_4s.update_state(output_batch, full_pred) 119 | 120 | mlade_metric.update_state(output_batch, full_pred) 121 | 
mlade_metric_1s.update_state(output_batch, full_pred) 122 | mlade_metric_2s.update_state(output_batch, full_pred) 123 | mlade_metric_3s.update_state(output_batch, full_pred) 124 | mlade_metric_4s.update_state(output_batch, full_pred) 125 | 126 | nll_metric.update_state(output_batch, full_pred) 127 | nll_metric_1s.update_state(output_batch, full_pred) 128 | nll_metric_2s.update_state(output_batch, full_pred) 129 | nll_metric_3s.update_state(output_batch, full_pred) 130 | nll_metric_4s.update_state(output_batch, full_pred) 131 | 132 | print(f'MinADE: {ade_metric.result().numpy():.2f}') 133 | print(f'MinADE @ 1s: {ade_metric_1s.result().numpy():.2f}') 134 | print(f'MinADE @ 2s: {ade_metric_2s.result().numpy():.2f}') 135 | print(f'MinADE @ 3s: {ade_metric_3s.result().numpy():.2f}') 136 | print(f'MinADE @ 4s: {ade_metric_4s.result().numpy():.2f}') 137 | 138 | print(f'MLADE: {mlade_metric.result().numpy():.2f}') 139 | print(f'MLADE @ 1s: {mlade_metric_1s.result().numpy():.2f}') 140 | print(f'MLADE @ 2s: {mlade_metric_2s.result().numpy():.2f}') 141 | print(f'MLADE @ 3s: {mlade_metric_3s.result().numpy():.2f}') 142 | print(f'MLADE @ 4s: {mlade_metric_4s.result().numpy():.2f}') 143 | 144 | print(f'NLL: {nll_metric.result().numpy():.2f}') 145 | print(f'NLL @ 1s: {nll_metric_1s.result().numpy():.2f}') 146 | print(f'NLL @ 2s: {nll_metric_2s.result().numpy():.2f}') 147 | print(f'NLL @ 3s: {nll_metric_3s.result().numpy():.2f}') 148 | print(f'NLL @ 4s: {nll_metric_4s.result().numpy():.2f}') 149 | 150 | 151 | def main(argv: Sequence[str]) -> None: 152 | if len(argv) > 1: 153 | raise app.UsageError('Too many command-line arguments.') 154 | 155 | gin.parse_config_files_and_bindings( 156 | [os.path.join(_MODEL_PATH.value, 'params', 'operative_config.gin')], 157 | None, 158 | skip_unknown=True) 159 | print('Actual gin config used:') 160 | print(gin.config_str()) 161 | 162 | evaluation(_CKPT_PATH.value) 163 | 164 | if __name__ == '__main__': 165 | logging.set_verbosity(logging.ERROR) 166 | app.run(main) 167 | -------------------------------------------------------------------------------- /human_scene_transformer/model/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains preprocess layers.""" 16 | 17 | from typing import Dict, Optional 18 | 19 | from human_scene_transformer.model.model_params import ModelParams 20 | 21 | import tensorflow as tf 22 | 23 | 24 | class PreprocessLayer(tf.keras.layers.Layer): 25 | """Preprocessing layer of the model. 26 | 27 | The preprocess layer applies the following preprocessing steps to the input. 28 | NOTE: we refer 'positions' as the transformer positions so each position 29 | corresponds to a unique agent at a specific timestep. 30 | 31 | 1) Copy the raw input and save in `/original` and `/target` keys. 32 | 2) Compute the 'has_data' mask for all features. 
True at fields where the 33 | feature has data. Further, compute the global 'has_data' mask. True where 34 | xyz data are available. Further, compute the 'has_historic_data' mask. 35 | True for agents which have at least one valid xyz data point in the xyz 36 | feature. 37 | 3) Compute the `is_padded` bool tensor of shape [batch (b), num_agents (a) 38 | ,num_timesteps (t), 1]. True if the position is padded, ie, no valid 39 | observation. 40 | 4) Compute which positions need to be predicted and save it to the 41 | `should_predict` bool tensor of shape [b, a, t, 1]. A position should be 42 | predicted if it is hidden, not padded and the agent has historic data. 43 | 5) Mask agent features based on their 'has_data' mask. 44 | """ 45 | 46 | def __init__(self, params: ModelParams): 47 | super().__init__(name='PreprocessLayer') 48 | self.params = params 49 | 50 | self.is_hidden_generator = params.is_hidden_generator( 51 | self.params.num_steps, 52 | self.params.num_history_steps) 53 | 54 | self.agents_feature_config = params.agents_feature_config 55 | self.agents_position_key = params.agents_position_key 56 | 57 | def call(self, 58 | raw_input_batch: Dict[str, tf.Tensor], 59 | is_hidden: Optional[tf.Tensor] = None) -> Dict[str, tf.Tensor]: 60 | input_batch = raw_input_batch.copy() 61 | 62 | if is_hidden is None: 63 | num_agents = input_batch[self.params.agents_position_key].shape[1] 64 | input_batch['is_hidden'] = self.is_hidden_generator( 65 | num_agents, train_progress=0.0) 66 | else: 67 | input_batch['is_hidden'] = is_hidden 68 | 69 | input_batch = self._add_original_and_target(input_batch) 70 | input_batch = self._add_has_data(input_batch) 71 | input_batch = self._add_should_predict(input_batch) 72 | input_batch = self._mask_agent_features(input_batch) 73 | 74 | return input_batch 75 | 76 | def _add_original_and_target(self, input_batch): 77 | """Adds original, directly from input_batch values so we can use it later. 78 | """ 79 | input_batch_new = input_batch.copy() 80 | for feature in self.agents_feature_config.keys(): 81 | input_batch_new[f'{feature}/original'] = input_batch[feature] 82 | input_batch_new[f'{feature}/target'] = input_batch[feature] 83 | data_type = input_batch_new[f'{feature}/target'].dtype 84 | if data_type.is_floating: 85 | # Set target to 0. 
where NaN 86 | input_batch_new[f'{feature}/target'] = tf.where( 87 | tf.math.is_nan(input_batch_new[f'{feature}/target']), 88 | tf.zeros_like(input_batch_new[f'{feature}/target']), 89 | input_batch_new[f'{feature}/target']) 90 | else: 91 | input_batch_new[f'{feature}/target'] = tf.where( 92 | input_batch_new[f'{feature}/target'] == data_type.min, 93 | tf.zeros_like(input_batch_new[f'{feature}/target']), 94 | input_batch_new[f'{feature}/target']) 95 | return input_batch_new 96 | 97 | def _add_has_data(self, input_batch): 98 | num_hist_steps = self.params.num_history_steps 99 | input_batch = input_batch.copy() 100 | 101 | def has_data(t): 102 | if t.dtype.is_floating: 103 | return tf.math.logical_not( 104 | tf.reduce_any(tf.math.is_nan(t), axis=-1, keepdims=True)) 105 | else: 106 | return tf.math.logical_not( 107 | tf.reduce_any(t == t.dtype.min, axis=-1, keepdims=True)) 108 | 109 | # Each Feature 110 | for feature in self.agents_feature_config.keys(): 111 | f = input_batch[feature] 112 | input_batch[f'has_data/{feature}'] = has_data(f) 113 | 114 | # Global 115 | # [b, a, t, 1] 116 | has_data = input_batch[f'has_data/{self.agents_position_key}'] 117 | 118 | # [b, a, 1, 1] 119 | has_historic_data = tf.reduce_any( 120 | has_data[..., :num_hist_steps + 1, :], axis=-2, keepdims=True) 121 | 122 | input_batch['has_data'] = has_data 123 | input_batch['has_historic_data'] = has_historic_data 124 | return input_batch 125 | 126 | def _add_should_predict(self, input_batch): 127 | # Only include in loss computation if it is: 128 | # 1) hidden, 2) not padded, 3) has historic data 129 | input_batch['should_predict'] = tf.logical_and( 130 | input_batch['is_hidden'], 131 | tf.logical_and(input_batch['has_data'], 132 | input_batch['has_historic_data'])) 133 | 134 | return input_batch 135 | 136 | def _set_elems_to_value(self, target, should_set, new_val): 137 | """Sets elements in the target marked by should_set to value_to_set. 138 | 139 | Args: 140 | target: Target array to be operated on. 141 | should_set: This must be a binary array with value 1 or 0 with the same 142 | shape as the target. Elements with 1 will cause the element of the 143 | target at the same indices to be changed to value_to_set. 144 | new_val: The new value to set elements to. 145 | 146 | Returns: 147 | target: The target array after the operation. 148 | """ 149 | 150 | target = tf.where(should_set, tf.cast(new_val, target.dtype), target) 151 | 152 | return target 153 | 154 | def _mask_agent_features(self, input_batch): 155 | input_batch = input_batch.copy() 156 | 157 | for feature in self.agents_feature_config.keys(): 158 | feature_is_padded = tf.logical_not(input_batch[f'has_data/{feature}']) 159 | input_batch[feature] = self._set_elems_to_value( 160 | input_batch[feature], 161 | tf.logical_or(feature_is_padded, input_batch['is_hidden']), 0.) 162 | 163 | return input_batch 164 | -------------------------------------------------------------------------------- /human_scene_transformer/model/agent_feature_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
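A minimal sketch of what the `PreprocessLayer` above produces, assuming a single position feature and an explicitly supplied `is_hidden` mask; the shapes and values are illustrative, not a test case from the repo.

```
# Sketch: one batch, two agents, only the position feature; NaNs mark padding.
import numpy as np
import tensorflow as tf
from human_scene_transformer.model import agent_feature_encoder, preprocess
from human_scene_transformer.model.model_params import ModelParams

params = ModelParams(
    agents_feature_config={
        'agents/position': agent_feature_encoder.AgentPositionEncoder})
layer = preprocess.PreprocessLayer(params)

positions = np.random.randn(1, 2, params.num_steps, 3).astype(np.float32)
positions[0, 1, :3] = np.nan  # agent 1 is unobserved for the first timesteps

# Hide everything after the current timestep, as in the default setting.
is_hidden = tf.concat(
    [tf.zeros([1, 2, params.num_history_steps + 1, 1], tf.bool),
     tf.ones([1, 2, params.num_steps - params.num_history_steps - 1, 1],
             tf.bool)],
    axis=2)

out = layer({'agents/position': tf.constant(positions)}, is_hidden=is_hidden)
# Added keys include 'has_data', 'has_historic_data', 'should_predict' and
# 'agents/position/original' / 'agents/position/target'.
```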
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Agent feature encoder.""" 16 | 17 | import gin 18 | 19 | from human_scene_transformer.model.embedding import SinusoidalEmbeddingLayer 20 | 21 | import tensorflow as tf 22 | 23 | 24 | @gin.register 25 | class AgentTemporalEncoder(tf.keras.layers.Layer): 26 | """Encodes agents temporal positions.""" 27 | 28 | def __init__(self, key, output_shape, params): 29 | super().__init__() 30 | self.key = key 31 | self.embedding_layer = SinusoidalEmbeddingLayer( 32 | max_freq=params.num_steps, 33 | hidden_size=params.feature_embedding_size) 34 | 35 | self.mlp = tf.keras.layers.EinsumDense( 36 | '...f,fh->...h', 37 | output_shape=output_shape, 38 | bias_axes='h', 39 | activation=None) 40 | 41 | def _get_temporal_embedding(self, input_batch): 42 | # This weird thing is for exporting and loading keras model... 43 | b = tf.shape(input_batch[self.key])[0] 44 | num_agents = tf.shape(input_batch[self.key])[1] 45 | num_steps = tf.shape(input_batch[self.key])[2] 46 | 47 | t = tf.range(0, num_steps, dtype=tf.float32) 48 | t = t[tf.newaxis, tf.newaxis, :] 49 | t = tf.tile(t, [b, num_agents, 1]) 50 | return self.embedding_layer(t[..., tf.newaxis]) 51 | 52 | def call(self, input_batch): 53 | return (self.mlp(self._get_temporal_embedding(input_batch)), 54 | tf.ones_like(input_batch['has_data'])) 55 | 56 | 57 | @gin.register 58 | class AgentPositionEncoder(tf.keras.layers.Layer): 59 | """Encodes agents spatial positions.""" 60 | 61 | def __init__(self, key, output_shape, params): 62 | super().__init__() 63 | self.key = key 64 | self.embedding_layer = SinusoidalEmbeddingLayer( 65 | hidden_size=params.feature_embedding_size) 66 | 67 | self.mlp = tf.keras.layers.EinsumDense( 68 | '...f,fh->...h', 69 | output_shape=output_shape, 70 | bias_axes='h', 71 | activation=None) 72 | 73 | def call(self, input_batch): 74 | not_is_hidden = tf.logical_not(input_batch['is_hidden']) 75 | mask = tf.logical_and(input_batch[f'has_data/{self.key}'], not_is_hidden) 76 | mask = tf.repeat(mask, tf.shape(input_batch[self.key])[-1], axis=-1) 77 | return self.mlp(self.embedding_layer(input_batch[self.key])), mask 78 | 79 | 80 | @gin.register 81 | class Agent2DOrientationEncoder(tf.keras.layers.Layer): 82 | """Encodes agents 2d orientation.""" 83 | 84 | def __init__(self, key, output_shape, params): 85 | super().__init__() 86 | self.key = key 87 | self.embedding_layer = SinusoidalEmbeddingLayer( 88 | max_freq=2, 89 | hidden_size=params.feature_embedding_size//2) 90 | 91 | self.mlp = tf.keras.layers.EinsumDense( 92 | '...f,fh->...h', 93 | output_shape=output_shape, 94 | bias_axes='h', 95 | activation=None) 96 | 97 | def call(self, input_batch): 98 | orientation = input_batch[self.key] 99 | orientation_embedding = tf.concat([ 100 | self.embedding_layer(tf.math.sin(orientation)), 101 | self.embedding_layer(tf.math.cos(orientation)) 102 | ], axis=-1) 103 | 104 | not_is_hidden = tf.logical_not(input_batch['is_hidden']) 105 | mask = tf.logical_and(input_batch[f'has_data/{self.key}'], not_is_hidden) 106 | 107 | return self.mlp(orientation_embedding), mask 108 | 109 | 110 | @gin.register 111 | class 
AgentScalarEncoder(tf.keras.layers.Layer): 112 | """Encodes a agent's scalar.""" 113 | 114 | def __init__(self, key, output_shape, params): 115 | super().__init__() 116 | self.key = key 117 | 118 | self.mlp = tf.keras.layers.EinsumDense( 119 | '...f,fh->...h', 120 | output_shape=output_shape, 121 | bias_axes='h', 122 | activation=None) 123 | 124 | def call(self, input_batch): 125 | not_is_hidden = tf.logical_not(input_batch['is_hidden']) 126 | mask = tf.logical_and(input_batch[f'has_data/{self.key}'], not_is_hidden) 127 | return self.mlp(input_batch[self.key])[..., tf.newaxis, :], mask 128 | 129 | 130 | @gin.configurable 131 | class AgentOneHotEncoder(tf.keras.layers.Layer): 132 | """Encodes the detection stage.""" 133 | 134 | def __init__(self, key, output_shape, params, depth=1): 135 | super().__init__() 136 | self.key = key 137 | self.depth = depth 138 | 139 | self.mlp = tf.keras.layers.EinsumDense( 140 | '...f,fh->...h', 141 | output_shape=output_shape, 142 | bias_axes='h', 143 | activation=None) 144 | 145 | def call(self, input_batch): 146 | stage_one_hot = tf.one_hot( 147 | tf.squeeze(input_batch[self.key], axis=-1), 148 | self.depth) 149 | 150 | not_is_hidden = tf.logical_not(input_batch['is_hidden']) 151 | mask = tf.logical_and(input_batch[f'has_data/{self.key}'], not_is_hidden) 152 | return self.mlp(stage_one_hot)[..., tf.newaxis, :], mask 153 | 154 | 155 | @gin.configurable 156 | class AgentKeypointsEncoder(tf.keras.layers.Layer): 157 | """Encodes the agent's keypoints.""" 158 | 159 | def __init__(self, key, output_shape, params): 160 | super().__init__() 161 | self.key = key 162 | 163 | self.mlp1 = tf.keras.layers.EinsumDense( 164 | '...f,fh->...h', 165 | output_shape=output_shape, 166 | bias_axes='h', 167 | activation=tf.nn.relu) 168 | 169 | def call(self, input_batch, training=None): 170 | not_is_hidden = tf.logical_not(input_batch['is_hidden']) 171 | mask = tf.logical_and(input_batch[f'has_data/{self.key}'], not_is_hidden) 172 | 173 | keypoints = input_batch[self.key] 174 | 175 | out = self.mlp1(keypoints)[..., tf.newaxis, :] 176 | 177 | return out, mask 178 | 179 | 180 | @gin.configurable 181 | class AgentHeadOrientationEncoder(tf.keras.layers.Layer): 182 | """Encodes the detection stage.""" 183 | 184 | def __init__(self, key, output_shape, params): 185 | super().__init__() 186 | self.key = key 187 | 188 | self.mlp = tf.keras.layers.EinsumDense( 189 | '...f,fh->...h', 190 | output_shape=output_shape, 191 | bias_axes='h', 192 | activation=None) 193 | 194 | def call(self, input_batch): 195 | not_is_hidden = tf.logical_not(input_batch['is_hidden']) 196 | mask = tf.logical_and(input_batch[f'has_data/{self.key}'], not_is_hidden) 197 | return self.mlp(input_batch[self.key])[..., tf.newaxis, :], mask 198 | -------------------------------------------------------------------------------- /human_scene_transformer/data/jrdb_train_detections_to_tracks.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
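The encoders above share a small interface: each is constructed with `(key, output_shape, params)` and, when called with the input batch, returns an `(embedding, mask)` pair. A sketch of a custom encoder for a hypothetical extra feature (the feature name and class below are assumptions, not part of the repo) follows the same pattern:

```
# Hypothetical additional feature encoder; 'agents/velocity' is an assumed
# feature name, not one produced by the released data pipeline.
import gin
import tensorflow as tf


@gin.register
class AgentVelocityEncoder(tf.keras.layers.Layer):
  """Encodes a per-timestep velocity feature."""

  def __init__(self, key, output_shape, params):
    super().__init__()
    self.key = key
    self.mlp = tf.keras.layers.EinsumDense(
        '...f,fh->...h',
        output_shape=output_shape,
        bias_axes='h',
        activation=None)

  def call(self, input_batch):
    # Mask hidden or padded timesteps, mirroring the encoders above.
    not_is_hidden = tf.logical_not(input_batch['is_hidden'])
    mask = tf.logical_and(input_batch[f'has_data/{self.key}'], not_is_hidden)
    return self.mlp(input_batch[self.key])[..., tf.newaxis, :], mask
```

Registering such a class in `ModelParams.agents_feature_config`, keyed by the feature name, is how additional inputs plug into the agent encoder.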
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Fuses detections to tracks. 16 | """ 17 | 18 | 19 | import json 20 | import os 21 | 22 | from absl import app 23 | from absl import flags 24 | 25 | from human_scene_transformer.data import box_utils 26 | from human_scene_transformer.data import utils 27 | import numpy as np 28 | import pandas as pd 29 | import scipy 30 | import tensorflow as tf 31 | import tqdm 32 | 33 | 34 | _INPUT_PATH = flags.DEFINE_string( 35 | 'input_path', 36 | default=None, 37 | help='Path to jrdb2022 dataset.' 38 | ) 39 | 40 | _OUTPUT_PATH = flags.DEFINE_string( 41 | 'output_path', 42 | default=None, 43 | help='Path to output folder.' 44 | ) 45 | 46 | 47 | def get_agents_3d_bounding_box_dict(input_path, scene): 48 | """Returns a dict of agent labels and their bounding boxes.""" 49 | scene_data_file = utils.get_file_handle( 50 | os.path.join(input_path, 'labels', 'labels_3d', scene + '.json') 51 | ) 52 | scene_data = json.load(scene_data_file) 53 | 54 | agents = {} 55 | 56 | for frame in scene_data['labels']: 57 | ts = int(frame.split('.')[0]) 58 | for det in scene_data['labels'][frame]: 59 | agents[(ts, det['label_id'])] = { 60 | 'box': np.array([ 61 | det['box']['cx'], 62 | det['box']['cy'], 63 | det['box']['l'], 64 | det['box']['w'], 65 | det['box']['rot_z']]), 66 | 'box3d': np.array([ 67 | det['box']['cx'], 68 | det['box']['cy'], 69 | det['box']['cz'], 70 | det['box']['l'], 71 | det['box']['w'], 72 | det['box']['h'], 73 | det['box']['rot_z']]) 74 | } 75 | return agents 76 | 77 | 78 | def get_agents_3d_bounding_box_detections_dict(input_path, scene): 79 | """Returns a dict of agent detections and their bounding boxes.""" 80 | scene_data_file = utils.get_file_handle( 81 | os.path.join(input_path, 'detections', 'detections_3d', scene + '.json') 82 | ) 83 | 84 | scene_data = json.load(scene_data_file) 85 | 86 | agents = [] 87 | 88 | for frame in scene_data['detections']: 89 | ts = int(frame.split('.')[0]) 90 | for det in scene_data['detections'][frame]: 91 | agents.append(( 92 | ts, 93 | np.array( 94 | [det['box']['cx'], 95 | det['box']['cy'], 96 | det['box']['l'], 97 | det['box']['w'], 98 | det['box']['rot_z']]), 99 | np.array([ 100 | det['box']['cx'], 101 | det['box']['cy'], 102 | # Detections should be in sensor coordinate 103 | # frame (0.6m from ground). 
104 | det['box']['cz'] - 0.606982, 105 | det['box']['l'], 106 | det['box']['w'], 107 | det['box']['h'], 108 | det['box']['rot_z']]), 109 | det['score'] 110 | )) 111 | return agents 112 | 113 | 114 | def detections_to_dict(df): 115 | """Puts a detections dataframe into expected dict format.""" 116 | labels_dict = {} 117 | for ts, group in df.groupby('timestep'): 118 | agent_list = [] 119 | for (_, agent_id), row in group.iterrows(): 120 | agent_list.append({'label_id': agent_id, 121 | 'box': { 122 | 'cx': row['detection'][0], 123 | 'cy': row['detection'][1], 124 | 'cz': row['detection'][2], 125 | 'l': row['detection'][3], 126 | 'w': row['detection'][4], 127 | 'h': row['detection'][5], 128 | 'rot_z': row['detection'][6], 129 | }, 130 | 'attributes': { 131 | 'distance': np.linalg.norm(row['detection'][:3]) 132 | }}) 133 | labels_dict[f'{ts:06}.pcb'] = agent_list 134 | return {'labels': labels_dict} 135 | 136 | 137 | def jrdb_train_detections_to_tracks(input_path, output_path): 138 | """Fuses detections to tracks.""" 139 | utils.maybe_makedir(output_path) 140 | scenes = utils.list_scenes( 141 | os.path.join(input_path, 'train_dataset')) 142 | for scene in tqdm.tqdm(scenes): 143 | bb_dict = get_agents_3d_bounding_box_dict( 144 | os.path.join(input_path, 'train_dataset'), scene) 145 | bb_3d_df = pd.DataFrame.from_dict( 146 | bb_dict, orient='index').rename_axis(['timestep', 'id']) # pytype: disable=missing-parameter # pandas-drop-duplicates-overloads 147 | 148 | bb_detections_list = get_agents_3d_bounding_box_detections_dict( 149 | os.path.join(input_path, 'train_dataset'), scene) 150 | bb_3d_detections_df = pd.DataFrame( 151 | bb_detections_list, columns=['timestep', 'box', 'box3d', 'score']) 152 | 153 | dfs = [] 154 | for ts, gt_group in bb_3d_df.groupby('timestep'): 155 | detections_group = bb_3d_detections_df.loc[ 156 | (bb_3d_detections_df['timestep'] == ts)] 157 | detection_boxes = np.vstack(detections_group['box']) 158 | gt_boxes = np.vstack(gt_group['box']) 159 | 160 | detection_boxes_rep = np.repeat( 161 | detection_boxes, gt_boxes.shape[0], axis=0) 162 | gt_boxes_til = np.tile(gt_boxes, (detection_boxes.shape[0], 1)) 163 | 164 | iou = box_utils.compute_paired_bev_iou( 165 | tf.convert_to_tensor(detection_boxes_rep), 166 | tf.convert_to_tensor(gt_boxes_til) 167 | ) 168 | assert isinstance(iou, tf.Tensor) 169 | iou_np = iou.numpy().reshape( 170 | (detection_boxes.shape[0], gt_boxes.shape[0])) 171 | cost = 1 - iou_np 172 | r, c = scipy.optimize.linear_sum_assignment(cost.T) 173 | 174 | unmatched_gt = np.argwhere(cost.T[r, c] == 1.)[..., 0] 175 | 176 | df_tmp = gt_group.iloc[r].copy() 177 | df_tmp['detection'] = detections_group['box3d'].iloc[c].to_list() 178 | 179 | df_tmp = df_tmp.drop(index=df_tmp.index[unmatched_gt]) 180 | 181 | dfs.append(df_tmp) 182 | 183 | matched_df = pd.concat(dfs) 184 | 185 | labels_dict = detections_to_dict(matched_df) 186 | 187 | with open(f"{output_path}/{scene}.json", 'w') as write_file: 188 | json.dump(labels_dict, write_file, indent=2, ensure_ascii=True) 189 | 190 | 191 | def main(argv): 192 | if len(argv) > 1: 193 | raise app.UsageError('Too many command-line arguments.') 194 | if _OUTPUT_PATH.value is None: 195 | output_path = os.path.join(_INPUT_PATH.value, 196 | 'processed/labels/labels_detections_3d') 197 | else: 198 | output_path = _OUTPUT_PATH.value 199 | jrdb_train_detections_to_tracks(_INPUT_PATH.value, output_path) 200 | 201 | if __name__ == '__main__': 202 | flags.mark_flags_as_required(['input_path']) 203 | app.run(main) 204 | 
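The core of the track-fusion script above is a bipartite assignment between ground-truth boxes and detections, using `1 - IoU` as the cost and discarding pairs with no overlap. A toy illustration with made-up IoU values:

```
# Toy numbers only; in the script the IoU matrix comes from
# box_utils.compute_paired_bev_iou over real boxes.
import numpy as np
import scipy.optimize

iou = np.array([[0.82, 0.00, 0.05],   # detection 0 vs. ground truth 0..2
                [0.01, 0.74, 0.00],   # detection 1
                [0.00, 0.00, 0.00]])  # detection 2 (false positive)
cost = 1.0 - iou

# Rows of cost.T are ground-truth boxes, columns are detections.
gt_idx, det_idx = scipy.optimize.linear_sum_assignment(cost.T)

matched = [(int(g), int(d))
           for g, d in zip(gt_idx, det_idx) if cost.T[g, d] < 1.0]
print(matched)  # [(0, 0), (1, 1)]; ground truth 2 stays unmatched
```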
-------------------------------------------------------------------------------- /human_scene_transformer/jrdb/eval_keypoints.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Evaluates Model on JRDB dataset.""" 16 | 17 | import os 18 | from typing import Sequence 19 | from absl import app 20 | from absl import flags 21 | from absl import logging 22 | import gin 23 | from human_scene_transformer.jrdb import dataset_params 24 | from human_scene_transformer.jrdb import input_fn 25 | from human_scene_transformer.metrics import metrics 26 | from human_scene_transformer.model import model as hst_model 27 | from human_scene_transformer.model import model_params 28 | import tensorflow as tf 29 | import tqdm 30 | 31 | 32 | _MODEL_PATH = flags.DEFINE_string( 33 | 'model_path', 34 | None, 35 | 'Path to model directory.', 36 | ) 37 | 38 | _CKPT_PATH = flags.DEFINE_string( 39 | 'checkpoint_path', 40 | None, 41 | 'Path to model checkpoint.', 42 | ) 43 | 44 | 45 | def filter_non_moving_agents(input_dict, output_dict): 46 | """Filters non-moving agents.""" 47 | pos = input_dict['agents/position'][..., :2] 48 | dist = tf.linalg.norm( 49 | tf.reduce_min(tf.where(tf.math.is_nan(pos), tf.float32.max, pos), axis=-2) 50 | - tf.reduce_max( 51 | tf.where(tf.math.is_nan(pos), tf.float32.min, pos), axis=-2 52 | ), 53 | axis=-1, 54 | keepdims=True, 55 | )[..., tf.newaxis] 56 | output_dict['should_predict'] = tf.math.logical_and( 57 | output_dict['should_predict'], dist > 0.5 58 | ) 59 | 60 | return output_dict 61 | 62 | 63 | def filter_agents_without_keypoints(input_dict, output_dict): 64 | """Filters agents without keypoints.""" 65 | num_key = tf.reduce_sum( 66 | tf.cast( 67 | tf.logical_not( 68 | tf.reduce_any( 69 | tf.math.is_nan( 70 | input_dict['agents/keypoints'][:, :, :7] 71 | ), axis=-1, keepdims=True) 72 | ), tf.float32), axis=-2, keepdims=True) 73 | 74 | output_dict['should_predict'] = tf.math.logical_and( 75 | output_dict['should_predict'], num_key > 5) 76 | 77 | return output_dict 78 | 79 | 80 | def evaluation(checkpoint_path): 81 | """Evaluates Model on Pedestrian dataset.""" 82 | d_params = dataset_params.JRDBDatasetParams(num_agents=None) 83 | 84 | dataset = input_fn.load_dataset( 85 | d_params, 86 | d_params.eval_scenes, 87 | augment=False, 88 | shuffle=False, 89 | allow_parallel=False, 90 | evaluation=False, 91 | repeat=False, 92 | keep_subsamples=False, 93 | ) 94 | 95 | model_p = model_params.ModelParams() 96 | 97 | model = hst_model.HumanTrajectorySceneTransformer(model_p) 98 | 99 | _, _ = model(next(iter(dataset.batch(1))), training=False) 100 | 101 | checkpoint_mngr = tf.train.Checkpoint(model=model) 102 | checkpoint_mngr.restore(checkpoint_path).assert_existing_objects_matched() 103 | logging.info('Restored checkpoint: %s', checkpoint_path) 104 | 105 | ade_metric = metrics.ade.MinADE(model_p) 106 | 
ade_metric_1s = metrics.ade.MinADE( 107 | model_p, cutoff_seconds=1.0, at_cutoff=True 108 | ) 109 | ade_metric_2s = metrics.ade.MinADE( 110 | model_p, cutoff_seconds=2.0, at_cutoff=True 111 | ) 112 | ade_metric_3s = metrics.ade.MinADE( 113 | model_p, cutoff_seconds=3.0, at_cutoff=True 114 | ) 115 | ade_metric_4s = metrics.ade.MinADE( 116 | model_p, cutoff_seconds=4.0, at_cutoff=True 117 | ) 118 | 119 | mlade_metric = metrics.ade.MLADE(model_p) 120 | mlade_metric_1s = metrics.ade.MLADE( 121 | model_p, cutoff_seconds=1.0, at_cutoff=True 122 | ) 123 | mlade_metric_2s = metrics.ade.MLADE( 124 | model_p, cutoff_seconds=2.0, at_cutoff=True 125 | ) 126 | mlade_metric_3s = metrics.ade.MLADE( 127 | model_p, cutoff_seconds=3.0, at_cutoff=True 128 | ) 129 | mlade_metric_4s = metrics.ade.MLADE( 130 | model_p, cutoff_seconds=4.0, at_cutoff=True 131 | ) 132 | 133 | nll_metric = metrics.pos_nll.PositionNegativeLogLikelihood(model_p) 134 | nll_metric_1s = metrics.pos_nll.PositionNegativeLogLikelihood( 135 | model_p, cutoff_seconds=1.0, at_cutoff=True 136 | ) 137 | nll_metric_2s = metrics.pos_nll.PositionNegativeLogLikelihood( 138 | model_p, cutoff_seconds=2.0, at_cutoff=True 139 | ) 140 | nll_metric_3s = metrics.pos_nll.PositionNegativeLogLikelihood( 141 | model_p, cutoff_seconds=3.0, at_cutoff=True 142 | ) 143 | nll_metric_4s = metrics.pos_nll.PositionNegativeLogLikelihood( 144 | model_p, cutoff_seconds=4.0, at_cutoff=True 145 | ) 146 | 147 | for input_batch in tqdm.tqdm(dataset.batch(1)): 148 | full_pred, output_batch = model(input_batch, training=False) 149 | output_batch = filter_agents_without_keypoints(input_batch, output_batch) 150 | output_batch = filter_non_moving_agents(input_batch, output_batch) 151 | ade_metric.update_state(output_batch, full_pred) 152 | ade_metric_1s.update_state(output_batch, full_pred) 153 | ade_metric_2s.update_state(output_batch, full_pred) 154 | ade_metric_3s.update_state(output_batch, full_pred) 155 | ade_metric_4s.update_state(output_batch, full_pred) 156 | 157 | mlade_metric.update_state(output_batch, full_pred) 158 | mlade_metric_1s.update_state(output_batch, full_pred) 159 | mlade_metric_2s.update_state(output_batch, full_pred) 160 | mlade_metric_3s.update_state(output_batch, full_pred) 161 | mlade_metric_4s.update_state(output_batch, full_pred) 162 | 163 | nll_metric.update_state(output_batch, full_pred) 164 | nll_metric_1s.update_state(output_batch, full_pred) 165 | nll_metric_2s.update_state(output_batch, full_pred) 166 | nll_metric_3s.update_state(output_batch, full_pred) 167 | nll_metric_4s.update_state(output_batch, full_pred) 168 | 169 | print(f'MinADE: {ade_metric.result().numpy():.2f}') 170 | print(f'MinADE @ 1s: {ade_metric_1s.result().numpy():.2f}') 171 | print(f'MinADE @ 2s: {ade_metric_2s.result().numpy():.2f}') 172 | print(f'MinADE @ 3s: {ade_metric_3s.result().numpy():.2f}') 173 | print(f'MinADE @ 4s: {ade_metric_4s.result().numpy():.2f}') 174 | 175 | print(f'MLADE: {mlade_metric.result().numpy():.2f}') 176 | print(f'MLADE @ 1s: {mlade_metric_1s.result().numpy():.2f}') 177 | print(f'MLADE @ 2s: {mlade_metric_2s.result().numpy():.2f}') 178 | print(f'MLADE @ 3s: {mlade_metric_3s.result().numpy():.2f}') 179 | print(f'MLADE @ 4s: {mlade_metric_4s.result().numpy():.2f}') 180 | 181 | print(f'NLL: {nll_metric.result().numpy():.2f}') 182 | print(f'NLL @ 1s: {nll_metric_1s.result().numpy():.2f}') 183 | print(f'NLL @ 2s: {nll_metric_2s.result().numpy():.2f}') 184 | print(f'NLL @ 3s: {nll_metric_3s.result().numpy():.2f}') 185 | print(f'NLL @ 4s: 
{nll_metric_4s.result().numpy():.2f}') 186 | 187 | 188 | def main(argv: Sequence[str]) -> None: 189 | if len(argv) > 1: 190 | raise app.UsageError('Too many command-line arguments.') 191 | 192 | gin.parse_config_files_and_bindings( 193 | [os.path.join(_MODEL_PATH.value, 'params', 'operative_config.gin')], 194 | None, 195 | skip_unknown=True) 196 | print('Actual gin config used:') 197 | print(gin.config_str()) 198 | 199 | evaluation(_CKPT_PATH.value) 200 | 201 | if __name__ == '__main__': 202 | logging.set_verbosity(logging.ERROR) 203 | app.run(main) 204 | -------------------------------------------------------------------------------- /human_scene_transformer/model/agent_encoder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains agent encoder layers.""" 16 | 17 | from typing import Dict, Optional 18 | 19 | from human_scene_transformer.model.agent_feature_encoder import AgentTemporalEncoder 20 | from human_scene_transformer.model.model_params import ModelParams 21 | 22 | import tensorflow as tf 23 | 24 | 25 | class FeatureConcatAgentEncoderLayer(tf.keras.layers.Layer): 26 | """Independently encodes features and attends to them. 27 | 28 | Agent features are cross-attended with a learned query or hidden_vecs instead 29 | of MLP. 30 | """ 31 | 32 | def __init__(self, 33 | params: ModelParams): 34 | super().__init__() 35 | 36 | # Cross Attention and learned query. 
37 | self.ff_layer2 = tf.keras.layers.EinsumDense( 38 | '...f,fh->...h', 39 | output_shape=params.hidden_size, 40 | bias_axes='h', 41 | activation=None, 42 | ) 43 | self.ff_dropout = tf.keras.layers.Dropout(params.drop_prob) 44 | 45 | self.agent_feature_embedding_layers = [] 46 | # Position Feature 47 | self.agent_feature_embedding_layers.append( 48 | params.agents_feature_config[params.agents_position_key]( 49 | params.agents_position_key, params.hidden_size - 8, params 50 | ) 51 | ) 52 | # Feature Embedding 53 | for key, layer in params.agents_feature_config.items(): 54 | if key == params.agents_position_key: 55 | continue 56 | self.agent_feature_embedding_layers.append( 57 | layer(key, params.hidden_size - 8, params) 58 | ) 59 | 60 | # Temporal Embedding 61 | self.agent_feature_embedding_layers.append( 62 | AgentTemporalEncoder( 63 | list(params.agents_feature_config.keys())[0], 64 | params.hidden_size - 8, 65 | params, 66 | ) 67 | ) 68 | 69 | def call( 70 | self, input_batch: Dict[str, tf.Tensor], training: Optional[bool] = None 71 | ): 72 | input_batch = input_batch.copy() 73 | 74 | layer_embeddings = [] 75 | for layer in self.agent_feature_embedding_layers: 76 | layer_embedding, _ = layer(input_batch, training=training) 77 | layer_embedding = tf.reshape( 78 | layer_embedding, 79 | layer_embedding.shape[:-2] 80 | + [layer_embedding.shape[-2] * layer_embedding.shape[-1]], 81 | ) 82 | layer_embeddings.append(layer_embedding) 83 | embedding = tf.concat(layer_embeddings, axis=-1) 84 | 85 | out = self.ff_layer2(embedding) 86 | 87 | input_batch['hidden_vecs'] = out 88 | input_batch['hidden_vecs_fe'] = out 89 | return input_batch 90 | 91 | 92 | class FeatureAttnAgentEncoderLearnedLayer(tf.keras.layers.Layer): 93 | """Independently encodes features and attends to them. 94 | 95 | Agent features are cross-attended with a learned query or hidden_vecs instead 96 | of MLP. 97 | """ 98 | 99 | def __init__(self, 100 | params: ModelParams): 101 | super(FeatureAttnAgentEncoderLearnedLayer, self).__init__() 102 | 103 | # Cross Attention and learned query. 
104 | self.attn_layer = tf.keras.layers.MultiHeadAttention( 105 | num_heads=params.num_heads, 106 | key_dim=params.hidden_size, # "large" to prevent a bottleneck 107 | attention_axes=3) 108 | self.attn_ln = tf.keras.layers.LayerNormalization(epsilon=params.ln_eps) 109 | self.ff_layer1 = tf.keras.layers.EinsumDense( 110 | '...h,hf->...f', 111 | output_shape=params.transformer_ff_dim, 112 | bias_axes='f', 113 | activation='relu') 114 | self.ff_layer2 = tf.keras.layers.EinsumDense( 115 | '...f,fh->...h', 116 | output_shape=params.hidden_size, 117 | bias_axes='h', 118 | activation=None) 119 | self.ff_dropout = tf.keras.layers.Dropout(params.drop_prob) 120 | self.ff_ln = tf.keras.layers.LayerNormalization(epsilon=params.ln_eps) 121 | 122 | self.agent_feature_embedding_layers = [] 123 | # Position Feature 124 | self.agent_feature_embedding_layers.append( 125 | params.agents_feature_config[params.agents_position_key]( 126 | params.agents_position_key, params.hidden_size-8, params)) 127 | # Feature Embedding 128 | for key, layer in params.agents_feature_config.items(): 129 | if key == params.agents_position_key: 130 | continue 131 | self.agent_feature_embedding_layers.append( 132 | layer(key, params.hidden_size-8, params)) 133 | 134 | # Temporal Embedding 135 | self.agent_feature_embedding_layers.append( 136 | AgentTemporalEncoder(list(params.agents_feature_config.keys())[0], 137 | params.hidden_size-8, params)) 138 | 139 | # [1, 1, 1, 1, h] 140 | self.learned_query_vec = tf.Variable( 141 | tf.random_uniform_initializer( 142 | minval=-1., maxval=1.)(shape=[1, 1, 1, 1, params.hidden_size]), 143 | trainable=True, 144 | dtype=tf.float32) 145 | 146 | def _build_learned_query(self, input_batch): 147 | """Converts self.learned_query_vec into a learned query vector.""" 148 | # This weird thing is for exporting and loading keras model... 
149 | b = tf.shape(input_batch['agents/position'])[0] 150 | num_agents = tf.shape(input_batch['agents/position'])[1] 151 | num_steps = tf.shape(input_batch['agents/position'])[2] 152 | 153 | # [b, num_agents, num_steps, 1, h] 154 | return tf.tile(self.learned_query_vec, [b, num_agents, num_steps, 1, 1]) 155 | 156 | def call(self, input_batch: Dict[str, tf.Tensor], 157 | training: Optional[bool] = None): 158 | input_batch = input_batch.copy() 159 | 160 | layer_embeddings = [] 161 | layer_masks = [] 162 | for layer in self.agent_feature_embedding_layers: 163 | layer_embedding, layer_mask = layer(input_batch, training=training) 164 | layer_embeddings.append(layer_embedding) 165 | layer_masks.append(layer_mask) 166 | embedding = tf.concat(layer_embeddings, axis=3) 167 | 168 | b = tf.shape(embedding)[0] 169 | a = tf.shape(embedding)[1] 170 | t = tf.shape(embedding)[2] 171 | n = tf.shape(embedding)[3] 172 | 173 | # [1, 1, 1, N, 8] 174 | one_hot = tf.one_hot(tf.range(0, n), 8)[None, None, None] 175 | # [b, a, t, N, 8] 176 | one_hot_id = tf.tile(one_hot, (b, a, t, 1, 1)) 177 | 178 | embedding = tf.concat([embedding, one_hot_id], axis=-1) 179 | 180 | attention_mask = tf.concat(layer_masks, axis=-1) 181 | 182 | # [b, a, t, num_heads, 1, num_features] <- broadcasted 183 | # Newaxis for num_heads, num_features 184 | attention_mask = attention_mask[..., tf.newaxis, tf.newaxis, :] 185 | 186 | learned_query = self._build_learned_query(input_batch) 187 | 188 | # Attention along axis 3 189 | attn_out, attn_score = self.attn_layer( 190 | query=learned_query, 191 | key=embedding, 192 | value=embedding, 193 | attention_mask=attention_mask, 194 | return_attention_scores=True) 195 | # [b, a, t, h] 196 | attn_out = attn_out[..., 0, :] 197 | out = self.ff_layer1(attn_out) 198 | out = self.ff_layer2(out) 199 | out = self.ff_dropout(out, training=training) 200 | out = self.ff_ln(out + attn_out) 201 | 202 | input_batch['hidden_vecs'] = out 203 | input_batch[f'attn_scores/{self.name}'] = attn_score 204 | return input_batch 205 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | :trophy: Winner of the [2023 JRDB Trajectory Prediction Challenge](https://jrdb.erc.monash.edu/leaderboards/trajectory) - [Reproduce](#jrdb-trajectory-prediction-challenge-results) our Result! 2 | 3 | # Human Scene Transformer 4 | 5 | The (Human) Scene Transformer architecture (as described [here](https://arxiv.org/pdf/2309.17209.pdf) and [here)](https://arxiv.org/pdf/2106.08417.pdf) is a general and extendable trajectory prediction framework which threats trajectory prediction as a sequence to sequence problem and models it in a Transformer architecture. 6 | 7 | It is straightforward to extend with 8 | 9 | - additional input features 10 | - custom environment encoder 11 | - different loss functions 12 | - ... 13 | 14 | *This is not an officially supported Google product.* 15 | 16 | --- 17 | 18 | ![Human Scene Transformer](./human_scene_transformer/images/hero.png) 19 | 20 | Anticipating the motion of all humans in dynamic environments such as homes and offices is critical to enable safe and effective robot navigation. Such spaces remain challenging as humans do not follow strict rules of motion and there are often multiple occluded entry points such as corners and doors that create opportunities for sudden encounters. 
In this work, we present a Transformer based architecture to predict human future trajectories in human-centric environments from input features including human positions, head orientations, and 3D skeletal keypoints from onboard in-the-wild sensory information. The resulting model captures the inherent uncertainty for future human trajectory prediction and achieves state-of-the-art performance on common prediction benchmarks and a human tracking dataset captured from a mobile robot adapted for the prediction task. Furthermore, we identify new agents with limited historical data as a major contributor to error and demonstrate the complementary nature of 3D skeletal poses in reducing prediction error in such challenging scenarios. 21 | 22 | If you use this work, please cite our paper: 23 | 24 | ``` 25 | @article{salzmann2023hst, 26 | title={Robots That Can See: Leveraging Human Pose for Trajectory Prediction}, 27 | author={Salzmann, Tim and Chiang, Lewis and Ryll, Markus and Sadigh, Dorsa and Parada, Carolina and Bewley, Alex}, 28 | journal={IEEE Robotics and Automation Letters}, 30 | year={2023}, volume={8}, number={11}, pages={7090-7097}, 31 | doi={10.1109/LRA.2023.3312035} 32 | } 33 | ``` 34 | 35 | --- 36 | 37 | ## Prerequisites 38 | 39 | Install requirements via `pip install -r requirements.txt`. 40 | 41 | Please note that this codebase is not compatible with the Intel MKL backend for 42 | tensorflow. The MKL backend supports tensors with up to 5 dimensions, which is 43 | not sufficient for parts of this codebase. Should you have an MKL-backed 44 | tensorflow installation or run into MKL related 45 | [errors](https://github.com/google-research/human-scene-transformer/issues/11), 46 | please disable the tensorflow MKL backend by setting the environment variables 47 | `TF_ENABLE_ONEDNN_OPTS=0` and `TF_DISABLE_MKL=1`. 48 | 49 | ## Data 50 | 51 | ### JRDB 52 | 53 | We provide an extensive pre-processing pipeline to convert the JRDB dataset. 54 | JRDB was created as a detection and tracking dataset rather than a prediction 55 | dataset. To make the data suitable for a prediction task, we first extract the 56 | robot motion from the raw sensor data so that the robot's own motion can be accounted for. 57 | Further, on the JRDB training split we combine algorithmic detections with the 58 | ground truth labels from the tracking dataset to create authentic tracks as 59 | input and labels for HST. 60 | Note that we do not purely use the ground truth hand-labeled tracks in the JRDB 61 | train dataset, as we find them to be overly smoothed, giving away the future human 62 | movement. 63 | To adapt the JRDB dataset for prediction, please follow [this](/human_scene_transformer/data) README. 64 | 65 | Make sure to adapt `<data_path>` in `config/<dataset>/dataset_params.gin` accordingly. 66 | 67 | If you want to use the JRDB dataset for trajectory prediction in PyTorch, we 68 | provide a [PyTorch Dataset wrapper](/human_scene_transformer/jrdb/torch_dataset.py) for the processed dataset. 69 | 70 | ### Pedestrians ETH/UCY 71 | Please download the raw data [here](https://github.com/StanfordASL/Trajectron-plus-plus/tree/master/experiments/pedestrians/raw).
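As a concrete example for the MKL note in the Prerequisites above, the two environment variables can be set before TensorFlow is imported, for instance at the very top of a launcher script; exporting them in the shell before calling `train.py` works equally well.

```
# Must run before the first TensorFlow import anywhere in the process.
import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_DISABLE_MKL'] = '1'

import tensorflow as tf  # imported after the variables are set on purpose
print(tf.__version__)
```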
72 | 73 | ## Training 74 | 75 | ### JRDB 76 | ``` 77 | python train.py --model_base_dir=./model/jrdb --gin_files=./config/jrdb/training_params.gin --gin_files=./config/jrdb/model_params.gin --gin_files=./config/jrdb/dataset_params.gin --gin_files=./config/jrdb/metrics.gin --dataset=JRDB 78 | ``` 79 | 80 | ### Pedestrians ETH/UCY 81 | ``` 82 | python train.py --model_base_dir=./models/pedestrians_eth --gin_files=./config/pedestrians/training_params.gin --gin_files=./config/pedestrians/model_params.gin --gin_files=./config/pedestrians/dataset_params.gin --gin_files=./config/pedestrians/metrics.gin --dataset=PEDESTRIANS 83 | ``` 84 | 85 | --- 86 | 87 | ## JRDB Trajectory Prediction Challenge Results 88 | To reproduce our winning results in the [2023 JRDB Trajectory Prediction Challenge](https://jrdb.erc.monash.edu/leaderboards/trajectory): 89 | 90 | - Make sure that you follow the [data pre-processing instructions](/human_scene_transformer/data) and pay special attention to where the instructions differentiate between the JRDB Challenge dataset and the original paper dataset. 91 | 92 | - Download the trained challenge model [here](https://storage.googleapis.com/gresearch/human_scene_transformer/jrdb_challenge_checkpoint.zip) 93 | 94 | - Run 95 | 96 | ``` 97 | python jrdb/eval_challenge.py --model_path=<model_path> --checkpoint_path=<model_path>/ckpts/ckpt-20 --dataset_path=<dataset_path> --output_path=<output_path> 98 | ``` 99 | 100 | --- 101 | 102 | ## Evaluation 103 | 104 | ### JRDB 105 | ``` 106 | python jrdb/eval.py --model_path=./models/jrdb/ --checkpoint_path=./models/jrdb/ckpts/ckpt-30 107 | ``` 108 | 109 | #### Keypoints Impact Evaluation 110 | ``` 111 | python jrdb/eval_keypoints.py --model_path=./models/jrdb/ --checkpoint_path=./models/jrdb/ckpts/ckpt-30 112 | ``` 113 | 114 | vs 115 | 116 | ``` 117 | python jrdb/eval_keypoints.py --model_path=./models/jrdb_no_keypoints/ --checkpoint_path=./models/jrdb_no_keypoints/ckpts/ckpt-30 118 | ``` 119 | 120 | ### Pedestrians ETH/UCY 121 | ``` 122 | python pedestrians/eval.py --model_path=./models/pedestrians_eth/ --checkpoint_path=./models/pedestrians_eth/ckpts/ckpt-20 123 | ``` 124 | 125 | --- 126 | 127 | ## Results 128 | 129 | Compared to the published paper, we improved our data processing and fixed small 130 | bugs in this code release. If you compare against our method, please use the 131 | following updated results. 132 | 133 | On the JRDB dataset with dataset options as set [here](/human_scene_transformer/config/jrdb/dataset_params.py): 134 | 135 | | | AVG | @ 1s | @ 2s | @ 3s | @ 4s | 136 | |--------|-------|-------|-------|-------|-------| 137 | | MinADE | 0.26 | 0.12 | 0.20 | 0.28 | 0.37 | 138 | | MinFDE | 0.45 | 0.21 | 0.39 | 0.56 | 0.71 | 139 | | NLL | -0.59 | -0.90 | -0.65 | -0.08 | 0.32 | 140 | 141 | On the ETH/UCY Pedestrians Dataset: 142 | 143 | | | ETH | Hotel | Univ | Zara1 | Zara2 | Avg | 144 | |--------|------|-------|------|-------|-------|-------| 145 | | MinADE | 0.41 | 0.10 | 0.24 | 0.17 | 0.14 | 0.21 | 146 | | MinFDE | 0.73 | 0.14 | 0.44 | 0.30 | 0.24 | 0.37 | 147 | 148 | ### JRDB Train / Test Split 149 | The train / test split is implemented [here](/human_scene_transformer/config/jrdb/dataset_params.py). 150 | 151 | ### Checkpoints 152 | You can download trained model checkpoints for both the `JRDB` and `Pedestrians (ETH/UCY)` datasets [here](https://storage.googleapis.com/gresearch/human_scene_transformer/checkpoints.zip). 153 | 154 | To evaluate the pre-trained checkpoints, you will have to adjust the path to the dataset in the respective `params/operative_config.gin` file.
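For programmatic use of a downloaded checkpoint outside the evaluation scripts, a condensed sketch of the restore sequence, with the calls copied from `jrdb/eval.py` and the paths as placeholders:

```
# Condensed from jrdb/eval.py; model_dir is a placeholder for an unpacked
# checkpoint directory such as ./models/jrdb.
import os
import gin
import tensorflow as tf
from human_scene_transformer.jrdb import dataset_params, input_fn
from human_scene_transformer.model import model as hst_model
from human_scene_transformer.model import model_params

model_dir = './models/jrdb'
gin.parse_config_files_and_bindings(
    [os.path.join(model_dir, 'params', 'operative_config.gin')],
    None, skip_unknown=True)

d_params = dataset_params.JRDBDatasetParams(num_agents=None)
dataset = input_fn.load_dataset(
    d_params, d_params.eval_scenes, augment=False, shuffle=False,
    allow_parallel=False, evaluation=False, repeat=False,
    keep_subsamples=False)

model = hst_model.HumanTrajectorySceneTransformer(model_params.ModelParams())
_ = model(next(iter(dataset.batch(1))), training=False)  # build the variables

ckpt = tf.train.Checkpoint(model=model)
ckpt.restore(
    os.path.join(model_dir, 'ckpts', 'ckpt-30')
).assert_existing_objects_matched()

full_pred, output_batch = model(next(iter(dataset.batch(1))), training=False)
```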
155 | 156 | ## Runtime 157 | Evaluation of forward inference runtime with single output mode: 158 | 159 | | #Humans | M1 - CPU | A100 - GPU | 160 | |---------|----------|------------| 161 | | 1 | 40Hz | 12Hz | 162 | | 10 | 30Hz | 11Hz | 163 | | 20 | 23Hz | 11Hz | 164 | | 50 | 12Hz | 11Hz | 165 | | 100 | 5Hz | 11Hz | 166 | | 150 | | 11Hz | -------------------------------------------------------------------------------- /human_scene_transformer/model/model.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Contains Human Scene Transformer keras model.""" 16 | 17 | from typing import Any, Dict, Optional, Tuple 18 | 19 | from human_scene_transformer.model import agent_encoder 20 | from human_scene_transformer.model import agent_self_alignment 21 | from human_scene_transformer.model import attention 22 | from human_scene_transformer.model import multimodality_induction 23 | from human_scene_transformer.model import preprocess 24 | from human_scene_transformer.model.model_params import ModelParams 25 | 26 | import tensorflow as tf 27 | 28 | 29 | class HumanTrajectorySceneTransformer(tf.keras.Model): 30 | """A variant of Scene Transformer adapted for human trajectory prediction. 31 | 32 | This class is a Keras model that is designed to predict the trajectory of 33 | bounding boxes of nearby humans (pedestrians). It is designed to be used on 34 | the proxy robots. 35 | The input of the model is an input_batch dict that maps input features to 36 | tensors. The exact features depends on the dataset (currently the EDR and 37 | Jackrobot dataset), but the input and output layers of the model will convert 38 | them to a dict with common feature names before passing to the transformer 39 | network. 40 | 41 | The model is a transformer that encodes each agent-timestep into a fixed size 42 | vector before doing self-attention and cross-attention w/ scene features such 43 | as the occupancy grid. 44 | The model outputs each agents future trajectories in the form of Gaussian 45 | distributions over the agents positions and von-Mises distributions over their 46 | corresponding orientations at each timestep. Furthermore, these distributions 47 | are multimodal similar to a Mixture Distributions but are shared for each 48 | agent with scene level mixture probabilities. 49 | 50 | """ 51 | 52 | def __init__(self, 53 | params: ModelParams, 54 | input_layer: Optional[tf.keras.layers.Layer] = None, 55 | output_layer: Optional[tf.keras.layers.Layer] = None): 56 | """Constructs the model. 57 | 58 | Args: 59 | params: A model parameters object containing the model configuration. 60 | input_layer: A Keras Layer object that maps various features names in the 61 | dataset to a set of common names. 
E.g., tracks_tensors/xyz -> 62 | agent/positions 63 | output_layer: A Keras Layer object that maps various features predicted by 64 | the model to a set of names used in the dataset E.g., xyz -> 65 | agent/positions 66 | """ 67 | 68 | super().__init__() 69 | self.input_layer = input_layer 70 | self.output_layer = output_layer 71 | 72 | # Preprocess layer. 73 | self.preprocess_layer = preprocess.PreprocessLayer(params) 74 | 75 | self.agent_encoding_layer = ( 76 | agent_encoder.FeatureAttnAgentEncoderLearnedLayer(params) 77 | ) 78 | 79 | if params.scene_encoder is not None: 80 | self.scene_encoder = params.scene_encoder(params) 81 | else: 82 | self.scene_encoder = None 83 | 84 | # Transformer layers. 85 | self.transformer_layers = [] 86 | # Agent self alignment layer 87 | layer = agent_self_alignment.AgentSelfAlignmentLayer( 88 | hidden_size=params.hidden_size, 89 | ff_dim=params.transformer_ff_dim, 90 | num_heads=params.num_heads, 91 | mask_style='has_data') 92 | self.transformer_layers.append(layer) 93 | 94 | multimodality_induced = False 95 | for arch in params.attn_architecture: 96 | if arch == 'self-attention': 97 | layer = attention.SelfAttnTransformerLayer( 98 | hidden_size=params.hidden_size, 99 | ff_dim=params.transformer_ff_dim, 100 | num_heads=params.num_heads, 101 | drop_prob=params.drop_prob, 102 | mask_style=params.mask_style, 103 | flatten=True, 104 | multimodality_induced=multimodality_induced) 105 | elif arch == 'self-attention-mode': 106 | layer = attention.SelfAttnModeTransformerLayer( 107 | hidden_size=params.hidden_size, 108 | ff_dim=params.transformer_ff_dim, 109 | num_heads=params.num_heads, 110 | drop_prob=params.drop_prob) 111 | elif arch == 'cross-attention': 112 | if not multimodality_induced: 113 | layer = attention.SceneNonMultimodalCrossAttnTransformerLayer( 114 | hidden_size=params.hidden_size, 115 | ff_dim=params.transformer_ff_dim, 116 | num_heads=params.num_heads, 117 | drop_prob=params.drop_prob) 118 | else: 119 | layer = attention.SceneMultimodalCrossAttnTransformerLayer( 120 | hidden_size=params.hidden_size, 121 | ff_dim=params.transformer_ff_dim, 122 | num_heads=params.num_heads, 123 | drop_prob=params.drop_prob) 124 | elif arch == 'multimodality_induction': 125 | multimodality_induced = True 126 | layer = multimodality_induction.MultimodalityInduction( 127 | num_modes=params.num_modes, 128 | hidden_size=params.hidden_size, 129 | ff_dim=params.transformer_ff_dim, 130 | num_heads=params.num_heads, 131 | drop_prob=params.drop_prob) 132 | else: 133 | raise ValueError(f'Unknown attn architecture: {arch}. ' + 134 | 'Must be either self-attention or cross-attention.') 135 | self.transformer_layers.append(layer) 136 | # Prediction head. 137 | self.prediction_layer = params.prediction_head( 138 | hidden_units=params.prediction_head_hidden_units) 139 | 140 | def call( 141 | self, 142 | input_batch: Dict[str, tf.Tensor], 143 | is_hidden: Optional[Any] = None, 144 | training: bool = False) -> Tuple[Dict[str, tf.Tensor], Dict[str, Any]]: 145 | """Override the standard call() function to provide more flexibility. 146 | 147 | Args: 148 | input_batch: A dictionary that maps a str to a tensor. The tensor's first 149 | dimensions is always the batch dimension. These tensors include all 150 | timesteps (history, current and future) and all agents (observed and 151 | padded). 152 | is_hidden: An optional bool np array or tf.tensor of the shape: [batch, 153 | max_num_agents, num_time_steps, 1]. 
This tensor instructs the model on 154 | which agent-timestep needs to be predicted (if set to True). Note that 155 | you do not have to worry about padded agent or timesteps being 156 | predicted. The preprocessing layer will handle that for you. If you do 157 | not put in anything, a default behavior predction tensor will be used, 158 | which looks like: False for all agents in the history or current 159 | timesteps and True for all future timesteps. 160 | training: An optional bool that instructs the model if the call is used 161 | during training. 162 | 163 | Returns: 164 | output: A dict containing the model prediction. Note that the predicted 165 | tensors has the same shape as the input_batch so the history and 166 | current steps are included. 167 | input_batch: The input batch modified to include features generated by 168 | the preprocess layer, e.g., has_data. 169 | """ 170 | 171 | if self.input_layer is not None: 172 | input_batch = self.input_layer(input_batch, training=training) 173 | 174 | # Preprocess input_batch. 175 | input_batch = self.preprocess_layer(input_batch, is_hidden=is_hidden) 176 | 177 | # Feed the input_batch through the network. 178 | input_batch = self.agent_encoding_layer(input_batch, training=training) 179 | if self.scene_encoder is not None: 180 | input_batch = self.scene_encoder(input_batch, training=training) 181 | for layer in self.transformer_layers: 182 | input_batch = layer(input_batch, training=training) 183 | predictions = self.prediction_layer(input_batch, training=training) 184 | if self.output_layer is not None: 185 | output = self.output_layer(predictions, training=training) 186 | else: 187 | output = predictions 188 | return output, input_batch 189 | -------------------------------------------------------------------------------- /human_scene_transformer/data/math/data_types.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """A hidden layer to define all of the type definitions.""" 16 | 17 | from typing import Any, Optional, Sequence 18 | 19 | from human_scene_transformer.data.math import math_types 20 | from human_scene_transformer.data.math import pose3 21 | from human_scene_transformer.data.math import quaternion 22 | from human_scene_transformer.data.math import rotation3 23 | from human_scene_transformer.data.math import vector_util 24 | 25 | import numpy as np 26 | 27 | 28 | VECTOR_TYPE = math_types.VectorType 29 | Rotation3 = rotation3.Rotation3 30 | Pose3 = pose3.Pose3 31 | Quaternion = quaternion.Quaternion 32 | 33 | 34 | class Twist: 35 | """A class which represents a twist with linear and angular components.""" 36 | 37 | def __init__(self, 38 | linear: Optional[VECTOR_TYPE] = None, 39 | angular: Optional[VECTOR_TYPE] = None): 40 | """Constructs a new twist. 41 | 42 | Initializes a new twist object. 
43 | 44 | If no parameters are specified the default will be a zero twist. 45 | 46 | Args: 47 | linear: Linear component of twist, zero by default. 48 | angular: Angular component of twist, zero by default. 49 | """ 50 | if linear is None: 51 | self.linear = np.zeros(3) 52 | else: 53 | self.linear = vector_util.as_vector3( 54 | linear, dtype=np.float64, err_msg='Linear vector in Twist').copy() 55 | if angular is None: 56 | self.angular = np.zeros(3) 57 | else: 58 | self.angular = vector_util.as_vector3( 59 | angular, dtype=np.float64, err_msg='Angular vector in Twist').copy() 60 | 61 | @property 62 | def vec(self) -> np.ndarray: 63 | """Returns six-value representation: [linear, angular].""" 64 | return np.hstack((self.linear, self.angular)) 65 | 66 | def __repr__(self) -> str: 67 | """Generates a string that describes the twist. 68 | 69 | Returns: 70 | A string describing the twist. 71 | """ 72 | return 'Twist({}, {})'.format(self.linear, self.angular) 73 | 74 | 75 | class Wrench: 76 | """A class which represents a wrench with force and torque components.""" 77 | 78 | def __init__(self, 79 | force: Optional[VECTOR_TYPE] = None, 80 | torque: Optional[VECTOR_TYPE] = None): 81 | """Constructs a new wrench. 82 | 83 | Initializes a new wrench object. 84 | 85 | If no parameters are specified the default will be a zero wrench. 86 | 87 | Args: 88 | force: Force, zero by default. 89 | torque: Torque, zero by default. 90 | """ 91 | if force is None: 92 | self.force = np.zeros(3) 93 | else: 94 | self.force = vector_util.as_vector3( 95 | force, dtype=np.float64, err_msg='Force vector in Wrench').copy() 96 | if torque is None: 97 | self.torque = np.zeros(3) 98 | else: 99 | self.torque = vector_util.as_vector3( 100 | torque, dtype=np.float64, err_msg='Torque vector in Wrench').copy() 101 | 102 | @property 103 | def vec(self) -> np.ndarray: 104 | """Returns six-value representation: [force, torque].""" 105 | return np.hstack((self.force, self.torque)) 106 | 107 | def __repr__(self) -> str: 108 | """Generates a string that describes the wrench. 109 | 110 | Returns: 111 | A string describing the wrench. 112 | """ 113 | return 'Wrench({}, {})'.format(self.force, self.torque) 114 | 115 | 116 | class Stiffness: 117 | """A class which represents stiffness.""" 118 | 119 | def __init__(self, 120 | linear: Optional[VECTOR_TYPE] = None, 121 | torsional: Optional[VECTOR_TYPE] = None, 122 | matrix6x6: Optional[np.ndarray] = None): 123 | """Constructs a new stiffness. 124 | 125 | Initializes a new stiffness object. 126 | 127 | If no parameters are specified the default will be a zero stiffness. 128 | 129 | Args: 130 | linear: Linear component of stiffness, zero by default. 131 | torsional: Torsional component of stiffness, zero by default. 132 | matrix6x6: A full 6x6 stiffness matrix. When 'matrix6x6' is provided, 133 | 'linear' and 'torsional' are ignored and the linear and torsional 134 | components of Stiffness will be set as the diagonal elements of 135 | 'matrix6x6'. 136 | 137 | Raises: 138 | ValueError if the shape of 'matrix6x6' is not 6x6. 139 | """ 140 | if matrix6x6 is not None: 141 | if not np.array_equal(matrix6x6.shape, [6, 6]): 142 | raise ValueError('Invalid matrix6x6 shape: {0}'.format(matrix6x6.shape)) 143 | self.matrix6x6 = np.copy(matrix6x6).astype(np.float64) 144 | diag = self.matrix6x6.diagonal() 145 | self.linear = diag[:3] 146 | self.torsional = diag[3:] 147 | else: 148 | # Sets linear stiffness. 
149 | if linear is None: 150 | self.linear = np.zeros(3) 151 | else: 152 | self.linear = vector_util.as_vector3( 153 | linear, dtype=np.float64, 154 | err_msg='Linear vector in Stiffness').copy() 155 | # Sets torsional stiffness. 156 | if torsional is None: 157 | self.torsional = np.zeros(3) 158 | else: 159 | self.torsional = vector_util.as_vector3( 160 | torsional, dtype=np.float64, 161 | err_msg='Angular vector in Stiffness').copy() 162 | # Sets the 6x6 stiffness matrix. 163 | self.matrix6x6 = np.diag(self.vec) 164 | 165 | @classmethod 166 | def from_scalar(cls, scalar: float) -> 'Stiffness': 167 | return cls(linear=[scalar] * 3, torsional=[scalar] * 3) 168 | 169 | @property 170 | def vec(self) -> np.ndarray: 171 | """Returns the diagonal elements: [linear, torsional].""" 172 | return np.hstack((self.linear, self.torsional)) 173 | 174 | def __repr__(self) -> str: 175 | """Generates a string that describes the stiffness. 176 | 177 | Returns: 178 | A string describing the stiffness. 179 | """ 180 | return 'Stiffness({})'.format(self.matrix6x6) 181 | 182 | 183 | # Convenience functions. 184 | def vec6_to_pose3(vec6: VECTOR_TYPE) -> Pose3: 185 | """Creates a Pose3 object from a given 6-D vector. 186 | 187 | Args: 188 | vec6: A 6-D vector, [x, y, z, rx, ry, rz], that denotes translation [x, y, 189 | z] and orientation [rx, ry, rz] as Euler angles roll, pitch, yaw, in 190 | radians. 191 | 192 | Returns: 193 | A Pose3 object. 194 | """ 195 | assert len(vec6) == 6 196 | return Pose3( 197 | translation=vec6[:3], 198 | rotation=Rotation3.from_euler_angles(rpy_radians=vec6[3:])) 199 | 200 | 201 | def pose3_to_vec6(pose: Pose3) -> np.ndarray: 202 | """Returns a 6-D vector representation of the pose. 203 | 204 | Args: 205 | pose: A Pose3 object. 206 | 207 | Returns: 208 | A 6-D vector, [x, y, z, rx, ry, rz], that denotes translation 209 | [x, y, z] and orientation [rx, ry, rz]. 210 | """ 211 | return np.hstack((pose.translation, pose.rotation.euler_angles())) 212 | 213 | 214 | def pose3_to_vec6_in_axis_angles(pose: Pose3) -> np.ndarray: 215 | """Returns a 6-D vector representation of the pose. 216 | 217 | Args: 218 | pose: A Pose3 object. 219 | 220 | Returns: 221 | A 6-D vector, [x, y, z, rx, ry, rz], that denotes translation 222 | [x, y, z] and orientation [rx, ry, rz] as axis angles. 223 | """ 224 | return np.hstack( 225 | (pose.translation, pose.rotation.axis() * pose.rotation.angle())) 226 | 227 | 228 | def vec6_to_twist(vec6: VECTOR_TYPE) -> Twist: 229 | """Creates a Twist object from a given 6-D vector. 230 | 231 | Args: 232 | vec6: A 6-D vector, [vx, vy, vz, wx, wy, wz], that denotes linear velocity 233 | along X, Y, Z and angular velocity around X, Y, Z. 234 | 235 | Returns: 236 | A Twist object. 237 | """ 238 | assert len(vec6) == 6 239 | return Twist(linear=vec6[:3], angular=vec6[3:]) 240 | 241 | 242 | def vec6_to_wrench(vec6: VECTOR_TYPE) -> Wrench: 243 | """Creates a Wrench object from a given 6-D vector. 244 | 245 | Args: 246 | vec6: A 6-D vector, [fx, fy, fz, tx, ty, tz], that denotes force and torque. 247 | 248 | Returns: 249 | A Wrench object. 250 | """ 251 | assert len(vec6) == 6 252 | return Wrench(force=vec6[:3], torque=vec6[3:]) 253 | 254 | 255 | def vec6_to_stiffness(vec6: VECTOR_TYPE) -> Stiffness: 256 | """Creates a Stiffness object from a given 6-D vector. 257 | 258 | Args: 259 | vec6: A 6-D vector, [x, y, z, rx, ry, rz], that denotes translational and 260 | rotational stiffness for X, Y, Z. 261 | 262 | Returns: 263 | A Stiffness object. 
264 | """ 265 | assert len(vec6) == 6 266 | return Stiffness(linear=vec6[:3], torsional=vec6[3:]) 267 | 268 | 269 | def to_list(data: Any) -> Sequence[float]: 270 | """Flattens a common data type to a list of float. 271 | 272 | Args: 273 | data: The input data to flatten. 274 | 275 | Returns: 276 | The flattend list, i.e., the vectorized parametrization of the input data. 277 | """ 278 | if isinstance(data, float): 279 | return [data] 280 | elif isinstance(data, np.ndarray): 281 | return data.tolist() 282 | elif isinstance(data, Pose3): 283 | return data.vec7.tolist() 284 | elif isinstance(data, Twist): 285 | return data.vec.tolist() 286 | elif isinstance(data, Wrench): 287 | return data.vec.tolist() 288 | else: 289 | raise ValueError('{} is not supported in to_list()'.format(type(data))) 290 | -------------------------------------------------------------------------------- /human_scene_transformer/data/jrdb_preprocess_test.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Preprocesses the raw test split of JRDB. 16 | """ 17 | 18 | import os 19 | 20 | from absl import app 21 | from absl import flags 22 | 23 | from human_scene_transformer.data import utils 24 | import numpy as np 25 | import pandas as pd 26 | import tensorflow as tf 27 | import tqdm 28 | 29 | 30 | _INPUT_PATH = flags.DEFINE_string( 31 | 'input_path', 32 | default=None, 33 | help='Path to jrdb2022 dataset.' 34 | ) 35 | 36 | _OUTPUT_PATH = flags.DEFINE_string( 37 | 'output_path', 38 | default=None, 39 | help='Path to output folder.' 40 | ) 41 | 42 | _PROCESS_POINTCLOUDS = flags.DEFINE_bool( 43 | 'process_pointclouds', 44 | default=True, 45 | help='Whether to process pointclouds.' 46 | ) 47 | 48 | _MAX_DISTANCE_TO_ROBOT = flags.DEFINE_float( 49 | 'max_distance_to_robot', 50 | default=15., 51 | help=('Maximum distance of agent to the robot to be included' 52 | ' in the processed dataset.') 53 | ) 54 | 55 | _MAX_PC_DISTANCE_TO_ROBOT = flags.DEFINE_float( 56 | 'max_pc_distance_to_robot', 57 | default=10., 58 | help=('Maximum distance of pointcloud point to the robot to be included' 59 | ' in the processed dataset.') 60 | ) 61 | 62 | _TRACKING_METHOD = flags.DEFINE_string( 63 | 'tracking_method', 64 | default='ss3d_mot', 65 | help='Name of tracking method to use.' 
66 | ) 67 | 68 | _TRACKING_CONFIDENCE_THRESHOLD = flags.DEFINE_float( 69 | 'tracking_confidence_threshold', 70 | default=.0, 71 | help=('Confidence threshold for tracked agent instance to be included' 72 | ' in the processed dataset.') 73 | ) 74 | 75 | AGENT_KEYPOINTS = True 76 | FROM_DETECTIONS = True 77 | 78 | 79 | def list_test_scenes(input_path): 80 | scenes = os.listdir(os.path.join(input_path, 'images', 'image_0')) 81 | scenes.sort() 82 | return scenes 83 | 84 | 85 | def get_agents_features_df_with_box( 86 | input_path, scene_id, max_distance_to_robot=10.0 87 | ): 88 | """Returns agents features with bounding box from raw leaderboard data.""" 89 | jrdb_header = [ 90 | 'frame', 91 | 'track id', 92 | 'type', 93 | 'truncated', 94 | 'occluded', 95 | 'alpha', 96 | 'bb_left', 97 | 'bb_top', 98 | 'bb_width', 99 | 'bb_height', 100 | 'x', 101 | 'y', 102 | 'z', 103 | 'height', 104 | 'width', 105 | 'length', 106 | 'rotation_y', 107 | 'score', 108 | ] 109 | scene_data_file = utils.get_file_handle( 110 | os.path.join( 111 | input_path, 'labels', _TRACKING_METHOD.value, 112 | f'{scene_id:04}' + '.txt' 113 | ) 114 | ) 115 | df = pd.read_csv(scene_data_file, sep=' ', names=jrdb_header) 116 | 117 | def camera_to_lower_velodyne(p): 118 | return np.stack( 119 | [p[..., 2], -p[..., 0], -p[..., 1] + (0.742092 - 0.606982)], axis=-1 120 | ) 121 | 122 | df = df[df['score'] >= _TRACKING_CONFIDENCE_THRESHOLD.value] 123 | 124 | df['p'] = df[['x', 'y', 'z']].apply( 125 | lambda s: camera_to_lower_velodyne(s.to_numpy()), axis=1 126 | ) 127 | df['distance'] = df['p'].apply(lambda s: np.linalg.norm(s, axis=-1)) 128 | df['l'] = df['height'] 129 | df['h'] = df['width'] 130 | df['w'] = df['length'] 131 | df['yaw'] = df['rotation_y'] 132 | 133 | df['id'] = df['track id'].apply(lambda s: f'pedestrian:{s}') 134 | df['timestep'] = df['frame'] 135 | 136 | df = df.set_index(['timestep', 'id']) 137 | 138 | df = df[df['distance'] <= max_distance_to_robot] 139 | 140 | return df[['p', 'yaw', 'l', 'h', 'w']] 141 | 142 | 143 | def jrdb_preprocess_test(input_path, output_path): 144 | """Preprocesses the raw test split of JRDB.""" 145 | 146 | tf.keras.utils.set_random_seed(123) 147 | 148 | scenes = list_test_scenes(os.path.join(input_path, 'test_dataset')) 149 | subsample = 1 150 | for scene in tqdm.tqdm(scenes): 151 | scene_save_name = scene + '_test' 152 | agents_df = get_agents_features_df_with_box( 153 | os.path.join(input_path, 'test_dataset'), 154 | scenes.index(scene), 155 | max_distance_to_robot=_MAX_DISTANCE_TO_ROBOT.value, 156 | ) 157 | 158 | robot_odom = utils.get_robot( 159 | os.path.join(input_path, 'processed', 'odometry', 'test'), scene 160 | ) 161 | 162 | if AGENT_KEYPOINTS: 163 | keypoints = utils.get_agents_keypoints( 164 | os.path.join( 165 | input_path, 'processed', 'labels', 166 | 'labels_3d_keypoints', 'test', _TRACKING_METHOD.value 167 | ), 168 | scene, 169 | ) 170 | keypoints_df = pd.DataFrame.from_dict( 171 | keypoints, orient='index' 172 | ).rename_axis(['timestep', 'id']) # pytype: disable=missing-parameter # pandas-drop-duplicates-overloads 173 | 174 | agents_df = agents_df.join(keypoints_df) 175 | agents_df.keypoints.fillna( 176 | dict( 177 | zip( 178 | agents_df.index[agents_df['keypoints'].isnull()], 179 | [np.ones((33, 3)) * np.nan] 180 | * len( 181 | agents_df.loc[ 182 | agents_df['keypoints'].isnull(), 'keypoints' 183 | ] 184 | ), 185 | ) 186 | ), 187 | inplace=True, 188 | ) 189 | 190 | robot_df = pd.DataFrame.from_dict(robot_odom, orient='index').rename_axis( # pytype: disable=missing-parameter # 
pandas-drop-duplicates-overloads 191 | ['timestep'] 192 | ) 193 | # Remove extra data odometry datapoints 194 | robot_df = robot_df.iloc[agents_df.index.levels[0]] 195 | 196 | assert (agents_df.index.levels[0] == robot_df.index).all() 197 | 198 | # Subsample 199 | assert len(agents_df.index.levels[0]) == agents_df.index.levels[0].max() + 1 200 | agents_df_subsampled_index = agents_df.unstack('id').iloc[::subsample].index 201 | agents_df = ( 202 | agents_df.unstack('id') 203 | .iloc[::subsample] 204 | .reset_index(drop=True) 205 | .stack('id', dropna=True) 206 | ) 207 | 208 | agents_in_odometry_df = utils.agents_to_odometry_frame( 209 | agents_df, robot_df.iloc[::subsample].reset_index(drop=True) 210 | ) 211 | 212 | agents_pos_ragged_tensor = utils.agents_pos_to_ragged_tensor( 213 | agents_in_odometry_df 214 | ) 215 | agents_yaw_ragged_tensor = utils.agents_yaw_to_ragged_tensor( 216 | agents_in_odometry_df 217 | ) 218 | assert ( 219 | agents_pos_ragged_tensor.shape[0] == agents_yaw_ragged_tensor.shape[0] 220 | ) 221 | 222 | tf.data.Dataset.from_tensors(agents_pos_ragged_tensor).save( 223 | os.path.join(output_path, scene_save_name, 'agents', 'position') 224 | ) 225 | tf.data.Dataset.from_tensors(agents_yaw_ragged_tensor).save( 226 | os.path.join(output_path, scene_save_name, 'agents', 'orientation') 227 | ) 228 | 229 | if AGENT_KEYPOINTS: 230 | agents_keypoints_ragged_tensor = utils.agents_keypoints_to_ragged_tensor( 231 | agents_in_odometry_df 232 | ) 233 | tf.data.Dataset.from_tensors(agents_keypoints_ragged_tensor).save( 234 | os.path.join(output_path, scene_save_name, 'agents', 'keypoints') 235 | ) 236 | 237 | robot_in_odometry_df = utils.robot_to_odometry_frame(robot_df) 238 | robot_pos = tf.convert_to_tensor( 239 | np.stack(robot_in_odometry_df.iloc[::subsample]['p'].values).astype( 240 | np.float32 241 | ) 242 | ) 243 | robot_orientation = tf.convert_to_tensor( 244 | np.stack(robot_in_odometry_df.iloc[::subsample]['yaw'].values).astype( 245 | np.float32 246 | ) 247 | )[..., tf.newaxis] 248 | 249 | tf.data.Dataset.from_tensors(robot_pos).save( 250 | os.path.join(output_path, scene_save_name, 'robot', 'position') 251 | ) 252 | tf.data.Dataset.from_tensors(robot_orientation).save( 253 | os.path.join(output_path, scene_save_name, 'robot', 'orientation') 254 | ) 255 | 256 | if _PROCESS_POINTCLOUDS.value: 257 | scene_pointcloud_dict = utils.get_scene_poinclouds( 258 | os.path.join(input_path, 'test_dataset'), scene, subsample=subsample 259 | ) 260 | # Remove extra timesteps 261 | scene_pointcloud_dict = { 262 | ts: scene_pointcloud_dict[ts] for ts in agents_df_subsampled_index 263 | } 264 | 265 | scene_pc_odometry = utils.pc_to_odometry_frame( 266 | scene_pointcloud_dict, robot_df 267 | ) 268 | 269 | filtered_pc = utils.filter_agents_and_ground_from_point_cloud( 270 | agents_in_odometry_df, scene_pc_odometry, robot_in_odometry_df, 271 | max_dist=_MAX_PC_DISTANCE_TO_ROBOT.value, 272 | ) 273 | 274 | scene_pc_ragged_tensor = tf.ragged.stack(filtered_pc) 275 | 276 | assert ( 277 | agents_pos_ragged_tensor.bounding_shape()[1] 278 | == scene_pc_ragged_tensor.shape[0] 279 | ) 280 | 281 | tf.data.Dataset.from_tensors(scene_pc_ragged_tensor).save( 282 | os.path.join(output_path, scene_save_name, 'scene', 'pc'), 283 | compression='GZIP', 284 | ) 285 | 286 | 287 | def main(argv): 288 | if len(argv) > 1: 289 | raise app.UsageError('Too many command-line arguments.') 290 | jrdb_preprocess_test(_INPUT_PATH.value, _OUTPUT_PATH.value) 291 | 292 | if __name__ == '__main__': 293 | 
flags.mark_flags_as_required([ 294 | 'input_path', 'output_path' 295 | ]) 296 | app.run(main) 297 | -------------------------------------------------------------------------------- /human_scene_transformer/data/jrdb_preprocess_train.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Preprocesses the raw train split of JRDB. 16 | """ 17 | 18 | import collections 19 | import json 20 | import os 21 | 22 | from absl import app 23 | from absl import flags 24 | 25 | from human_scene_transformer.data import utils 26 | import numpy as np 27 | import pandas as pd 28 | import tensorflow as tf 29 | import tqdm 30 | 31 | _INPUT_PATH = flags.DEFINE_string( 32 | 'input_path', 33 | default=None, 34 | help='Path to jrdb2022 dataset.' 35 | ) 36 | 37 | _OUTPUT_PATH = flags.DEFINE_string( 38 | 'output_path', 39 | default=None, 40 | help='Path to output folder.' 41 | ) 42 | 43 | _PROCESS_POINTCLOUDS = flags.DEFINE_bool( 44 | 'process_pointclouds', 45 | default=True, 46 | help='Whether to process pointclouds.' 47 | ) 48 | 49 | _MAX_DISTANCE_TO_ROBOT = flags.DEFINE_float( 50 | 'max_distance_to_robot', 51 | default=15., 52 | help=('Maximum distance of agent to the robot to be included' 53 | ' in the processed dataset.') 54 | ) 55 | 56 | _MAX_PC_DISTANCE_TO_ROBOT = flags.DEFINE_float( 57 | 'max_pc_distance_to_robot', 58 | default=10., 59 | help=('Maximum distance of pointcloud point to the robot to be included' 60 | ' in the processed dataset.') 61 | ) 62 | 63 | 64 | AGENT_KEYPOINTS = True 65 | FROM_DETECTIONS = True 66 | 67 | 68 | def get_agents_dict(input_path, scene): 69 | """Returns agents GT data from raw data.""" 70 | scene_data_file = utils.get_file_handle( 71 | os.path.join(input_path, 'labels', 'labels_3d', scene + '.json') 72 | ) 73 | scene_data = json.load(scene_data_file) 74 | 75 | agents = collections.defaultdict(list) 76 | 77 | for frame in scene_data['labels']: 78 | ts = int(frame.split('.')[0]) 79 | for det in scene_data['labels'][frame]: 80 | agents[det['label_id']].append((ts, det)) 81 | return agents 82 | 83 | 84 | def get_agents_dict_from_detections(input_path, scene): 85 | """Returns agents data from fused detections raw data.""" 86 | scene_data_file = utils.get_file_handle( 87 | os.path.join( 88 | input_path, 'labels', 'labels_detections_3d', scene + '.json' 89 | ) 90 | ) 91 | scene_data = json.load(scene_data_file) 92 | 93 | agents = collections.defaultdict(list) 94 | 95 | for frame in scene_data['labels']: 96 | ts = int(frame.split('.')[0]) 97 | for det in scene_data['labels'][frame]: 98 | agents[det['label_id']].append((ts, det)) 99 | return agents 100 | 101 | 102 | def get_agents_features(agents_dict, max_distance_to_robot=10): 103 | """Returns agents features from raw data dict.""" 104 | agents_pos_dict = collections.defaultdict(dict) 105 | for agent_id, agent_data in agents_dict.items(): 106 | for 
ts, agent_instance in agent_data: 107 | if agent_instance['attributes']['distance'] <= max_distance_to_robot: 108 | agents_pos_dict[(ts, agent_id)] = { 109 | 'p': np.array([ 110 | agent_instance['box']['cx'], 111 | agent_instance['box']['cy'], 112 | agent_instance['box']['cz'], 113 | ]), 114 | # rotation angle is relative to negative x axis of robot 115 | 'yaw': np.pi - agent_instance['box']['rot_z'], 116 | } 117 | return agents_pos_dict 118 | 119 | 120 | def jrdb_preprocess_train(input_path, output_path): 121 | """Preprocesses the raw train split of JRDB.""" 122 | 123 | tf.keras.utils.set_random_seed(123) 124 | 125 | subsample = 1 126 | 127 | scenes = utils.list_scenes( 128 | os.path.join(input_path, 'train_dataset') 129 | ) 130 | for scene in tqdm.tqdm(scenes): 131 | if not FROM_DETECTIONS: 132 | agents_dict = get_agents_dict( 133 | os.path.join(input_path, 'train_dataset'), scene 134 | ) 135 | else: 136 | agents_dict = get_agents_dict_from_detections( 137 | os.path.join(input_path, 'processed'), scene 138 | ) 139 | 140 | agents_features = utils.get_agents_features_with_box( 141 | agents_dict, max_distance_to_robot=_MAX_DISTANCE_TO_ROBOT.value 142 | ) 143 | 144 | robot_odom = utils.get_robot( 145 | os.path.join(input_path, 'processed', 'odometry', 'train'), scene 146 | ) 147 | 148 | agents_df = pd.DataFrame.from_dict( 149 | agents_features, orient='index' 150 | ).rename_axis(['timestep', 'id']) # pytype: disable=missing-parameter # pandas-drop-duplicates-overloads 151 | 152 | if AGENT_KEYPOINTS: 153 | keypoints = utils.get_agents_keypoints( 154 | os.path.join( 155 | input_path, 'processed', 'labels', 156 | 'labels_3d_keypoints', 'train'), 157 | scene, 158 | ) 159 | keypoints_df = pd.DataFrame.from_dict( 160 | keypoints, orient='index' 161 | ).rename_axis(['timestep', 'id']) # pytype: disable=missing-parameter # pandas-drop-duplicates-overloads 162 | 163 | agents_df = agents_df.join(keypoints_df) 164 | agents_df.keypoints.fillna( 165 | dict( 166 | zip( 167 | agents_df.index[agents_df['keypoints'].isnull()], 168 | [np.ones((33, 3)) * np.nan] 169 | * len( 170 | agents_df.loc[ 171 | agents_df['keypoints'].isnull(), 'keypoints' 172 | ] 173 | ), 174 | ) 175 | ), 176 | inplace=True, 177 | ) 178 | 179 | robot_df = pd.DataFrame.from_dict(robot_odom, orient='index').rename_axis( # pytype: disable=missing-parameter # pandas-drop-duplicates-overloads 180 | ['timestep'] 181 | ) 182 | # Remove extra data odometry datapoints 183 | robot_df = robot_df.iloc[agents_df.index.levels[0]] 184 | 185 | assert (agents_df.index.levels[0] == robot_df.index).all() 186 | 187 | # Subsample 188 | assert len(agents_df.index.levels[0]) == agents_df.index.levels[0].max() + 1 189 | agents_df_subsampled_index = agents_df.unstack('id').iloc[::subsample].index 190 | agents_df = ( 191 | agents_df.unstack('id') 192 | .iloc[::subsample] 193 | .reset_index(drop=True) 194 | .stack('id', dropna=True) 195 | ) 196 | 197 | agents_in_odometry_df = utils.agents_to_odometry_frame( 198 | agents_df, robot_df.iloc[::subsample].reset_index(drop=True) 199 | ) 200 | 201 | agents_pos_ragged_tensor = utils.agents_pos_to_ragged_tensor( 202 | agents_in_odometry_df 203 | ) 204 | agents_yaw_ragged_tensor = utils.agents_yaw_to_ragged_tensor( 205 | agents_in_odometry_df 206 | ) 207 | assert ( 208 | agents_pos_ragged_tensor.shape[0] == agents_yaw_ragged_tensor.shape[0] 209 | ) 210 | 211 | tf.data.Dataset.from_tensors(agents_pos_ragged_tensor).save( 212 | os.path.join(output_path, scene, 'agents', 'position') 213 | ) 214 | 
tf.data.Dataset.from_tensors(agents_yaw_ragged_tensor).save( 215 | os.path.join(output_path, scene, 'agents', 'orientation') 216 | ) 217 | 218 | if AGENT_KEYPOINTS: 219 | agents_keypoints_ragged_tensor = utils.agents_keypoints_to_ragged_tensor( 220 | agents_in_odometry_df 221 | ) 222 | tf.data.Dataset.from_tensors(agents_keypoints_ragged_tensor).save( 223 | os.path.join(output_path, scene, 'agents', 'keypoints') 224 | ) 225 | 226 | robot_in_odometry_df = utils.robot_to_odometry_frame(robot_df) 227 | robot_pos = tf.convert_to_tensor( 228 | np.stack(robot_in_odometry_df.iloc[::subsample]['p'].values).astype( 229 | np.float32 230 | ) 231 | ) 232 | robot_orientation = tf.convert_to_tensor( 233 | np.stack(robot_in_odometry_df.iloc[::subsample]['yaw'].values).astype( 234 | np.float32 235 | ) 236 | )[..., tf.newaxis] 237 | 238 | tf.data.Dataset.from_tensors(robot_pos).save( 239 | os.path.join(output_path, scene, 'robot', 'position') 240 | ) 241 | tf.data.Dataset.from_tensors(robot_orientation).save( 242 | os.path.join(output_path, scene, 'robot', 'orientation') 243 | ) 244 | 245 | if _PROCESS_POINTCLOUDS.value: 246 | scene_pointcloud_dict = utils.get_scene_poinclouds( 247 | os.path.join(input_path, 'train_dataset'), 248 | scene, 249 | subsample=subsample, 250 | ) 251 | 252 | # Remove extra timesteps 253 | scene_pointcloud_dict = { 254 | ts: scene_pointcloud_dict[ts] for ts in agents_df_subsampled_index 255 | } 256 | 257 | scene_pc_odometry = utils.pc_to_odometry_frame( 258 | scene_pointcloud_dict, robot_df 259 | ) 260 | 261 | filtered_pc = utils.filter_agents_and_ground_from_point_cloud( 262 | agents_in_odometry_df, scene_pc_odometry, robot_in_odometry_df, 263 | max_dist=_MAX_PC_DISTANCE_TO_ROBOT.value, 264 | ) 265 | 266 | scene_pc_ragged_tensor = tf.ragged.stack(filtered_pc) 267 | 268 | assert ( 269 | agents_pos_ragged_tensor.bounding_shape()[1] 270 | == scene_pc_ragged_tensor.shape[0] 271 | ) 272 | 273 | tf.data.Dataset.from_tensors(scene_pc_ragged_tensor).save( 274 | os.path.join(output_path, scene, 'scene', 'pc'), compression='GZIP' 275 | ) 276 | 277 | 278 | def main(argv): 279 | if len(argv) > 1: 280 | raise app.UsageError('Too many command-line arguments.') 281 | jrdb_preprocess_train(_INPUT_PATH.value, _OUTPUT_PATH.value) 282 | 283 | if __name__ == '__main__': 284 | flags.mark_flags_as_required([ 285 | 'input_path', 'output_path' 286 | ]) 287 | app.run(main) 288 | -------------------------------------------------------------------------------- /human_scene_transformer/data/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The human_scene_transformer Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
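# Usage sketch: the preprocessing scripts above (jrdb_preprocess_train.py and
# jrdb_preprocess_test.py) typically chain the helpers defined below roughly as
# follows; the variable names here are illustrative only:
#
#   robot_in_odometry_df = robot_to_odometry_frame(robot_df)
#   agents_in_odometry_df = agents_to_odometry_frame(agents_df, robot_df)
#   agents_pos_ragged_tensor = agents_pos_to_ragged_tensor(agents_in_odometry_df)
#   agents_yaw_ragged_tensor = agents_yaw_to_ragged_tensor(agents_in_odometry_df)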
14 | 15 | """Utility Functions.""" 16 | 17 | import collections 18 | import json 19 | import os 20 | import tempfile 21 | 22 | from human_scene_transformer.data.math import pose3 23 | from human_scene_transformer.data.math import quaternion 24 | from human_scene_transformer.data.math import rotation3 25 | 26 | import numpy as np 27 | import pandas as pd 28 | 29 | import tensorflow as tf 30 | 31 | import open3d 32 | 33 | 34 | def maybe_makedir(path): 35 | if not os.path.exists(path): 36 | os.makedirs(path) 37 | 38 | 39 | def get_file_handle(path, mode='rt'): 40 | file_handle = open(path, mode) 41 | return file_handle 42 | 43 | 44 | def list_scenes(input_path): 45 | scenes = os.listdir(os.path.join(input_path, 'labels', 'labels_3d')) 46 | scenes.sort() 47 | return [scene[:-5] for scene in scenes] 48 | 49 | 50 | def get_robot(input_path, scene): 51 | """Returns robot features from raw data.""" 52 | odom_data_file = get_file_handle( 53 | os.path.join(input_path, scene + '.json')) 54 | odom_data = json.load(odom_data_file) 55 | 56 | robot = collections.defaultdict(list) 57 | 58 | for pc_ts, pose in odom_data['odometry'].items(): 59 | ts = int(pc_ts.split('.')[0]) 60 | robot[ts] = { 61 | 'p': np.array([pose['position']['x'], 62 | pose['position']['y'], 63 | pose['position']['z']]), 64 | 'q': np.array([pose['orientation']['x'], 65 | pose['orientation']['y'], 66 | pose['orientation']['z'], 67 | pose['orientation']['w']]), 68 | } 69 | 70 | return robot 71 | 72 | 73 | def get_agents_keypoints(input_path, scene): 74 | """Returns agents keypoints from raw data.""" 75 | scene_data_file = get_file_handle( 76 | os.path.join(input_path, scene + '.json')) 77 | scene_data = json.load(scene_data_file) 78 | 79 | agents_keypoints = collections.defaultdict(dict) 80 | 81 | for frame in scene_data['labels']: 82 | ts = int(frame.split('.')[0]) 83 | for det in scene_data['labels'][frame]: 84 | agents_keypoints[(ts, det['label_id'])] = { 85 | 'keypoints': np.array(det['keypoints']).reshape(33, 3)} 86 | return agents_keypoints 87 | 88 | 89 | def get_scene_poinclouds(input_path, scene, subsample=1): 90 | """Returns scene point clouds from raw data.""" 91 | pc_files = os.listdir( 92 | os.path.join(input_path, 'pointclouds', 'lower_velodyne', scene)) 93 | 94 | pc_files = sorted(pc_files) 95 | pc_dict = collections.OrderedDict() 96 | for _, pc_file in enumerate(pc_files[::subsample]): 97 | pc_file_path = os.path.join( 98 | input_path, 'pointclouds', 'lower_velodyne', scene, pc_file) 99 | with get_file_handle(pc_file_path, 'rb') as f: 100 | with tempfile.NamedTemporaryFile() as tmp: 101 | tmp.write(f.read()) 102 | pcd = open3d.io.read_point_cloud(tmp.name, format='pcd') 103 | pc_dict[int(pc_file.split('.')[0])] = pcd 104 | return pc_dict 105 | 106 | 107 | def pc_to_odometry_frame(pc_dict, robot_df): 108 | """Transforms point clouds into odometry frame.""" 109 | world_pose_odometry = pose3.Pose3( 110 | rotation3.Rotation3( 111 | quaternion.Quaternion(robot_df.loc[0]['q'])), robot_df.loc[0]['p']) 112 | odometry_pose_world = world_pose_odometry.inverse() 113 | 114 | pc_list = [] 115 | for ts, pc in pc_dict.items(): 116 | robot_odometry_dp = robot_df.loc[ts] 117 | 118 | world_pose_robot = pose3.Pose3( 119 | rotation3.Rotation3( 120 | quaternion.Quaternion( 121 | robot_odometry_dp['q'])), robot_odometry_dp['p']) 122 | 123 | odometry_pose_robot = odometry_pose_world * world_pose_robot 124 | 125 | odometry_pc = pc.transform(odometry_pose_robot.matrix4x4()) 126 | 127 | pc_list.append(np.array(odometry_pc.points, 
dtype=np.float32)) 128 | 129 | return pc_list 130 | 131 | 132 | def robot_to_odometry_frame(robot_df): 133 | """Transforms robot features into odometry frame.""" 134 | world_pose_odometry = pose3.Pose3( 135 | rotation3.Rotation3( 136 | quaternion.Quaternion(robot_df.loc[0]['q'])), robot_df.loc[0]['p']) 137 | odometry_pose_world = world_pose_odometry.inverse() 138 | 139 | robot_dict = {} 140 | for ts, row in robot_df.iterrows(): 141 | world_pose_robot = pose3.Pose3( 142 | rotation3.Rotation3(quaternion.Quaternion(row['q'])), row['p']) 143 | odometry_pose_robot = odometry_pose_world * world_pose_robot 144 | 145 | robot_dict[ts] = { 146 | 'p': odometry_pose_robot.translation, 147 | 'yaw': odometry_pose_robot.rotation.euler_angles(radians=True)[-1] 148 | } 149 | return pd.DataFrame.from_dict( 150 | robot_dict, orient='index').rename_axis(['timestep']) # pytype: disable=missing-parameter # pandas-drop-duplicates-overloads 151 | 152 | 153 | def agents_to_odometry_frame(agents_df, robot_df): 154 | """Transforms agents features into odometry frame.""" 155 | world_pose_odometry = pose3.Pose3( 156 | rotation3.Rotation3( 157 | quaternion.Quaternion(robot_df.loc[0]['q'])), robot_df.loc[0]['p']) 158 | odometry_pose_world = world_pose_odometry.inverse() 159 | 160 | agents_dict = {} 161 | for index, row in agents_df.iterrows(): 162 | ts = index[0] 163 | robot_odometry_dp = robot_df.loc[ts] 164 | 165 | world_pose_robot = pose3.Pose3( 166 | rotation3.Rotation3( 167 | quaternion.Quaternion(robot_odometry_dp['q'])), 168 | robot_odometry_dp['p']) 169 | 170 | robot_pose_agent = pose3.Pose3( 171 | rotation3.Rotation3.from_euler_angles( 172 | rpy_radians=[0., 0., row['yaw']]), row['p']) 173 | 174 | odometry_pose_agent = (odometry_pose_world * world_pose_robot 175 | * robot_pose_agent) 176 | 177 | agents_dict[index] = { 178 | 'p': odometry_pose_agent.translation, 179 | 'yaw': odometry_pose_agent.rotation.euler_angles(radians=True)[-1]} 180 | 181 | if 'l' in row: 182 | agents_dict[index]['l'] = row['l'] 183 | agents_dict[index]['w'] = row['w'] 184 | agents_dict[index]['h'] = row['h'] 185 | 186 | if 'keypoints' in row: 187 | world_rot_robot = rotation3.Rotation3( 188 | quaternion.Quaternion(robot_odometry_dp['q'])) 189 | odometry_rot_robot = odometry_pose_world.rotation * world_rot_robot 190 | rot_keypoints = [] 191 | for keypoint in row['keypoints']: 192 | if np.isnan(keypoint).any(): 193 | rot_keypoints.append(keypoint) 194 | else: 195 | rot_keypoints.append(odometry_rot_robot.rotate_point(keypoint)) 196 | rot_keypoints = np.array(rot_keypoints) 197 | agents_dict[index]['keypoints'] = rot_keypoints 198 | 199 | return pd.DataFrame.from_dict( 200 | agents_dict, orient='index').rename_axis(['timestep', 'id']) # pytype: disable=missing-parameter # pandas-drop-duplicates-overloads 201 | 202 | 203 | def get_agents_features_with_box(agents_dict, max_distance_to_robot=10): 204 | """Returns agents features with bounding box from raw data dict.""" 205 | agents_pos_dict = collections.defaultdict(dict) 206 | for agent_id, agent_data in agents_dict.items(): 207 | for (ts, agent_instance) in agent_data: 208 | if agent_instance['attributes']['distance'] <= max_distance_to_robot: 209 | agents_pos_dict[(ts, agent_id)] = { 210 | 'p': np.array([agent_instance['box']['cx'], 211 | agent_instance['box']['cy'], 212 | agent_instance['box']['cz']]), 213 | # rotation angle is relative to negative x axis of robot 214 | 'yaw': np.pi - agent_instance['box']['rot_z'], 215 | 'l': agent_instance['box']['l'], 216 | 'w':
agent_instance['box']['w'], 217 | 'h': agent_instance['box']['h'] 218 | } 219 | return agents_pos_dict 220 | 221 | 222 | def agents_pos_to_ragged_tensor(agents_df): 223 | tensor_list = [] 224 | for _, df in agents_df.groupby('id'): 225 | dropped_df = df.droplevel(1, axis=0) 226 | r_tensor = tf.RaggedTensor.from_value_rowids( 227 | values=np.vstack(df['p'].values).flatten().astype(np.float32), 228 | value_rowids=np.tile( 229 | np.array(dropped_df.index), (3, 1)).transpose().flatten()) 230 | tensor_list.append(r_tensor) 231 | return tf.stack(tensor_list) 232 | 233 | 234 | def agents_yaw_to_ragged_tensor(agents_df): 235 | tensor_list = [] 236 | for _, df in agents_df.groupby('id'): 237 | dropped_df = df.droplevel(1, axis=0) 238 | r_tensor = tf.RaggedTensor.from_value_rowids( 239 | values=np.vstack(df['yaw'].values).flatten().astype(np.float32), 240 | value_rowids=np.array(dropped_df.index)) 241 | tensor_list.append(r_tensor) 242 | return tf.stack(tensor_list) 243 | 244 | 245 | def agents_keypoints_to_ragged_tensor(agents_df): 246 | tensor_list = [] 247 | for _, df in agents_df.groupby('id'): 248 | dropped_df = df.droplevel(1, axis=0) 249 | r_tensor = tf.RaggedTensor.from_value_rowids( 250 | values=np.vstack(df['keypoints'].values).astype(np.float32), 251 | value_rowids=np.tile( 252 | np.array(dropped_df.index), (33, 1)).transpose().flatten()) 253 | tensor_list.append(r_tensor) 254 | return tf.stack(tensor_list) 255 | 256 | 257 | def box_to_hyperplanes(pos, yaw, l, w, h): 258 | """Transforms a bounding box to a set of hyperplanes.""" 259 | s = np.sin(yaw) 260 | c = np.cos(yaw) 261 | normal = np.array([ 262 | np.array([0, 0, h/2]), 263 | np.array([0, 0, -h/2]), 264 | np.array([-s * w/2, c * w/2, 0]), 265 | np.array([-s * -w/2, c * -w/2, 0]), 266 | np.array([c * l/2, s * l/2, 0]), 267 | np.array([c * -l/2, s * -l/2, 0])]) 268 | points = pos + normal 269 | normal = normal / np.linalg.norm(normal, axis=-1, keepdims=True) 270 | w = -normal 271 | 272 | d = -w[:, 0] * points[:, 0] - w[:, 1] * points[:, 1] - w[:, 2] * points[:, 2] 273 | 274 | return w, d 275 | 276 | 277 | def filter_agents_and_ground_from_point_cloud( 278 | agents_df, pointcloud_dict, robot_in_odometry_df, max_dist=10.): 279 | """Filter points which are in human bb or belong to ground.""" 280 | for t, agent_df in agents_df.groupby('timestep'): 281 | pc_points = pointcloud_dict[t] 282 | robot_p = robot_in_odometry_df.loc[t]['p'][:2] 283 | dist_mask = np.linalg.norm(robot_p - pc_points[..., :2], axis=-1) < max_dist 284 | pc_points = pc_points[ 285 | (pc_points[:, -1] > -0.2) & (pc_points[:, -1] < 0.5) & dist_mask] 286 | for _, row in agent_df.iterrows(): 287 | w, d = box_to_hyperplanes( 288 | row['p'], row['yaw'], 1.5*row['l'], 1.5*row['w'], row['h']) 289 | agent_pc_mask = np.all((pc_points @ w.T + d) > 0., axis=-1) 290 | pc_points = pc_points[~agent_pc_mask] 291 | np.random.shuffle(pc_points) 292 | pointcloud_dict[t] = pc_points 293 | return pointcloud_dict 294 | 295 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 
12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 
135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 
194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. --------------------------------------------------------------------------------