├── .gitignore ├── LICENSE ├── README.md ├── control ├── __init__.py ├── compare.py ├── configs │ ├── README.md │ ├── env-noisy.config │ ├── env-policies.config │ ├── env-train.config │ ├── model │ │ ├── policy.config │ │ └── train.config │ ├── reward │ │ ├── policy.config │ │ └── train.config │ ├── sarl │ │ ├── env.config │ │ ├── policy.config │ │ └── train.config │ └── training │ │ ├── env.config │ │ ├── policy.config │ │ └── train.config ├── policy │ ├── __init__.py │ ├── policy_factory.py │ ├── sarl.py │ ├── uncertain_sarl.py │ └── value_network.py ├── test.py ├── train.py └── utils │ ├── __init__.py │ ├── explorer.py │ ├── memory.py │ ├── plot.py │ └── trainer.py ├── hardware ├── __init__.py ├── demo.py └── utils │ ├── __init__.py │ ├── turtlebot_control.py │ ├── turtlebot_hardware.py │ └── utils.py ├── setup.py ├── simulation ├── __init__.py └── envs │ ├── __init__.py │ ├── crowd_sim.py │ ├── policy │ ├── __init__.py │ ├── cadrl.py │ ├── cadrl │ │ └── rl_model.pth │ ├── linear.py │ ├── orca.py │ ├── policy.py │ └── policy_factory.py │ └── utils │ ├── __init__.py │ ├── action.py │ ├── agent.py │ ├── functions.py │ ├── human.py │ ├── info.py │ ├── robot.py │ ├── scenarios.py │ └── state.py └── uncertainty ├── __init__.py ├── collect_data.py ├── configs └── train.config ├── estimate_epsilons.py ├── network.py ├── preprocess_data.py ├── train.py └── train_models.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac 2 | .DS_Store 3 | */.DS_Store 4 | 5 | # editors 6 | .idea/ 7 | .vscode/ 8 | 9 | # Python 10 | *.egg-info/ 11 | .pytest_cache/ 12 | __pycache__ 13 | .cache/ 14 | venv*/ 15 | 16 | # data 17 | *.mp4 18 | *.png 19 | *.log 20 | *.csv 21 | *.pkl 22 | data/ 23 | models/ 24 | output/ 25 | videos/ 26 | 27 | # jupyter checkpoints 28 | .ipynb_checkpoints 29 | *.ipynb 30 | 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 dbl-blnd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Stranger Danger 2 | 3 | This is the codebase for the paper titled ["Stranger Danger! 
Identifying and Avoiding Unpredictable Pedestrians in RL-based Social Robot Navigation,"](https://ieeexplore.ieee.org/document/10610413) which was presented at the 2024 IEEE International Conference on Robotics and Automation (ICRA). This README describes how to reproduce the results achieved in this paper. An extended version of our paper is available on [arXiv](https://arxiv.org/abs/2407.06056), and a video showcasing our methods and results is available on [YouTube](https://youtu.be/9IDhXvCC58w?si=Y0Di3d5NjWj-3nvl). If you find this work useful, please cite our paper using the citation provided at end of this [README](https://github.com/sarapohland/stranger-danger#citing-our-work). 4 | 5 | ## 0) Setup 6 | 7 | 1. Create an environment with Python 3.6 on Ubuntu Linux. 8 | 2. Install the [Python-RVO2](https://github.com/sybrenstuvel/Python-RVO2) library. 9 | 10 | Note: Make sure that CMake and the tested version of Cython are installed. If the build fails, you may need to delete the build folder and try building again. 11 | 12 | 3. Within the main `stranger-danger` directory, run the following command: 13 | ``` 14 | pip install -e . 15 | ``` 16 | 17 | ## 1) Train Uncertainty Estimation Networks 18 | 19 | The following section details how to train the set of uncertainty estimation networks utilized by two variations of the uncertainty-aware RL policy. 20 | 21 | ### 1.1) Collect pedestrian walking data 22 | 23 | From the `uncertainty` folder, run the following command to collect data of randomized ORCA pedestrians navigating through 100 episodes of six different scenarios: 24 | ``` 25 | python collect_data.py --vary_param epsilon --num_episodes 100 --output_dir data/ 26 | ``` 27 | 28 | This will create CSV files of relevant data in a folder called `data` within the `uncertainty` folder. 29 | 30 | ### 1.2) Preprocess pedestrian walking data 31 | 32 | From the `uncertainty` folder, run the following command to preprocess the collected pedestrian walking data to prepare it to be used as input to the uncertainty network: 33 | ``` 34 | python preprocess_data.py --data_dir data/ 35 | ``` 36 | 37 | This will generate several folders of CSV files within a folder called `preprocessed_data` in the `data` folder you created in the previous step. 38 | 39 | ### 1.3) Train networks with preprocessed data 40 | 41 | From the `uncertainty` folder, run the following script to train and save 20 uncertainty prediction networks with 20 different numbers of time steps: 42 | ``` 43 | chmod +x train_models.sh 44 | ./train_models.sh 45 | ``` 46 | 47 | This will train 20 models using the preprocessed data from the previous step and store each model within its own folder in a larger folder called `models`. The model folders are named such that `uncertain_T` is the model trained using T time steps. Within this folder, there is also a plot of the loss curve and a plot of the model prediction accuracy. *Note: It may take several hours or days to train all 20 models.* 48 | 49 | ## 2) Train Socially-Aware RL Policies 50 | 51 | The following section details how to train the baseline SARL policy and the three variations of this RL policy. 
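The `--policy` argument in the commands below selects a controller by name through the project's policy factory. As a point of reference, the sketch below shows roughly how a policy name might be resolved and configured: the registration of `sarl` and `uncertain_sarl` comes from `control/policy/policy_factory.py`, and the `configparser` accessors mirror the ones used in `control/policy/sarl.py`. Treat it as an illustrative sketch rather than the exact logic of `train.py`.

```
import configparser

from control.policy.policy_factory import policy_factory

# Parse a policy configuration file (path assumed relative to the control/ directory).
policy_config = configparser.RawConfigParser()
policy_config.read('configs/sarl/policy.config')

# 'sarl' and 'uncertain_sarl' are registered in control/policy/policy_factory.py,
# alongside the simulation policies (e.g. 'orca') from simulation/envs/policy/policy_factory.py.
policy = policy_factory['sarl']()
policy.configure(policy_config)
```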
52 | 53 | ### 2.1) Train baseline SARL policy (*SARL*) 54 | 55 | From the `control` folder, run the following command to train the baseline SARL policy without uncertainty-awareness using standard ORCA pedestrians: 56 | ``` 57 | python train.py --policy sarl --output_dir models/sarl/ --env_config configs/env-train.config --policy_config configs/sarl/policy.config --train_config configs/sarl/train.config 58 | ``` 59 | 60 | This will save the trained RL policy to the folder `models/sarl/` in the `control` directory. The model will be trained using the parameters specified in the configuration files (`env_config`, `policy_config`, and `train_config`). More information on the parameters in these configuration files can be found in the configs [README](https://github.com/sarapohland/stranger-danger/blob/main/control/configs/README.md). You can try training with different discomfort distances by changing the `discomfort_dist` value under `reward` in the [training environment config file](https://github.com/sarapohland/stranger-danger/blob/main/control/configs/env-train.config). *Note: It may take several hours or days to train a single model.* 61 | 62 | ### 2.2) Train policy with modified training process (*Training*) 63 | 64 | From the `control` folder, run the following command to train the RL policy without uncertainty-awareness using Noisy ORCA pedestrians: 65 | ``` 66 | python train.py --policy sarl --output_dir models/training/ --env_config configs/env-train.config --policy_config configs/training/policy.config --train_config configs/training/train.config 67 | ``` 68 | 69 | This will save the trained RL policy to the folder `models/training/` in the `control` directory. The model will be trained using the parameters specified in the configuration files (`env_config`, `policy_config`, and `train_config`). More information on the parameters in these configuration files can be found in the configs [README](https://github.com/sarapohland/stranger-danger/blob/main/control/configs/README.md). *Note: It may take several hours or days to train a single model.* 70 | 71 | ### 2.3) Train policy with modified model architecture (*Model*) 72 | 73 | From the `control` folder, run the following command to train the uncertainty-aware RL policy with only the modified model architecture using Noisy ORCA pedestrians: 74 | ``` 75 | python train.py --policy uncertain_sarl --output_dir models/model/ --env_config configs/env-train.config --policy_config configs/model/policy.config --train_config configs/model/train.config 76 | ``` 77 | 78 | This will save the trained RL policy to the folder `models/model/` in the `control` directory. The model will be trained using the parameters specified in the configuration files (`env_config`, `policy_config`, and `train_config`). More information on the parameters in these configuration files can be found in the configs [README](https://github.com/sarapohland/stranger-danger/blob/main/control/configs/README.md). 
*Note: It may take several hours or days to train a single model.* 79 | 80 | ### 2.4) Train policy with modified reward function (*Reward*) 81 | 82 | From the `control` folder, run the following command to train the uncertainty-aware RL policy with the uncertainty-dependent reward function using Noisy ORCA pedestrians: 83 | ``` 84 | python train.py --policy uncertain_sarl --output_dir models/reward/ --env_config configs/env-train.config --policy_config configs/reward/policy.config --train_config configs/reward/train.config 85 | ``` 86 | 87 | This will save the trained RL policy to the folder `models/reward/` in the `control` directory. The model will be trained using the parameters specified in the configuration files (`env_config`, `policy_config`, and `train_config`). More information on the parameters in these configuration files can be found in the configs [README](https://github.com/sarapohland/stranger-danger/blob/main/control/configs/README.md). *Note: It may take several hours or days to train a single model.* 88 | 89 | ## 3) Ablation Study on Noisy ORCA Pedestrians 90 | 91 | The following section describes how to reproduce the results provided in the ablation study on Noisy ORCA pedestrians. Evaluation parameters can be set in the configuration file `configs/env-noisy.config`. More information on the parameters in this configuration file can be found in the configs [README](https://github.com/sarapohland/stranger-danger/blob/main/control/configs/README.md). 92 | 93 | ### 3.1) Evaluate baseline SARL policy (*SARL*) 94 | 95 | The following command will run 500 random trials evaluating the performance of the baseline SARL policy on Noisy ORCA pedestrians with a maximum unpredictability score of 0.5: 96 | ``` 97 | python test.py --policy sarl --model_dir models/sarl/ --estimate_eps --max_epsilon 0.5 --stats_file models/sarl/stats-noisy.csv --env_config configs/env-noisy.config 98 | ``` 99 | 100 | This script will save a CSV file called `stats-noisy.csv` within the `models/sarl/` folder. 101 | 102 | ### 3.2) Evaluate policy with modified training process (*Training*) 103 | 104 | The following command will run 500 random trials evaluating the performance of the *Training* policy on Noisy ORCA pedestrians with a maximum unpredictability score of 0.5: 105 | ``` 106 | python test.py --policy sarl --model_dir models/training/ --estimate_eps --max_epsilon 0.5 --stats_file models/training/stats-noisy.csv --env_config configs/env-noisy.config 107 | ``` 108 | 109 | This script will save a CSV file called `stats-noisy.csv` within the `models/training/` folder. 110 | 111 | ### 3.3) Evaluate policy with modified model architecture (*Model*) 112 | 113 | The following command will run 500 random trials evaluating the performance of the *Model* policy on Noisy ORCA pedestrians with a maximum unpredictability score of 0.5: 114 | ``` 115 | python test.py --policy uncertain_sarl --model_dir models/model/ --estimate_eps --max_epsilon 0.5 --stats_file models/model/stats-noisy.csv --env_config configs/env-noisy.config 116 | ``` 117 | 118 | This script will save a CSV file called `stats-noisy.csv` within the `models/model/` folder. 
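If you want a quick look at one of these statistics files before running the comparison step in Section 3.6, you can load it with pandas. The sketch below relies only on the `result` and `navigation_time` columns, which are the ones read by `control/compare.py`; the file path is just an example.

```
import pandas as pd

# Per-trial statistics written by test.py (example path from Section 3.3).
stats = pd.read_csv('models/model/stats-noisy.csv')

# Success/timeout/collision rates, based on the 'result' labels used by compare.py.
print(stats['result'].value_counts(normalize=True) * 100)

# Mean navigation time over successful trials only.
success = stats[stats['result'] == 'ReachGoal']
print('Mean navigation time (s):', success['navigation_time'].mean())
```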
119 | 120 | ### 3.4) Evaluate policy with modified reward function (*Reward*) 121 | 122 | The following command will run 500 random trials evaluating the performance of the *Reward* policy on Noisy ORCA pedestrians with a maximum unpredictability score of 0.5: 123 | ``` 124 | python test.py --policy uncertain_sarl --model_dir models/reward/ --estimate_eps --max_epsilon 0.5 --stats_file models/reward/stats-noisy.csv --env_config configs/env-noisy.config 125 | ``` 126 | 127 | This script will save a CSV file called `stats-noisy.csv` within the `models/reward/` folder. 128 | 129 | ### 3.5) Evaluate baseline ORCA policy (*ORCA*) 130 | 131 | The following command will run 500 random trials evaluating the performance of the baseline ORCA policy on Noisy ORCA pedestrians with a maximum unpredictability score of 0.5: 132 | ``` 133 | python test.py --policy orca --estimate_eps --max_epsilon 0.5 --stats_file models/orca/stats-noisy.csv --env_config configs/env-noisy.config 134 | ``` 135 | 136 | This script will save a CSV file called `stats-noisy.csv` within the `models/orca/` folder. 137 | 138 | ### 3.6) Create table comparing policy performance 139 | 140 | To compare the performance of these four RL policies (plus the ORCA policy), you can use the compare script in the `control` directory: 141 | ``` 142 | python compare.py --files models/orca/stats-noisy.csv models/sarl/stats-noisy.csv models/training/stats-noisy.csv models/model/stats-noisy.csv models/reward/stats-noisy.csv --names ORCA SARL Training Model Reward 143 | ``` 144 | 145 | This command will print a table of results for the ablation study with LaTeX formatting. 146 | 147 | ## 4) Ablation Study on Diverse, Realistic Pedestrians 148 | 149 | The following section describes how to reproduce the results provided in the ablation study on diverse, realistic pedestrians. Evaluation parameters can be set in the configuration file `configs/env-policies.config`. More information on the parameters in this configuration file can be found in the configs [README](https://github.com/sarapohland/stranger-danger/blob/main/control/configs/README.md). 150 | 151 | ### 4.1) Evaluate baseline SARL policy (*SARL*) 152 | 153 | The following command will run 100 random trials evaluating the performance of the baseline SARL policy on pedestrians operating under various policies (standard ORCA, CADRL, and Linear with randomized parameters): 154 | ``` 155 | python test.py --policy sarl --model_dir models/sarl/ --estimate_eps --stats_file models/sarl/stats-policies.csv --env_config configs/env-policies.config 156 | ``` 157 | 158 | This script will save a CSV file called `stats-policies.csv` within the `models/sarl/` folder. 159 | 160 | ### 4.2) Evaluate policy with modified training process (*Training*) 161 | 162 | The following command will run 100 random trials evaluating the performance of the *Training* policy on pedestrians operating under various policies (standard ORCA, CADRL, and Linear with randomized parameters): 163 | ``` 164 | python test.py --policy sarl --model_dir models/training/ --estimate_eps --stats_file models/training/stats-policies.csv --env_config configs/env-policies.config 165 | ``` 166 | 167 | This script will save a CSV file called `stats-policies.csv` within the `models/training/` folder. 
168 | 169 | ### 4.3) Evaluate policy with modified model architecture (*Model*) 170 | 171 | The following command will run 100 random trials evaluating the performance of the *Model* policy on pedestrians operating under various policies (standard ORCA, CADRL, and Linear with randomized parameters): 172 | ``` 173 | python test.py --policy uncertain_sarl --model_dir models/model/ --estimate_eps --stats_file models/model/stats-policies.csv --env_config configs/env-policies.config 174 | ``` 175 | 176 | This script will save a CSV file called `stats-policies.csv` within the `models/model/` folder. 177 | 178 | ### 4.4) Evaluate policy with modified reward function (*Reward*) 179 | 180 | The following command will run 100 random trials evaluating the performance of the *Reward* policy on pedestrians operating under various policies (standard ORCA, CADRL, and Linear with randomized parameters): 181 | ``` 182 | python test.py --policy uncertain_sarl --model_dir models/reward/ --estimate_eps --stats_file models/reward/stats-policies.csv --env_config configs/env-policies.config 183 | ``` 184 | 185 | This script will save a CSV file called `stats-policies.csv` within the `models/reward/` folder. 186 | 187 | ### 4.5) Evaluate baseline ORCA policy (*ORCA*) 188 | 189 | The following command will run 100 random trials evaluating the performance of the baseline ORCA policy on pedestrians operating under various policies (standard ORCA, CADRL, and Linear with randomized parameters): 190 | ``` 191 | python test.py --policy orca --estimate_eps --stats_file models/orca/stats-policies.csv --env_config configs/env-policies.config 192 | ``` 193 | 194 | This script will save a CSV file called `stats-policies.csv` within the `models/orca/` folder. 195 | 196 | ### 4.6) Create table comparing policy performance 197 | 198 | To compare the performance of these four RL policies (plus the ORCA policy), you can use the compare script in the `control` directory: 199 | ``` 200 | python compare.py --files models/orca/stats-policies.csv models/sarl/stats-policies.csv models/training/stats-policies.csv models/model/stats-policies.csv models/reward/stats-policies.csv --names ORCA SARL Training Model Reward 201 | ``` 202 | 203 | This command will print a table of results for the ablation study with LaTeX formatting. 204 | 205 | ## 5) Visualize the Trained RL Policies 206 | 207 | ### 5.1) Visualize trials of baseline SARL policy (*SARL*) 208 | 209 | The following command will allow you to visualize a single evaluation trial of the baseline SARL policy: 210 | ``` 211 | python test.py --policy sarl --model_dir models/sarl/ --estimate_eps --max_epsilon --visualize --test_case 1 --video_file videos/sarl/test1.mp4 --env_config 212 | ``` 213 | 214 | This will save a video called `test1.mp4` in the `videos/sarl/` folder showing trial number 1 with the environment configurations specified by `config_file` and the maximum epsilon specified by `epsilon_value`. 
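For reference, a fully specified version of this command might look like the following, using the Noisy ORCA evaluation settings from Section 3. The config file and epsilon value here are just example choices, and the same pattern applies to the commands in Sections 5.2 through 5.4:

```
python test.py --policy sarl --model_dir models/sarl/ --estimate_eps --max_epsilon 0.5 --visualize --test_case 1 --video_file videos/sarl/test1.mp4 --env_config configs/env-noisy.config
```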
215 | 216 | ### 5.2) Visualize trials of policy with modified training process (*Training*) 217 | 218 | The following command will allow you to visualize a single evaluation trial of the *Training* policy: 219 | ``` 220 | python test.py --policy sarl --model_dir models/training/ --estimate_eps --max_epsilon --visualize --test_case 1 --video_file videos/training/test1.mp4 --env_config 221 | ``` 222 | 223 | This will save a video called `test1.mp4` in the `videos/training/` folder showing trial number 1 with the environment configurations specified by `config_file` and the maximum epsilon specified by `epsilon_value`. 224 | 225 | ### 5.3) Visualize trials of policy with modified model architecture (*Model*) 226 | 227 | The following command will allow you to visualize a single evaluation trial of the *Model* policy: 228 | ``` 229 | python test.py --policy uncertain_sarl --model_dir models/model/ --estimate_eps --max_epsilon --visualize --test_case 1 --video_file videos/model/test1.mp4 --env_config 230 | ``` 231 | 232 | This will save a video called `test1.mp4` in the `videos/model/` folder showing trial number 1 with the environment configurations specified by `config_file` and the maximum epsilon specified by `epsilon_value`. 233 | 234 | ### 5.4) Visualize trials of policy with modified reward function (*Reward*) 235 | 236 | The following command will allow you to visualize a single evaluation trial of the *Reward* policy: 237 | ``` 238 | python test.py --policy uncertain_sarl --model_dir models/reward/ --estimate_eps --max_epsilon --visualize --test_case 1 --video_file videos/reward/test1.mp4 --env_config 239 | ``` 240 | 241 | This will save a video called `test1.mp4` in the `videos/reward/` folder showing trial number 1 with the environment configurations specified by `config_file` and the maximum epsilon specified by `epsilon_value`. 242 | 243 | ## Citing Our Work 244 | 245 | If you find this codebase useful, please cite the paper associated with this repository: 246 | 247 | S. Pohland, A. Tan, P. Dutta and C. Tomlin, "Stranger Danger! Identifying and Avoiding Unpredictable Pedestrians in RL-based Social Robot Navigation," 2024 IEEE International Conference on Robotics and Automation (ICRA), Yokohama, Japan, 2024, pp. 15217-15224, doi: 10.1109/ICRA57147.2024.10610413. 248 | 249 | @InProceedings{stranger-danger, \ 250 |   author="Pohland, Sara and Tan, Alvin and Dutta, Prabal and Tomlin, Claire", \ 251 |   title="Stranger Danger! 
Identifying and Avoiding Unpredictable Pedestrians in RL-based Social Robot Navigation", \ 252 |   booktitle="2024 IEEE International Conference on Robotics and Automation (ICRA)", \ 253 |   year="2024", \ 254 |   month="May", \ 255 |   publisher="IEEE", \ 256 |   pages="15217--15224", \ 257 |   doi="10.1109/ICRA57147.2024.10610413" \ 258 |   } -------------------------------------------------------------------------------- /control/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sarapohland/stranger-danger/84a6bad96de4f117d6192b5111eba5531e766e26/control/__init__.py -------------------------------------------------------------------------------- /control/compare.py: -------------------------------------------------------------------------------- 1 | import math 2 | import argparse 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from ast import literal_eval 7 | from tabulate import tabulate 8 | from matplotlib import pyplot as plt 9 | 10 | 11 | def get_result_string(a_series): 12 | a_result = "{:.2f}".format(np.mean(a_series)) 13 | for a_quant in [0.9, 0.95]: 14 | # Quantile (i.e. Value at risk.) 15 | #a_result += "/{:.3f}".format(a_series.quantile(a_quant)) 16 | 17 | # Expected value above the quantile (i.e. Conditional Value at Risk) 18 | a_result += "/{:.2f}".format(np.mean(a_series.loc[(a_series >= a_series.quantile(a_quant))])) 19 | return a_result 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser('Parse configuration file') 23 | parser.add_argument('--files', action='append', nargs='+') 24 | parser.add_argument('--names', action='append', nargs='+') 25 | args = parser.parse_args() 26 | 27 | metric_files = args.files[0] 28 | policy_names = args.names[0] 29 | 30 | # Ignore collisions within the first time step 31 | metrics_dfs = [] 32 | for metric_file in metric_files: 33 | metric_df = pd.read_csv(metric_file) 34 | for index, row in metric_df.iterrows(): 35 | collision_times = literal_eval(row['collision_times']) 36 | collision_positions = literal_eval(row['collision_positions']) 37 | collision_blames = literal_eval(row['rob_collision_blames']) 38 | metric_df.at[index, 'collision_times'] = [] 39 | metric_df.at[index, 'collision_positions'] = [] 40 | metric_df.at[index, 'rob_collision_blames'] = [] 41 | for j, collision_time in enumerate(collision_times): 42 | if collision_time > 0.5: 43 | metric_df.at[index, 'collision_times'].append(collision_time) 44 | metric_df.at[index, 'collision_positions'].append(collision_positions[j]) 45 | metric_df.at[index, 'rob_collision_blames'].append(collision_blames[j]) 46 | num_collisions = len(metric_df.at[index, 'collision_times']) 47 | metric_df.at[index, 'num_collisions'] = num_collisions 48 | if num_collisions == 0 and row['result'] == 'HumanCollision': 49 | metric_df.at[index, 'result'] = 'Timeout' if row['navigation_time'] > 29.0 else 'ReachGoal' 50 | metrics_dfs.append(metric_df) 51 | 52 | # Calculate path lengths 53 | for metrics_df in metrics_dfs: 54 | metrics_df['path_length'] = metrics_df['navigation_time'] * metrics_df['vel_mean'] 55 | 56 | # Define nominal path length and navigation time 57 | V_PREF = 1 # m/s 58 | ROOM_HEIGHT = 15 # meters 59 | GOAL_RADIUS = 0.3 # meters 60 | ROBOT_RADIUS = 0.3 # meters 61 | OPT_PATH_LENGTH = 2 * (ROOM_HEIGHT/2 - 1) - GOAL_RADIUS - ROBOT_RADIUS 62 | OPT_NAV_TIME = OPT_PATH_LENGTH / V_PREF 63 | 64 | results = [[name] for name in policy_names] 65 | for policy_index, metrics_df in enumerate(metrics_dfs): 66 | # 
success/collision/timeout rates 67 | total_trials = len(metrics_df) 68 | results[policy_index].append(sum(metrics_df['result'] == 'ReachGoal') / total_trials * 100) # Success Rate 69 | results[policy_index].append(sum(metrics_df['result'] == 'Timeout') / total_trials * 100) # Timeout Rate 70 | results[policy_index].append(sum(metrics_df['result'] == 'HumanCollision') / total_trials * 100) # Collision Rate 71 | 72 | # Only consider successful runs so we don't skew our navigation times or 73 | # minimum distance values with collisions and timeouts 74 | smol_metrics = metrics_df.loc[(metrics_df['result'] == 'ReachGoal') | (metrics_df['result'] == 'HumanCollision')] 75 | 76 | # normalized navigation time 77 | normed_times = smol_metrics['navigation_time'] / OPT_NAV_TIME 78 | 79 | results[policy_index].append(get_result_string(normed_times)) 80 | # results[policy_index].append(np.mean(normed_times)) 81 | # results[policy_index].append("{:.4f} \u00B1 {:.4f}".format(np.mean(normed_times), np.std(normed_times))) 82 | 83 | # normalized path lengths 84 | normed_lengths = smol_metrics['path_length'] / OPT_PATH_LENGTH 85 | results[policy_index].append(get_result_string(normed_lengths)) 86 | # results[policy_index].append(np.mean(normed_lengths)) 87 | # results[policy_index].append("{:.4f} \u00B1 {:.4f}".format(np.mean(normed_lengths), np.std(normed_lengths))) 88 | 89 | # total number of collisions between the robot and a pedestrian across all trials 90 | sum_collisions = sum(metrics_df['num_collisions']) 91 | results[policy_index].append(sum_collisions) 92 | 93 | # personal space cost 94 | costs = metrics_df['avg_max_cost']*1000 95 | results[policy_index].append(get_result_string(costs)) 96 | # results[policy_index].append(np.mean(costs)) 97 | # results[policy_index].append("{:.4f} \u00B1 {:.4f}".format(np.mean(costs), np.std(costs))) 98 | 99 | # proportion of time spent in someone's personal space 100 | intruded_props = metrics_df['pers_time_intruded'] / metrics_df['navigation_time'] * 100 101 | results[policy_index].append(get_result_string(intruded_props)) 102 | # results[policy_index].append(np.mean(intruded_props)) 103 | # results[policy_index].append("{:.4f} \u00B1 {:.4f}".format(np.mean(intruded_props), np.std(intruded_props))) 104 | 105 | # proportion of time spent in someone's intimate space 106 | intruded_props = metrics_df['int_time_intruded'] / metrics_df['navigation_time'] * 100 107 | results[policy_index].append(get_result_string(intruded_props)) 108 | # results[policy_index].append(np.mean(intruded_props)) 109 | # results[policy_index].append("{:.4f} \u00B1 {:.4f}".format(np.mean(intruded_props), np.std(intruded_props))) 110 | 111 | # # "accountability" of the robot when a collision occurs 112 | # collision_blames = [] 113 | # for row in metrics_df['rob_collision_blames']: 114 | # if row: 115 | # collision_blames += [val for val in row] 116 | # if collision_blames: 117 | # avg_accountability = np.mean(collision_blames) 118 | # std_accountability = np.std(collision_blames) 119 | # results[policy_index].append(avg_accountability) 120 | # # results[policy_index].append("{:.4f} \u00B1 {:.4f}".format(avg_accountability, std_accountability)) 121 | 122 | # Print the table so it looks nice-ish 123 | headers = ['Navigation Policy', 'Success Rate', 'Timeout Rate', 'Collision Rate', 124 | 'Relative Navigation Time', 'Relative Path Length', 'Number of Collisions', 125 | 'Personal Space Cost', 'Personal Space Violation', 'Intimate Space Violation'] 126 | 127 | # print(tabulate(results, 
headers=headers, tablefmt="github", numalign='center', floatfmt=".4f")) 128 | print(tabulate(results, headers=headers, tablefmt="latex", numalign='center', floatfmt=".3f")) 129 | 130 | 131 | if __name__ == '__main__': 132 | main() 133 | -------------------------------------------------------------------------------- /control/configs/README.md: -------------------------------------------------------------------------------- 1 | # Configuration Files 2 | 3 | 4 | ## Environment Configuration 5 | 6 | ### Environment 7 | - **Time limit** – The time limit is a float value corresponding to the number of seconds the robot is allowed to take in navigating to its goal before the simulation is ended. This value should be adjusted based on the simulation configurations. 8 | - **Time step** – The time step is a float value corresponding to the number of seconds between each simulation update. At each time step, the robot and each of the humans chooses a new action according to its policy. It is recommended that this value be set to 0.25 seconds. 9 | - **Validation size** – The validation size is an integer value corresponding to the number of episodes that are run in the validation stage while training the RL policy. 10 | - **Test size** – The test size is an integer value corresponding to the number of episode that are used to evaluate the trained policy. 11 | 12 | ### Reward 13 | - **Success reward** – The success reward is the positive reward the robot receives once it reaches its goal. 14 | - **Collision penalty** – The collision penalty is the negative reward the robot receives for colliding with a human. 15 | - **Discomfort distance** – The discomfort distance is the minimum distance between the robot and the human that is considered comfortable. 16 | - **Discomfort penalty factor** – When the robot is within the discomfort distance, it receives a negative reward based on how close the robot is to the human. 17 | - **Time penalty** – The time penalty is the negative reward the robot receives at each time step. 18 | - **Progress reward** – The progress reward is the reward the robot receives for making progress towards its goal at each step. 19 | - **Goal radius** – The robot is considered to have reached the goal once it is within the goal radius. 20 | 21 | ### Simulation 22 | - **Room width** – Width of the simulated room in meters. 23 | - **Room height** – Height of the simulated room in meters. 24 | - **Default number of humans** – If the number of humans is not being randomized, the default number of humans are placed according to the human generating scheme. 25 | - **Maximum number of humans** – If the number of humans is being randomized, the number of humans is sampled from a uniform distribution with the minimum and maximum values specified. 26 | - **Minimum number of humans** – If the number of humans is being randomized, the number of humans is sampled from a uniform distribution with the minimum and maximum values specified. 27 | - **Maximum density of humans** – If the number of humans is being randomized based on the amount of open space, the number of humans is sampled from a uniform distribution whose minimum and maximum values are determined by the amount of open space and the minimum and maximum densities specified. 
28 | - **Minimum density of humans** – If the number of humans is being randomized based on the amount of open space, the number of humans is sampled from a uniform distribution whose minimum and maximum values are determined by the amount of open space and the minimum and maximum densities specified. 29 | - **Randomize number of humans** – If this boolean value is set to true, the number of humans will be sampled from a uniform distribution based on the minimum and maximum number of humans specified. 30 | - **Randomize density of humans** – If randomize number of humans is set to false and this boolean value is set to true, the number of humans will be sampled from a uniform distribution based on the minimum and maximum density of humans specified. If both boolean values are set to false, the default number of humans will be used. 31 | - **End on collision** – The simulation will end when the robot collides with a human or obstacle if this boolean value is set to true. 32 | - **Plan human path** – A global path will be generated for all of the humans to follow from their initial position to their goal position if this boolean value is set to true. Note that this can significantly increase the time required for training/evaluation. 33 | - **Perpetual** – The humans will be given a new goal position after reaching their goal to allow for perpetual motion if this boolean value is set to true. Note that perpetual motion is not intended to be used in combination with human path planning. 34 | - **Pedestrian crossing scenario** – Default pedestrian-robot interaction scenario to be used when randomness is set to false. Available scenarios are circle, following, passing, crossing, and random. 35 | - **Randomness** – If this boolean value is set to true, the pedestrian crossing scenario will be randomly chosen. 36 | 37 | ### Waypoints 38 | This section is only used for the hardware experiments and can be ignored. 39 | 40 | ### Humans 41 | - **Visible** – Each human is visible to the robot and all other humans if this boolean value is set to true. 42 | - **Policy** – The actions of the humans are determined by its policy. 43 | - **Sensor** – The humans perceive the environment based on their sensor. The only sensor that is currently implemented is called "coordinates" and allows the humans to view the position, velocity, and radius of visible agents. 44 | - **Observability** – The humans can either have full observability of all visible agents in the environment (if observability is specified as 'full'), or they can only view unobstructed agents within their detection range (if observability is specified as 'partial'). 45 | - **Detection range** – The detection range is how far the humans can view another agent. 46 | - **Default radius** – If the radius of the humans is not being randomized, all of the humans have the default radius. 47 | - **Minimum radius** – If the radius of the humans is being randomized, the radius of each human is sampled from a uniform distribution with the minimum and maximum values specified. 48 | - **Maximum radius** – If the radius of the humans is being randomized, the radius of each human is sampled from a uniform distribution with the minimum and maximum values specified. 49 | - **Default preferred velocity** – If the preferred velocity of the humans is not being randomized, all of the humans have the default preferred velocity. 
50 | - **Minimum preferred velocity** – If the preferred velocity of the humans is being randomized, the preferred velocity of each human is sampled from a uniform distribution with the minimum and maximum values specified. 51 | - **Maximum preferred velocity** – If the preferred velocity of the humans is being randomized, the preferred velocity of each human is sampled from a uniform distribution with the minimum and maximum values specified. 52 | - **Default neighbor distance** – If the neighbor distance (ORCA parameter) of the humans is not being randomized, all of the humans have the default neighbor distance. 53 | - **Minimum neighbor distance** – If the neighbor distance of the humans is being randomized, the neighbor distance of each human is sampled from a uniform distribution with the minimum and maximum values specified. 54 | - **Maximum neighbor distance** – If the neighbor distance of the humans is being randomized, the neighbor distance of each human is sampled from a uniform distribution with the minimum and maximum values specified. 55 | - **Default time horizon** – If the time horizon (ORCA parameter) of the humans is not being randomized, all of the humans have the default time horizon. 56 | - **Minimum time horizon** – If the time horizon of the humans is being randomized, the time horizon of each human is sampled from a uniform distribution with the minimum and maximum values specified. 57 | - **Maximum time horizon** – If the time horizon of the humans is being randomized, the time horizon of each human is sampled from a uniform distribution with the minimum and maximum values specified. 58 | - **Randomize radius** – If this boolean value is set to true, the radius of each human will be sampled from a uniform distribution. If it is set to false, the default radius will be assigned to each human. 59 | - **Randomize preferred velocity** – If this boolean value is set to true, the preferred velocity of each human will be sampled from a uniform distribution. If it is set to false, the default preferred velocity will be assigned to each human. 60 | - **Randomize neighbor distance** – If this boolean value is set to true, the neighbor distance of each human will be sampled from a uniform distribution. If it is set to false, the default neighbor distance will be assigned to each human. 61 | - **Randomize time horizon** – If this boolean value is set to true, the time horizon of each human will be sampled from a uniform distribution. If it is set to false, the default time horizon will be assigned to each human. 62 | 63 | ### Robot 64 | - **Visible** – The robot is visible to the humans if this boolean value is set to true. 65 | - **Policy** – If the robot is using the reinforcement learning policy, this parameter can be set to none. The robot can also be controlled using the ORCA policy. 66 | - **Sensor** – The robot perceives the environment based on its sensor. The only sensor that is currently implemented is called "coordinates" and allows the robot to view the position, velocity, and radius of visible agents. 67 | - **Observability** – The robot can either have full observability of all visible agents in the environment (if observability is specified as 'full'), or they can only view unobstructed agents within their detection range (if observability is specified as 'partial'). 68 | - **Detection range** – The detection range is how far the robot can view another agent. 
69 | - **Default radius** – If the radius of the robot is not being randomized, it has the default radius. 70 | - **Minimum radius** – If the radius of the robot is being randomized, its radius is sampled from a uniform distribution with the minimum and maximum values specified. 71 | - **Maximum radius** – If the radius of the robot is being randomized, its radius is sampled from a uniform distribution with the minimum and maximum values specified. 72 | - **Default preferred velocity** – If the preferred velocity of the robot is not being randomized, it has the default preferred velocity. 73 | - **Minimum preferred velocity** – If the preferred velocity of the robot is being randomized, its preferred velocity is sampled from a uniform distribution with the minimum and maximum values specified. 74 | - **Maximum preferred velocity** – If the preferred velocity of the robot is being randomized, its preferred velocity is sampled from a uniform distribution with the minimum and maximum values specified. 75 | - **Randomize radius** – If this boolean value is set to true, the radius of the robot will be sampled from a uniform distribution. If it is set to false, the default radius will be used. 76 | - **Randomize preferred velocity** – If this boolean value is set to true, the preferred velocity of the robot will be sampled from a uniform distribution. If it is set to false, the default preferred velocity will be used. 77 | 78 | 79 | ## Policy Configuration 80 | ### Reinforcement Learning (RL) 81 | - **Gamma** – Gamma is the discount factor used in computing the value function. 82 | 83 | ### Action Space 84 | - **Kinematics** – The kinematics model used to control simulated model determines the format of the action provided by policy. The model must either be "holonomic" or "non-holonomic." A holonomic robot receives actions as x and y velocites, while a non-holonomic robot receives actions as linear and angular velocities. 85 | - **Speed samples** – The action space is discretized into n speeds exponentially spaced in the range (0, v_pref] and m rotations evenly spaced in the range [0,2pi), where n is the number of speed samples and m is the number of rotation samples. 86 | - **Rotation samples** – The action space is discretized into n speeds exponentially spaced in the range (0, v_pref] and m rotations evenly spaced in the range [0,2pi), where n is the number of speed samples and m is the number of rotation samples. 87 | - **Query environment** – If set to true, the robot predicts the next state of the humans/objects and the rewards by querying the simulation environment. Otherwise, the next state and reward are estimated without using the simulation environment. 88 | 89 | ### Reward Function 90 | - **Adjust discomfort distance** – If this boolean value is set to true, an uncertainty-dependent discomfort distance is used in the reward function of the RL policy. 91 | - **Discomfort distance slope** – If eps is the uncertainty associated with a pedestrian, the discomfort distance assigned to this person is d = a * eps + b, where a is the discomfort distance slope and b is the intercept. 92 | - **Discomfort distance intercept** – If eps is the uncertainty associated with a pedestrian, the discomfort distance assigned to this person is d = a * eps + b, where a is the discomfort distance slope and b is the intercept. 93 | 94 | ### Value Network 95 | - **MLP1 dimensions** – Hidden layer sizes of the first set of multilayer perecptrons (MLPs) used by our policy. 
96 | - **MLP2 dimensions** – Hidden layer sizes of the second set of multilayer perceptrons (MLPs) used by our policy. 97 | - **MLP3 dimensions** – Hidden layer sizes of the third set of multilayer perceptrons (MLPs) used by our policy. 98 | - **Attention dimensions** – Hidden layer sizes of the set of multilayer perceptrons (MLPs) that compute the attention scores used by our policy. 99 | - **With global state** – If the global state is used, the mean of the first set of MLPs will be used in computing the attention score. 100 | 101 | ### Safety Policy 102 | This section is only used for the hardware experiments and can be ignored. 103 | 104 | 105 | ## Training Configuration 106 | ### Trainer 107 | - **Batch size** – Number of state transitions in each batch used for training the reinforcement learning (RL) policy. 108 | - **Maximum agents** – If max agents is set to -1, there is no maximum allowable number of agents. 109 | 110 | #### Imitation Learning 111 | - **IL episodes** – Number of imitation learning (IL) episodes run before training the RL policy in order to initialize the data buffer. 112 | - **IL policy** – Policy used to populate the data buffer with state transitions. 113 | - **IL epochs** – Number of epochs used to train the policy on the data collected during imitation learning (IL). 114 | - **IL learning rate** – Learning rate used when training the policy with imitation learning (IL). 115 | - **Safety space** – If the ORCA policy is used for imitation learning and the robot is not visible to the humans, adding safety space improves ORCA performance. 116 | - **Human randomness** – This float indicates the maximum epsilon value of the randomized ORCA policy used to train the imitation learning policy. 117 | 118 | ### Reinforcement Learning 119 | - **RL learning rate** – Learning rate used when training the reinforcement learning (RL) policy. 120 | - **Training batches** – Number of batches to train at the end of each training episode. 121 | - **Training episodes** – Number of training episodes used in the outer loop. 122 | - **Sample episodes** – Number of episodes of experience sampled in each training iteration. 123 | - **Target update interval** – The target network used to update the value network is updated every n episodes, where n is the target update interval. 124 | - **Evaluation interval** – Validation is performed every n episodes, where n is the evaluation interval. 125 | - **Capacity** – The capacity is the maximum number of state transitions that the data buffer can hold. 126 | - **Epsilon start** – The initial epsilon value used in the epsilon-greedy policy. 127 | - **Epsilon end** – The final epsilon value used in the epsilon-greedy policy. 128 | - **Epsilon decay** – The epsilon value used in the epsilon-greedy policy decays from the initial value to the final value over n episodes, where n is the epsilon decay. 129 | - **Checkpoint interval** – The model is saved every n episodes, where n is the checkpoint interval. Training can be resumed from this checkpoint. 130 | - **Human randomness start** – This float indicates the initial maximum epsilon value of the randomized ORCA policy used to train the RL policy. 131 | - **Human randomness end** – This float indicates the final maximum epsilon value of the randomized ORCA policy used to train the RL policy. 132 | - **Human randomness step** – This float indicates how much the maximum epsilon value is incremented each time the value is increased during training. 
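All of these settings are stored in plain INI-style files, so they can also be inspected programmatically with Python's standard `configparser` module. The `getfloat`/`getboolean` accessors below mirror the ones used in `control/policy/sarl.py`; the file path is just an example, assumed relative to the repository root.

```
import configparser

config = configparser.RawConfigParser()
config.read('control/configs/env-train.config')

# A few of the parameters described above.
time_step = config.getfloat('env', 'time_step')            # seconds between simulation updates
discomfort = config.getfloat('reward', 'discomfort_dist')  # comfortable robot-human distance (m)
random_num = config.getboolean('sim', 'random_human_num')  # randomize the number of humans?
print(time_step, discomfort, random_num)
```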
133 | -------------------------------------------------------------------------------- /control/configs/env-noisy.config: -------------------------------------------------------------------------------- 1 | [env] 2 | time_limit = 30 3 | time_step = 0.25 4 | val_size = 100 5 | test_size = 500 6 | 7 | 8 | [reward] 9 | success_reward = 1 10 | collision_penalty = -0.25 11 | discomfort_dist = 0.1 12 | discomfort_penalty_factor = 0.5 13 | time_penalty = 0 14 | progress_reward = 0 15 | goal_radius = 0.3 16 | 17 | 18 | [sim] 19 | room_width = 15 20 | room_height = 15 21 | num_humans = 20 22 | max_human_num = 20 23 | min_human_num = 5 24 | max_human_dens = 0.05 25 | min_human_dens = 0.02 26 | random_human_num = true 27 | random_human_dens = false 28 | end_on_collision = false 29 | plan_human_path = false 30 | perpetual = false 31 | scenario = random 32 | randomness = true 33 | 34 | 35 | [humans] 36 | visible = true 37 | policy = orca 38 | sensor = coordinates 39 | observability = full 40 | range = 100 41 | radius = 0.3 42 | min_radius = 0.2 43 | max_radius = 0.8 44 | v_pref = 1.0 45 | min_v_pref = 0.5 46 | max_v_pref = 2.0 47 | neigh_dist = 10.0 48 | min_neigh_dist = 2.0 49 | max_neigh_dist = 20.0 50 | horizon = 1.5 51 | min_horizon = 0.1 52 | max_horizon = 5.0 53 | randomize_radius = false 54 | randomize_v_pref = false 55 | randomize_neigh_dist = false 56 | randomize_horizon = false 57 | 58 | 59 | [robot] 60 | visible = true 61 | policy = none 62 | sensor = coordinates 63 | observability = full 64 | range = 4 65 | radius = 0.3 66 | max_radius = 0.3 67 | min_radius = 0.3 68 | v_pref = 1 69 | max_v_pref = 1 70 | min_v_pref = 1 71 | randomize_radius = false 72 | randomize_v_pref = false -------------------------------------------------------------------------------- /control/configs/env-policies.config: -------------------------------------------------------------------------------- 1 | [env] 2 | time_limit = 30 3 | time_step = 0.25 4 | val_size = 100 5 | test_size = 100 6 | 7 | 8 | [reward] 9 | success_reward = 1 10 | collision_penalty = -0.25 11 | discomfort_dist = 0.1 12 | discomfort_penalty_factor = 0.5 13 | time_penalty = 0 14 | progress_reward = 0 15 | goal_radius = 0.3 16 | 17 | 18 | [sim] 19 | room_width = 15 20 | room_height = 15 21 | num_humans = 20 22 | max_human_num = 20 23 | min_human_num = 5 24 | max_human_dens = 0.05 25 | min_human_dens = 0.02 26 | random_human_num = true 27 | random_human_dens = false 28 | end_on_collision = false 29 | plan_human_path = false 30 | perpetual = false 31 | scenario = random 32 | randomness = true 33 | 34 | 35 | [humans] 36 | visible = true 37 | policy = random 38 | sensor = coordinates 39 | observability = full 40 | range = 100 41 | radius = 0.3 42 | min_radius = 0.2 43 | max_radius = 0.8 44 | v_pref = 1.0 45 | min_v_pref = 0.5 46 | max_v_pref = 2.0 47 | neigh_dist = 10.0 48 | min_neigh_dist = 2.0 49 | max_neigh_dist = 20.0 50 | horizon = 1.5 51 | min_horizon = 0.1 52 | max_horizon = 5.0 53 | randomize_radius = true 54 | randomize_v_pref = true 55 | randomize_neigh_dist = true 56 | randomize_horizon = true 57 | 58 | 59 | [robot] 60 | visible = true 61 | policy = none 62 | sensor = coordinates 63 | observability = full 64 | range = 4 65 | radius = 0.3 66 | max_radius = 0.3 67 | min_radius = 0.3 68 | v_pref = 1 69 | max_v_pref = 1 70 | min_v_pref = 1 71 | randomize_radius = false 72 | randomize_v_pref = false -------------------------------------------------------------------------------- /control/configs/env-train.config: 
-------------------------------------------------------------------------------- 1 | [env] 2 | time_limit = 30 3 | time_step = 0.25 4 | val_size = 100 5 | test_size = 500 6 | 7 | 8 | [reward] 9 | success_reward = 1 10 | collision_penalty = -0.25 11 | discomfort_dist = 0.1 12 | discomfort_penalty_factor = 0.5 13 | time_penalty = 0 14 | progress_reward = 0 15 | goal_radius = 0.3 16 | 17 | 18 | [sim] 19 | room_width = 15 20 | room_height = 15 21 | num_humans = 20 22 | max_human_num = 20 23 | min_human_num = 5 24 | max_human_dens = 0.05 25 | min_human_dens = 0.02 26 | random_human_num = false 27 | random_human_dens = false 28 | end_on_collision = false 29 | plan_human_path = false 30 | perpetual = false 31 | scenario = random 32 | randomness = true 33 | 34 | 35 | [humans] 36 | visible = true 37 | policy = orca 38 | sensor = coordinates 39 | observability = full 40 | range = 100 41 | radius = 0.3 42 | min_radius = 0.2 43 | max_radius = 0.8 44 | v_pref = 1.0 45 | min_v_pref = 0.5 46 | max_v_pref = 2.0 47 | neigh_dist = 10.0 48 | min_neigh_dist = 2.0 49 | max_neigh_dist = 20.0 50 | horizon = 1.5 51 | min_horizon = 0.1 52 | max_horizon = 5.0 53 | randomize_radius = false 54 | randomize_v_pref = false 55 | randomize_neigh_dist = false 56 | randomize_horizon = false 57 | 58 | 59 | [robot] 60 | visible = false 61 | policy = none 62 | sensor = coordinates 63 | observability = full 64 | range = 4 65 | radius = 0.3 66 | max_radius = 0.3 67 | min_radius = 0.3 68 | v_pref = 1 69 | max_v_pref = 1 70 | min_v_pref = 1 71 | randomize_radius = false 72 | randomize_v_pref = false -------------------------------------------------------------------------------- /control/configs/model/policy.config: -------------------------------------------------------------------------------- 1 | [rl] 2 | gamma = 0.9 3 | 4 | 5 | [action_space] 6 | kinematics = nonholonomic 7 | speed_samples = 5 8 | rotation_samples = 16 9 | query_env = false 10 | 11 | 12 | [reward] 13 | adjust_dist = false 14 | dist_slope = 0.0 15 | dist_intercept = 0.2 16 | 17 | 18 | [network] 19 | mlp1_dims = 150, 100 20 | mlp2_dims = 100, 50 21 | mlp3_dims = 150, 100, 100, 1 22 | attention_dims = 100, 100, 1 23 | with_global_state = true 24 | 25 | 26 | [safety] 27 | safety = false 28 | slow = false 29 | margin = 0.75 30 | spread = 3 -------------------------------------------------------------------------------- /control/configs/model/train.config: -------------------------------------------------------------------------------- 1 | [trainer] 2 | batch_size = 100 3 | max_agents = -1 4 | 5 | 6 | [imitation_learning] 7 | il_episodes = 3000 8 | il_policy = orca 9 | il_epochs = 50 10 | il_learning_rate = 0.01 11 | safety_space = 0.15 12 | human_randomness = 0.0 13 | 14 | 15 | [train] 16 | rl_learning_rate = 0.001 17 | train_batches = 100 18 | train_episodes = 12000 19 | sample_episodes = 1 20 | target_update_interval = 50 21 | evaluation_interval = 1000 22 | capacity = 100000 23 | epsilon_start = 0.5 24 | epsilon_end = 0.1 25 | epsilon_decay = 4000 26 | checkpoint_interval = 1000 27 | randomness_start = 0.0 28 | randomness_end = 0.5 29 | randomness_step = 0.1 30 | -------------------------------------------------------------------------------- /control/configs/reward/policy.config: -------------------------------------------------------------------------------- 1 | [rl] 2 | gamma = 0.9 3 | 4 | 5 | [action_space] 6 | kinematics = nonholonomic 7 | speed_samples = 5 8 | rotation_samples = 16 9 | query_env = false 10 | 11 | 12 | [reward] 13 | adjust_dist = true 14 | 
dist_slope = 1.0 15 | dist_intercept = 0.2 16 | 17 | 18 | [network] 19 | mlp1_dims = 150, 100 20 | mlp2_dims = 100, 50 21 | mlp3_dims = 150, 100, 100, 1 22 | attention_dims = 100, 100, 1 23 | with_global_state = true 24 | 25 | 26 | [safety] 27 | safety = false 28 | slow = false 29 | margin = 0.75 30 | spread = 3 -------------------------------------------------------------------------------- /control/configs/reward/train.config: -------------------------------------------------------------------------------- 1 | [trainer] 2 | batch_size = 100 3 | max_agents = -1 4 | 5 | 6 | [imitation_learning] 7 | il_episodes = 3000 8 | il_policy = orca 9 | il_epochs = 50 10 | il_learning_rate = 0.01 11 | safety_space = 0.15 12 | human_randomness = 0.0 13 | 14 | 15 | [train] 16 | rl_learning_rate = 0.001 17 | train_batches = 100 18 | train_episodes = 12000 19 | sample_episodes = 1 20 | target_update_interval = 50 21 | evaluation_interval = 1000 22 | capacity = 100000 23 | epsilon_start = 0.5 24 | epsilon_end = 0.1 25 | epsilon_decay = 4000 26 | checkpoint_interval = 1000 27 | randomness_start = 0.0 28 | randomness_end = 0.5 29 | randomness_step = 0.1 30 | -------------------------------------------------------------------------------- /control/configs/sarl/env.config: -------------------------------------------------------------------------------- 1 | [env] 2 | time_limit = 30 3 | time_step = 0.25 4 | val_size = 100 5 | test_size = 500 6 | 7 | 8 | [reward] 9 | success_reward = 1 10 | collision_penalty = -0.25 11 | discomfort_dist = 0.1 12 | discomfort_penalty_factor = 0.5 13 | time_penalty = 0 14 | progress_reward = 0 15 | goal_radius = 0.3 16 | 17 | 18 | [sim] 19 | room_width = 15 20 | room_height = 15 21 | num_humans = 20 22 | max_human_num = 50 23 | min_human_num = 5 24 | max_human_dens = 0.05 25 | min_human_dens = 0.02 26 | random_human_num = false 27 | random_human_dens = false 28 | end_on_collision = false 29 | plan_human_path = false 30 | perpetual = false 31 | scenario = random 32 | randomness = true 33 | 34 | 35 | [humans] 36 | visible = true 37 | policy = orca 38 | sensor = coordinates 39 | observability = full 40 | range = 100 41 | radius = 0.3 42 | min_radius = 0.2 43 | max_radius = 0.8 44 | v_pref = 1.0 45 | min_v_pref = 0.5 46 | max_v_pref = 2.0 47 | neigh_dist = 10.0 48 | min_neigh_dist = 2.0 49 | max_neigh_dist = 20.0 50 | horizon = 1.5 51 | min_horizon = 0.1 52 | max_horizon = 5.0 53 | randomize_radius = false 54 | randomize_v_pref = false 55 | randomize_neigh_dist = false 56 | randomize_horizon = false 57 | 58 | 59 | [robot] 60 | visible = false 61 | policy = none 62 | sensor = coordinates 63 | observability = full 64 | range = 4 65 | radius = 0.3 66 | max_radius = 0.3 67 | min_radius = 0.3 68 | v_pref = 1 69 | max_v_pref = 1 70 | min_v_pref = 1 71 | randomize_radius = false 72 | randomize_v_pref = false -------------------------------------------------------------------------------- /control/configs/sarl/policy.config: -------------------------------------------------------------------------------- 1 | [rl] 2 | gamma = 0.9 3 | 4 | 5 | [action_space] 6 | kinematics = nonholonomic 7 | speed_samples = 5 8 | rotation_samples = 16 9 | query_env = false 10 | 11 | 12 | [reward] 13 | adjust_dist = false 14 | dist_slope = 0.0 15 | dist_intercept = 0.2 16 | 17 | 18 | [network] 19 | mlp1_dims = 150, 100 20 | mlp2_dims = 100, 50 21 | mlp3_dims = 150, 100, 100, 1 22 | attention_dims = 100, 100, 1 23 | with_global_state = true 24 | 25 | 26 | [safety] 27 | safety = false 28 | slow = false 29 | margin 
= 0.75 30 | spread = 3 -------------------------------------------------------------------------------- /control/configs/sarl/train.config: -------------------------------------------------------------------------------- 1 | [trainer] 2 | batch_size = 100 3 | max_agents = -1 4 | 5 | 6 | [imitation_learning] 7 | il_episodes = 3000 8 | il_policy = orca 9 | il_epochs = 50 10 | il_learning_rate = 0.01 11 | safety_space = 0.15 12 | human_randomness = 0.0 13 | 14 | 15 | [train] 16 | rl_learning_rate = 0.001 17 | train_batches = 100 18 | train_episodes = 12000 19 | sample_episodes = 1 20 | target_update_interval = 50 21 | evaluation_interval = 1000 22 | capacity = 100000 23 | epsilon_start = 0.5 24 | epsilon_end = 0.1 25 | epsilon_decay = 4000 26 | checkpoint_interval = 1000 27 | randomness_start = 0.0 28 | randomness_end = 0.0 29 | randomness_step = 0.1 30 | -------------------------------------------------------------------------------- /control/configs/training/env.config: -------------------------------------------------------------------------------- 1 | [env] 2 | time_limit = 30 3 | time_step = 0.25 4 | val_size = 100 5 | test_size = 500 6 | 7 | 8 | [reward] 9 | success_reward = 1 10 | collision_penalty = -0.25 11 | discomfort_dist = 0.1 12 | discomfort_penalty_factor = 0.5 13 | time_penalty = 0 14 | progress_reward = 0 15 | goal_radius = 0.3 16 | 17 | 18 | [sim] 19 | room_width = 15 20 | room_height = 15 21 | num_humans = 20 22 | max_human_num = 50 23 | min_human_num = 5 24 | max_human_dens = 0.05 25 | min_human_dens = 0.02 26 | random_human_num = false 27 | random_human_dens = false 28 | end_on_collision = false 29 | plan_human_path = false 30 | perpetual = false 31 | scenario = random 32 | randomness = true 33 | 34 | 35 | [humans] 36 | visible = true 37 | policy = orca 38 | sensor = coordinates 39 | observability = full 40 | range = 100 41 | radius = 0.3 42 | min_radius = 0.2 43 | max_radius = 0.8 44 | v_pref = 1.0 45 | min_v_pref = 0.5 46 | max_v_pref = 2.0 47 | neigh_dist = 10.0 48 | min_neigh_dist = 2.0 49 | max_neigh_dist = 20.0 50 | horizon = 1.5 51 | min_horizon = 0.1 52 | max_horizon = 5.0 53 | randomize_radius = false 54 | randomize_v_pref = false 55 | randomize_neigh_dist = false 56 | randomize_horizon = false 57 | 58 | 59 | [robot] 60 | visible = false 61 | policy = none 62 | sensor = coordinates 63 | observability = full 64 | range = 4 65 | radius = 0.3 66 | max_radius = 0.3 67 | min_radius = 0.3 68 | v_pref = 1 69 | max_v_pref = 1 70 | min_v_pref = 1 71 | randomize_radius = false 72 | randomize_v_pref = false -------------------------------------------------------------------------------- /control/configs/training/policy.config: -------------------------------------------------------------------------------- 1 | [rl] 2 | gamma = 0.9 3 | 4 | 5 | [action_space] 6 | kinematics = nonholonomic 7 | speed_samples = 5 8 | rotation_samples = 16 9 | query_env = false 10 | 11 | 12 | [reward] 13 | adjust_dist = false 14 | dist_slope = 0.0 15 | dist_intercept = 0.2 16 | 17 | 18 | [network] 19 | mlp1_dims = 150, 100 20 | mlp2_dims = 100, 50 21 | mlp3_dims = 150, 100, 100, 1 22 | attention_dims = 100, 100, 1 23 | with_global_state = true 24 | 25 | 26 | [safety] 27 | safety = false 28 | slow = false 29 | margin = 0.75 30 | spread = 3 -------------------------------------------------------------------------------- /control/configs/training/train.config: -------------------------------------------------------------------------------- 1 | [trainer] 2 | batch_size = 100 3 | max_agents = -1 4 | 5 
| 6 | [imitation_learning] 7 | il_episodes = 3000 8 | il_policy = orca 9 | il_epochs = 50 10 | il_learning_rate = 0.01 11 | safety_space = 0.15 12 | human_randomness = 0.0 13 | 14 | 15 | [train] 16 | rl_learning_rate = 0.001 17 | train_batches = 100 18 | train_episodes = 12000 19 | sample_episodes = 1 20 | target_update_interval = 50 21 | evaluation_interval = 1000 22 | capacity = 100000 23 | epsilon_start = 0.5 24 | epsilon_end = 0.1 25 | epsilon_decay = 4000 26 | checkpoint_interval = 1000 27 | randomness_start = 0.0 28 | randomness_end = 0.5 29 | randomness_step = 0.1 30 | -------------------------------------------------------------------------------- /control/policy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sarapohland/stranger-danger/84a6bad96de4f117d6192b5111eba5531e766e26/control/policy/__init__.py -------------------------------------------------------------------------------- /control/policy/policy_factory.py: -------------------------------------------------------------------------------- 1 | from control.policy.sarl import SARL 2 | from control.policy.uncertain_sarl import UNCERTAIN_SARL 3 | from simulation.envs.policy.policy_factory import policy_factory 4 | 5 | policy_factory['sarl'] = SARL 6 | policy_factory['uncertain_sarl'] = UNCERTAIN_SARL 7 | -------------------------------------------------------------------------------- /control/policy/sarl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import logging 3 | import itertools 4 | import numpy as np 5 | import torch.nn as nn 6 | import numpy.linalg as la 7 | 8 | from torch.nn.functional import softmax 9 | from shapely.geometry import Polygon 10 | 11 | from simulation.envs.policy.policy import Policy 12 | from simulation.envs.utils.action import ActionRot, ActionXY 13 | from simulation.envs.utils.state import ObservableState, FullState 14 | from control.policy.value_network import ValueNetwork 15 | 16 | 17 | class SARL(Policy): 18 | def __init__(self): 19 | super().__init__() 20 | self.name = 'SARL' 21 | self.trainable = True 22 | self.kinematics = None 23 | self.epsilon = None 24 | self.gamma = None 25 | self.speed_samples = None 26 | self.rotation_samples = None 27 | self.query_env = None 28 | self.action_space = None 29 | self.speeds = None 30 | self.rotations = None 31 | self.action_values = None 32 | self.self_state_dim = 6 33 | self.human_state_dim = 7 34 | self.joint_state_dim = self.self_state_dim + self.human_state_dim 35 | self.multiagent_training = True 36 | 37 | # Safety features 38 | self.safety = None 39 | self.walls = None 40 | 41 | def configure(self, config): 42 | self.gamma = config.getfloat('rl', 'gamma') 43 | 44 | self.kinematics = config.get('action_space', 'kinematics') 45 | self.speed_samples = config.getint('action_space', 'speed_samples') 46 | self.rotation_samples = config.getint('action_space', 'rotation_samples') 47 | self.query_env = config.getboolean('action_space', 'query_env') 48 | 49 | mlp1_dims = [int(x) for x in config.get('network', 'mlp1_dims').split(', ')] 50 | mlp2_dims = [int(x) for x in config.get('network', 'mlp2_dims').split(', ')] 51 | mlp3_dims = [int(x) for x in config.get('network', 'mlp3_dims').split(', ')] 52 | attention_dims = [int(x) for x in config.get('network', 'attention_dims').split(', ')] 53 | with_global_state = config.getboolean('network', 'with_global_state') 54 | 55 | self.model = ValueNetwork(self.joint_state_dim, 
self.self_state_dim, mlp1_dims, 56 | mlp2_dims, mlp3_dims, attention_dims, with_global_state) 57 | logging.info('Policy: Baseline SARL') 58 | 59 | # Safety features 60 | self.safety = config.getboolean('safety', 'safety') 61 | self.slow = config.getboolean('safety', 'slow') 62 | self.margin = config.getfloat('safety', 'margin') 63 | self.spread = config.getint('safety', 'spread') 64 | 65 | if self.safety: 66 | logging.info('Safety controller is activated.') 67 | else: 68 | logging.info('Safety controller is unactivated.') 69 | 70 | def set_device(self, device): 71 | self.device = device 72 | self.model.to(device) 73 | 74 | def set_epsilon(self, epsilon): 75 | self.epsilon = epsilon 76 | 77 | def set_walls(self, walls): 78 | self.walls = walls 79 | 80 | def get_attention_weights(self): 81 | return self.model.attention_weights 82 | 83 | def build_action_space(self, v_pref): 84 | holonomic = self.kinematics == 'holonomic' 85 | speeds = [(np.exp((i + 1) / self.speed_samples) - 1) / (np.e - 1) * v_pref for i in range(self.speed_samples)] 86 | if holonomic: 87 | rotations = np.linspace(0, 2 * np.pi, self.rotation_samples, endpoint=False) 88 | else: 89 | rotations = np.linspace(-np.pi / 4, np.pi / 4, self.rotation_samples) 90 | 91 | action_space = [ActionXY(0, 0) if holonomic else ActionRot(0, -np.pi/4)] 92 | for rotation, speed in itertools.product(rotations, speeds): 93 | if holonomic: 94 | action_space.append(ActionXY(speed * np.cos(rotation), speed * np.sin(rotation))) 95 | else: 96 | action_space.append(ActionRot(speed, rotation)) 97 | 98 | self.speeds = speeds 99 | self.rotations = rotations 100 | self.action_space = action_space 101 | self.free_directions = np.full(len(action_space), True) 102 | 103 | def propagate(self, state, action): 104 | if isinstance(state, ObservableState): 105 | # propagate state of humans 106 | next_px = state.px + action.vx * self.time_step 107 | next_py = state.py + action.vy * self.time_step 108 | next_state = ObservableState(next_px, next_py, action.vx, action.vy, state.radius) 109 | elif isinstance(state, FullState): 110 | # propagate state of current agent 111 | if self.kinematics == 'holonomic': 112 | next_px = state.px + action.vx * self.time_step 113 | next_py = state.py + action.vy * self.time_step 114 | next_state = FullState(next_px, next_py, action.vx, action.vy, state.radius, 115 | state.wx, state.wy, state.v_pref, state.theta) 116 | else: 117 | next_theta = state.theta + action.r 118 | next_vx = action.v * np.cos(next_theta) 119 | next_vy = action.v * np.sin(next_theta) 120 | next_px = state.px + next_vx * self.time_step 121 | next_py = state.py + next_vy * self.time_step 122 | next_state = FullState(next_px, next_py, next_vx, next_vy, state.radius, 123 | state.wx, state.wy, state.v_pref, next_theta) 124 | else: 125 | raise ValueError('State to be propagated is not of a known type.') 126 | return next_state 127 | 128 | def compute_reward(self, nav, obs): 129 | # check for collision 130 | dmin = float('inf') 131 | collision = False 132 | for i, ob in enumerate(obs): 133 | dist = la.norm((nav.px - ob.px, nav.py - ob.py)) - nav.radius - ob.radius 134 | if dist < 0: 135 | collision = True 136 | if dist < dmin: 137 | dmin = dist 138 | 139 | # check if reaching the goal 140 | reaching_goal = la.norm((nav.px - nav.gx, nav.py - nav.gy)) < nav.radius 141 | 142 | # compute reward 143 | if reaching_goal: 144 | reward = 1 145 | elif collision: 146 | reward = -0.25 147 | elif dmin < 0.2: 148 | reward = (dmin - 0.2) * 0.5 * self.time_step 149 | else: 150 | reward 
= 0 151 | return reward 152 | 153 | def transform(self, state, eps=None): 154 | state_tensor = torch.cat([torch.Tensor([state.self_state + human_state]).to(self.device) 155 | for human_state in state.human_states], dim=0) 156 | state_tensor = self.rotate(state_tensor) 157 | return state_tensor 158 | 159 | def rotate(self, state): 160 | # 'px', 'py', 'vx', 'vy', 'radius', 'gx', 'gy', 'v_pref', 'theta', 'px1', 'py1', 'vx1', 'vy1', 'radius1' 161 | # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 162 | batch = state.shape[0] 163 | dx = (state[:, 5] - state[:, 0]).reshape((batch, -1)) 164 | dy = (state[:, 6] - state[:, 1]).reshape((batch, -1)) 165 | rot = torch.atan2(state[:, 6] - state[:, 1], state[:, 5] - state[:, 0]) 166 | 167 | dg = torch.norm(torch.cat([dx, dy], dim=1), 2, dim=1, keepdim=True) 168 | v_pref = state[:, 7].reshape((batch, -1)) 169 | vx = (state[:, 2] * torch.cos(rot) + state[:, 3] * torch.sin(rot)).reshape((batch, -1)) 170 | vy = (state[:, 3] * torch.cos(rot) - state[:, 2] * torch.sin(rot)).reshape((batch, -1)) 171 | 172 | radius = state[:, 4].reshape((batch, -1)) 173 | if self.kinematics == 'unicycle': 174 | theta = (state[:, 8] - rot).reshape((batch, -1)) 175 | else: 176 | theta = torch.zeros_like(v_pref) 177 | 178 | vx1 = (state[:, 11] * torch.cos(rot) + state[:, 12] * torch.sin(rot)).reshape((batch, -1)) 179 | vy1 = (state[:, 12] * torch.cos(rot) - state[:, 11] * torch.sin(rot)).reshape((batch, -1)) 180 | px1 = (state[:, 9] - state[:, 0]) * torch.cos(rot) + (state[:, 10] - state[:, 1]) * torch.sin(rot) 181 | px1 = px1.reshape((batch, -1)) 182 | py1 = (state[:, 10] - state[:, 1]) * torch.cos(rot) - (state[:, 9] - state[:, 0]) * torch.sin(rot) 183 | py1 = py1.reshape((batch, -1)) 184 | radius1 = state[:, 13].reshape((batch, -1)) 185 | radius_sum = radius + radius1 186 | da = torch.norm(torch.cat([(state[:, 0] - state[:, 9]).reshape((batch, -1)), (state[:, 1] - state[:, 10]). 
187 | reshape((batch, -1))], dim=1), 2, dim=1, keepdim=True) 188 | 189 | new_state = torch.cat([dg, v_pref, theta, radius, vx, vy, px1, py1, vx1, vy1, radius1, da, radius_sum], dim=1) 190 | return new_state 191 | 192 | def predict(self, state): 193 | if self.phase is None or self.device is None: 194 | raise AttributeError('Phase and device attributes have to be set.') 195 | if self.phase == 'train' and self.epsilon is None: 196 | raise AttributeError('Epsilon attribute has to be set in training phase.') 197 | 198 | # Stop moving once the goal is reached 199 | if self.reach_destination(state): 200 | return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) 201 | 202 | # Build the action space at the start 203 | if self.action_space is None: 204 | self.build_action_space(state.self_state.v_pref) 205 | if self.safety: 206 | self.build_radar(state.self_state.v_pref, state.self_state.radius) 207 | 208 | # Update free_directions 209 | self.find_free_directions(state.self_state, state.human_states) 210 | 211 | # Robot has observed human 212 | if state.human_states: 213 | # Select action according to epsilon greedy policy 214 | probability = np.random.random() 215 | if self.phase == 'train' and probability < self.epsilon: 216 | max_action = self.action_space[np.random.choice(len(self.action_space))] 217 | else: 218 | self.action_values = list() 219 | max_value = float('-inf') 220 | max_action = None 221 | naive_value = float('-inf') 222 | for free, action in zip(self.free_directions, self.action_space): 223 | # Get input to value network for given action 224 | next_self_state = self.propagate(state.self_state, action) 225 | if self.query_env: 226 | next_human_states, reward, done, info = self.env.onestep_lookahead(action) 227 | else: 228 | next_human_states = [self.propagate(human_state, ActionXY(human_state.vx, human_state.vy)) 229 | for human_state in state.human_states] 230 | reward = self.compute_reward(next_self_state, next_human_states) 231 | batch_next_states = torch.cat([torch.Tensor([next_self_state + next_human_state]).to(self.device) 232 | for next_human_state in next_human_states], dim=0) 233 | rotated_batch_input = self.rotate(batch_next_states).unsqueeze(0) 234 | 235 | # Compute value of action using value network 236 | next_state_value = self.model(rotated_batch_input).data.item() 237 | value = reward + pow(self.gamma, self.time_step * state.self_state.v_pref) * next_state_value 238 | 239 | if value > naive_value: 240 | naive_value = value 241 | if free: 242 | self.action_values.append(value) 243 | if value > max_value: 244 | max_value = value 245 | max_action = action 246 | else: 247 | self.action_values.append(np.nan) 248 | 249 | if max_action is None: 250 | raise ValueError('Value network is not well trained.') 251 | 252 | # if there was a "better" action without safety, then the safety is active 253 | self.safety_active = max_value != naive_value 254 | 255 | if self.phase == 'train': 256 | self.last_state = self.transform(state) 257 | 258 | return max_action 259 | 260 | # Robot has not observed human 261 | else: 262 | # Select action towards goal position 263 | px, py = state.self_state.px, state.self_state.py 264 | wx, wy = state.self_state.wx, state.self_state.wy 265 | theta = state.self_state.theta 266 | speed = min(np.linalg.norm((wy-py, wx-px)), state.self_state.v_pref) 267 | angle = np.arctan2(wy-py, wx-px) 268 | if self.kinematics == 'holonomic': 269 | action = ActionXY(speed * np.cos(angle), speed * np.sin(angle)) 270 | else: 271 | rot = angle - theta 
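# The next line wraps the heading error into [-pi, pi) so the robot turns the short way
# toward its waypoint; for example, rot = 6.5 rad becomes (6.5 + pi) % (2*pi) - pi ~= 0.22 rad.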
272 | rot = (rot + np.pi) % (2 * np.pi) - np.pi 273 | action = ActionRot(speed, rot) 274 | return self.review_action(action, state) 275 | 276 | def build_radar(self, v_pref, robot_radius): 277 | if self.slow: 278 | # Use positive margin for safety buffer. Use 0 margin to disable safety buffer. 279 | if self.margin >= 0: 280 | margins = [(np.exp((i + 1) / self.speed_samples) - 1) / (np.e - 1) * self.margin + robot_radius for i in range(self.speed_samples)] 281 | # Use negative margin for a default 2-second following space. 282 | else: 283 | margins = [(np.exp((i + 1) / self.speed_samples) - 1) / (np.e - 1) * 2 * v_pref + robot_radius for i in range(self.speed_samples)] 284 | # If we aren't slowing down, we just use widest margins. 285 | else: 286 | if self.margin >= 0: 287 | margins = [self.margin + robot_radius] 288 | else: 289 | margins = [2*v_pref + robot_radius] 290 | 291 | # Rotations are standard. 292 | holonomic = self.kinematics == 'holonomic' 293 | if self.rotations is None: 294 | if holonomic: 295 | rotations = np.linspace(0, 2*np.pi, self.rotation_samples, endpoint=False) 296 | else: 297 | rotations = np.linspace(-np.pi / 4, np.pi / 4, self.rotation_samples) 298 | else: 299 | rotations = self.rotations 300 | 301 | radar_vertices = [] 302 | if holonomic: 303 | theta = np.pi / self.rotation_samples 304 | else: 305 | theta = (np.pi / 4) / self.rotation_samples 306 | tan_theta = np.tan(theta) 307 | plus_minus_one = [1, -1] 308 | for rotation, margin in itertools.product(rotations, margins): 309 | new_vertices = [] 310 | radar_line = [margin * np.cos(rotation), margin * np.sin(rotation)] 311 | margin_width = robot_radius 312 | radar_offset = [margin_width * np.sin(rotation), -margin_width * np.cos(rotation)] 313 | new_vertices += [(radar_line[0] + sign * radar_offset[0], radar_line[1] + sign * radar_offset[1]) for sign in plus_minus_one] 314 | new_vertices += [(-sign * radar_offset[0], -sign * radar_offset[1]) for sign in plus_minus_one] 315 | radar_vertices.append(new_vertices) 316 | 317 | self.num_margins = len(margins) 318 | self.margins = margins 319 | self.rotations = rotations 320 | self.radar_vertices = radar_vertices 321 | 322 | def find_free_directions(self, robo, obs): 323 | # If the safety controller is not active, or we don't have any walls, then all directions are free. 
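# Note that free_directions[0] (the null action: stop for holonomic robots, rotate in place
# for nonholonomic ones) is never masked below, so at least one action remains admissible
# even when every radar wedge intersects a wall.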
324 | if not self.safety or self.walls is None: 325 | self.free_directions[1:] = True 326 | return 327 | holonomic = self.kinematics == 'holonomic' 328 | 329 | free_list = [] 330 | sin_t = np.sin(robo.theta) 331 | cos_t = np.cos(robo.theta) 332 | 333 | for vertices in self.radar_vertices: 334 | if holonomic: 335 | shifted_vertices = [(robo.position[0] + vertex[0], robo.position[1] + vertex[1]) for vertex in vertices] 336 | else: 337 | # rotate counterclockwise by robo.theta radians 338 | rotated_vertices = [(cos_t*vertex[0] - sin_t*vertex[1], sin_t*vertex[0] + cos_t*vertex[1]) for vertex in vertices] 339 | shifted_vertices = [(robo.position[0] + vertex[0], robo.position[1] + vertex[1]) for vertex in rotated_vertices] 340 | radar_wedge = Polygon(shifted_vertices) 341 | free = True 342 | if self.walls.intersects(radar_wedge): 343 | free = False 344 | 345 | # In slow mode, each (heading, margin) wedge gets its own entry; otherwise the 346 | # single wedge result is repeated for every speed at this heading. 347 | 348 | if self.slow: 349 | free_list.append(free) 350 | else: 351 | free_list += [free for i in range(self.speed_samples)] 352 | 353 | free_list = np.array(free_list) 354 | roller = np.ones((len(free_list), 1 + 2*self.spread)) 355 | roller[:, 0] = free_list 356 | for i in range(1, self.spread + 1): 357 | if holonomic: 358 | roller[:, i] = np.roll(free_list, i * self.speed_samples) 359 | roller[:, i + self.spread] = np.roll(free_list, -i * self.speed_samples) 360 | else: # non-holonomic spread does not wrap around 361 | roller[i*self.speed_samples:, i] = free_list[:-i*self.speed_samples] 362 | roller[:-i*self.speed_samples, i + self.spread] = free_list[i*self.speed_samples:] 363 | 364 | # Update our list of free directions 365 | self.free_directions[1:] = roller.all(axis=1) 366 | 367 | # Provide an option to find the nearest safe action 368 | def review_action(self, proposed_action, state): 369 | # If we don't care about safety, just return the proposed action 370 | if not self.safety: 371 | return proposed_action 372 | 373 | # Set things up if necessary.
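# Here, "setting up" means returning the stop action once the goal has been reached and
# building the radar wedges if they have not been constructed yet.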
374 | if self.reach_destination(state): 375 | return ActionXY(0, 0) if self.kinematics == 'holonomic' else ActionRot(0, 0) 376 | if self.safety and self.radar_vertices is None: 377 | self.build_radar(state.self_state.v_pref, state.self_state.radius) 378 | 379 | # Identify areas that have obstacles in the way 380 | self.find_free_directions(state.self_state, state.human_states) 381 | 382 | # Determine x and y components of proposed action 383 | holonomic = self.kinematics == 'holonomic' 384 | if holonomic: 385 | proposed_ax = proposed_action[0] 386 | proposed_ay = proposed_action[1] 387 | else: 388 | # if we want to rotate a bunch, just rotate without moving forward 389 | proposed_ax = proposed_action[0] * np.cos(proposed_action[1]) 390 | proposed_ay = proposed_action[0] * np.sin(proposed_action[1]) 391 | 392 | # We find the closest action to the proposed action that is still safe 393 | closest_action = 0 394 | min_dist = np.linalg.norm([proposed_ax, proposed_ay]) 395 | naive_min_dist = min_dist 396 | if self.action_space is None: 397 | self.build_action_space(state.self_state.v_pref) 398 | for i, action in enumerate(self.action_space): 399 | if holonomic: 400 | ax = action[0] 401 | ay = action[1] 402 | else: 403 | ax = action[0] * np.cos(action[1]) 404 | ay = action[0] * np.sin(action[1]) 405 | new_dist = np.linalg.norm([proposed_ax - ax, proposed_ay - ay]) 406 | if new_dist < min_dist and self.free_directions[i]: 407 | min_dist = new_dist 408 | closest_action = i 409 | if new_dist < naive_min_dist: 410 | naive_min_dist = new_dist 411 | # log when the safety controller is actively preventing an "optimal" path 412 | self.safety_active = min_dist != naive_min_dist 413 | 414 | if self.slow: 415 | return self.action_space[closest_action] 416 | else: 417 | if holonomic: 418 | speed = np.linalg.norm(proposed_action) 419 | action_direction = self.action_space[closest_action] 420 | speed_factor = speed/np.linalg.norm(action_direction) 421 | return ActionXY(action_direction.vx * speed_factor, action_direction.vy * speed_factor) 422 | else: 423 | speed = proposed_action[0] 424 | action_direction = self.action_space[closest_action][1] 425 | return ActionRot(speed, action_direction) 426 | -------------------------------------------------------------------------------- /control/policy/value_network.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | def mlp(input_dim, mlp_dims, last_relu=False): 5 | layers = [] 6 | mlp_dims = [input_dim] + mlp_dims 7 | for i in range(len(mlp_dims) - 1): 8 | layers.append(nn.Linear(mlp_dims[i], mlp_dims[i + 1])) 9 | if i != len(mlp_dims) - 2 or last_relu: 10 | layers.append(nn.ReLU()) 11 | net = nn.Sequential(*layers) 12 | return net 13 | 14 | class ValueNetwork(nn.Module): 15 | def __init__(self, input_dim, self_state_dim, mlp1_dims, mlp2_dims, mlp3_dims, attention_dims, with_global_state): 16 | super().__init__() 17 | self.self_state_dim = self_state_dim 18 | self.global_state_dim = mlp1_dims[-1] 19 | self.mlp1 = mlp(input_dim, mlp1_dims, last_relu=True) 20 | self.mlp2 = mlp(mlp1_dims[-1], mlp2_dims) 21 | self.with_global_state = with_global_state 22 | if with_global_state: 23 | self.attention = mlp(mlp1_dims[-1] * 2, attention_dims) 24 | else: 25 | self.attention = mlp(mlp1_dims[-1], attention_dims) 26 | mlp3_input_dim = mlp2_dims[-1] + self.self_state_dim 27 | self.mlp3 = mlp(mlp3_input_dim, mlp3_dims) 28 | self.attention_weights = None 29 | 30 | def forward(self, state): 31 | size = 
state.shape 32 | self_state = state[:, 0, :self.self_state_dim] 33 | 34 | # compute embedding vector and pairwise interaction feature 35 | mlp1_input = state.view((-1, size[2])) 36 | mlp1_output = self.mlp1(mlp1_input) 37 | mlp2_output = self.mlp2(mlp1_output) 38 | 39 | # compute attention scores 40 | if self.with_global_state: 41 | global_state = torch.mean(mlp1_output.view(size[0], size[1], -1), 1, keepdim=True) 42 | global_state = global_state.expand((size[0], size[1], self.global_state_dim)).\ 43 | contiguous().view(-1, self.global_state_dim) 44 | attention_input = torch.cat([mlp1_output, global_state], dim=1) 45 | else: 46 | attention_input = mlp1_output 47 | scores = self.attention(attention_input).view(size[0], size[1], 1).squeeze(dim=2) 48 | 49 | # compute normalized weights (masked softmax) 50 | scores_exp = torch.exp(scores) * (scores != 0).float() 51 | weights = (scores_exp / torch.sum(scores_exp, dim=1, keepdim=True)).unsqueeze(2) 52 | self.attention_weights = weights[0, :, 0].data.cpu().numpy() 53 | 54 | # obtain a compact representation of the observed humans 55 | # output feature is a linear combination of input features 56 | features = mlp2_output.view(size[0], size[1], -1) 57 | weighted_feature = torch.sum(torch.mul(weights, features), dim=1) 58 | 59 | # concatenate agent's state with global weighted humans' state 60 | joint_state = torch.cat([self_state, weighted_feature], dim=1) 61 | value = self.mlp3(joint_state) 62 | return value 63 | -------------------------------------------------------------------------------- /control/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import torch 4 | import logging 5 | import argparse 6 | import configparser 7 | import numpy as np 8 | import numpy.linalg as la 9 | 10 | from control.utils.explorer import Explorer, collision_blame 11 | from control.policy.policy_factory import policy_factory 12 | from simulation.envs.utils.robot import Robot 13 | from simulation.envs.policy.orca import ORCA 14 | from simulation.envs.utils.info import * 15 | 16 | # This program throws a lot of RuntimeWarnings that can be ignored, so we suppress warnings. 17 | # If you are trying to troubleshoot, please comment out these lines. Thanks! 
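# Example invocation, assuming a SARL model has already been trained and saved
# (the model directory placeholder is illustrative):
#   python test.py --policy sarl --model_dir <trained-model-dir> --visualize --test_case 0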
18 | import warnings 19 | warnings.filterwarnings('ignore', category=UserWarning) 20 | warnings.filterwarnings('ignore', category=RuntimeWarning) 21 | 22 | 23 | def main(): 24 | parser = argparse.ArgumentParser('Parse configuration file') 25 | parser.add_argument('--policy', type=str, default='orca') 26 | parser.add_argument('--phase', type=str, default='test') 27 | parser.add_argument('--model_dir', type=str, default=None) 28 | parser.add_argument('--test_case', type=int, default=None) 29 | parser.add_argument('--max_epsilon', type=float, default=0.0) 30 | parser.add_argument('--num_episodes', type=float, default=0.0) 31 | parser.add_argument('--video_file', type=str, default=None) 32 | parser.add_argument('--stats_file', type=str, default='stats.csv') 33 | parser.add_argument('--env_config', type=str, default='configs/env.config') 34 | parser.add_argument('--policy_config', type=str, default='configs/policy.config') 35 | parser.add_argument('--estimate_eps', default=False, action='store_true') 36 | parser.add_argument('--il', default=False, action='store_true') 37 | parser.add_argument('--gpu', default=False, action='store_true') 38 | parser.add_argument('--traj', default=False, action='store_true') 39 | parser.add_argument('--visualize', default=False, action='store_true') 40 | args = parser.parse_args() 41 | 42 | # configure logging and device 43 | # logging.basicConfig(level=logging.INFO, format='%(asctime)s, %(levelname)s: %(message)s', 44 | # datefmt="%Y-%m-%d %H:%M:%S", filename='thicc_boi.log', filemode='a') 45 | device = torch.device("cuda:0" if torch.cuda.is_available() and args.gpu else "cpu") 46 | # logging.info('Using device: %s', device) 47 | 48 | # load model and read config files 49 | env_config_file = args.env_config 50 | if args.model_dir is not None: 51 | policy_config_file = os.path.join(args.model_dir, os.path.basename(args.policy_config)) 52 | if args.il: 53 | model_weights = os.path.join(args.model_dir, 'il_model.pth') 54 | else: 55 | if os.path.exists(os.path.join(args.model_dir, 'resumed_rl_model.pth')): 56 | model_weights = os.path.join(args.model_dir, 'resumed_rl_model.pth') 57 | else: 58 | model_weights = os.path.join(args.model_dir, 'rl_model.pth') 59 | else: 60 | policy_config_file = args.policy_config 61 | 62 | # configure policy 63 | policy = policy_factory[args.policy]() 64 | policy_config = configparser.RawConfigParser() 65 | policy_config.read(policy_config_file) 66 | policy.configure(policy_config) 67 | if policy.trainable: 68 | if args.model_dir is None: 69 | parser.error('Trainable policy must be specified with a model weights directory') 70 | policy.get_model().load_state_dict(torch.load(model_weights)) 71 | 72 | # configure environment 73 | env_config = configparser.RawConfigParser() 74 | env_config.read(env_config_file) 75 | env = gym.make('CrowdSim-v0') 76 | env.configure(env_config) 77 | robot = Robot(env_config, 'robot') 78 | robot.set_policy(policy) 79 | env.set_robot(robot) 80 | 81 | # initialize policy 82 | policy.set_phase(args.phase) 83 | policy.set_device(device) 84 | if isinstance(robot.policy, ORCA): 85 | if robot.visible: 86 | robot.policy.safety_space = 0 87 | else: 88 | robot.policy.safety_space = 0 89 | # logging.info('ORCA agent buffer: %f', robot.policy.safety_space) 90 | policy.set_env(env) 91 | robot.print_info() 92 | 93 | # visualize evaluation 94 | if args.visualize: 95 | ob = env.reset(phase=args.phase, test_case=args.test_case, max_epsilon=args.max_epsilon) 96 | done = False 97 | last_pos = np.array(robot.get_position()) 98 | 
while not done: 99 | eps = env.get_epsilons(args.estimate_eps) 100 | action = robot.act(ob, eps) 101 | ob, _, done, info = env.step(action) 102 | current_pos = np.array(robot.get_position()) 103 | # logging.debug('Speed: %.2f', la.norm(current_pos - last_pos) / robot.time_step) 104 | last_pos = current_pos 105 | 106 | print('Testing: {} scenario with {} pedestrians'.format(env.scenario, len(env.humans))) 107 | print('Result: {}'.format(info)) 108 | 109 | if args.traj: 110 | env.render('traj', args.video_file) 111 | else: 112 | env.render('video', args.video_file) 113 | 114 | # run evaluation without visualization 115 | else: 116 | save_dir = os.path.dirname(args.stats_file) 117 | if not os.path.exists(save_dir): 118 | os.makedirs(save_dir) 119 | explorer = Explorer(env, robot, device, gamma=0.9, stats_file=args.stats_file) 120 | k = int(args.num_episodes if args.num_episodes > 0 else env.case_size[args.phase]) 121 | explorer.run_k_episodes(k, args.phase, print_failure=True, max_epsilon=args.max_epsilon, estimate_eps=args.estimate_eps) 122 | 123 | 124 | if __name__ == '__main__': 125 | main() 126 | -------------------------------------------------------------------------------- /control/train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import gym 4 | import torch 5 | import shutil 6 | import logging 7 | import argparse 8 | import configparser 9 | 10 | from simulation.envs.utils.robot import Robot 11 | from control.utils.trainer import Trainer 12 | from control.utils.memory import ReplayMemory 13 | from control.utils.explorer import Explorer 14 | from control.policy.policy_factory import policy_factory 15 | 16 | # This program throws a lot of RuntimeWarnings that can be ignored, so we suppress warnings. 17 | # If you are trying to troubleshoot, please comment out these lines. Thanks! 18 | import warnings 19 | warnings.filterwarnings('ignore', category=RuntimeWarning) 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser('Parse configuration file') 23 | parser.add_argument('--policy', type=str, default='sarl') 24 | parser.add_argument('--env_config', type=str, default='configs/env.config') 25 | parser.add_argument('--policy_config', type=str, default='configs/policy.config') 26 | parser.add_argument('--train_config', type=str, default='configs/train.config') 27 | parser.add_argument('--output_dir', type=str, default='model/') 28 | parser.add_argument('--gpu', default=False, action='store_true') 29 | parser.add_argument('--debug', default=False, action='store_true') 30 | parser.add_argument('--resume', default=False, action='store_true') 31 | args = parser.parse_args() 32 | 33 | # configure paths 34 | make_new_dir = True 35 | if os.path.exists(args.output_dir): 36 | key = input('Output directory already exists! Overwrite the folder? 
(y/n)') 37 | if key == 'y' and not args.resume: 38 | shutil.rmtree(args.output_dir) 39 | else: 40 | make_new_dir = False 41 | args.env_config = os.path.join(args.output_dir, os.path.basename(args.env_config)) 42 | args.policy_config = os.path.join(args.output_dir, os.path.basename(args.policy_config)) 43 | args.train_config = os.path.join(args.output_dir, os.path.basename(args.train_config)) 44 | if make_new_dir: 45 | os.makedirs(args.output_dir) 46 | shutil.copy(args.env_config, args.output_dir) 47 | shutil.copy(args.policy_config, args.output_dir) 48 | shutil.copy(args.train_config, args.output_dir) 49 | log_file = os.path.join(args.output_dir, 'output.log') 50 | il_weight_file = os.path.join(args.output_dir, 'il_model.pth') 51 | rl_weight_file = os.path.join(args.output_dir, 'rl_model.pth') 52 | 53 | # configure logging and device 54 | mode = 'a' if args.resume else 'w' 55 | file_handler = logging.FileHandler(log_file, mode=mode) 56 | stdout_handler = logging.StreamHandler(sys.stdout) 57 | level = logging.INFO if not args.debug else logging.DEBUG 58 | logging.basicConfig(level=level, handlers=[stdout_handler, file_handler], 59 | format='%(asctime)s, %(levelname)s: %(message)s', datefmt="%Y-%m-%d %H:%M:%S") 60 | device = torch.device("cuda:0" if torch.cuda.is_available() and args.gpu else "cpu") 61 | logging.info('Using device: %s', device) 62 | 63 | # configure policy 64 | policy = policy_factory[args.policy]() 65 | if not policy.trainable: 66 | parser.error('Policy has to be trainable.') 67 | if args.policy_config is None: 68 | parser.error('Policy config has to be specified for a trainable network.') 69 | policy_config = configparser.RawConfigParser() 70 | policy_config.read(args.policy_config) 71 | policy.configure(policy_config) 72 | policy.set_device(device) 73 | 74 | # configure environment 75 | env_config = configparser.RawConfigParser() 76 | env_config.read(args.env_config) 77 | env = gym.make('CrowdSim-v0') 78 | env.configure(env_config) 79 | robot = Robot(env_config, 'robot') 80 | env.set_robot(robot) 81 | 82 | # read training parameters 83 | if args.train_config is None: 84 | parser.error('Train config file has to be specified for a trainable network.') 85 | train_config = configparser.RawConfigParser() 86 | train_config.read(args.train_config) 87 | rl_learning_rate = train_config.getfloat('train', 'rl_learning_rate') 88 | train_batches = train_config.getint('train', 'train_batches') 89 | train_episodes = train_config.getint('train', 'train_episodes') 90 | sample_episodes = train_config.getint('train', 'sample_episodes') 91 | target_update_interval = train_config.getint('train', 'target_update_interval') 92 | evaluation_interval = train_config.getint('train', 'evaluation_interval') 93 | capacity = train_config.getint('train', 'capacity') 94 | epsilon_start = train_config.getfloat('train', 'epsilon_start') 95 | epsilon_end = train_config.getfloat('train', 'epsilon_end') 96 | epsilon_decay = train_config.getfloat('train', 'epsilon_decay') 97 | checkpoint_interval = train_config.getint('train', 'checkpoint_interval') 98 | randomness_start = train_config.getfloat('train', 'randomness_start') 99 | randomness_end = train_config.getfloat('train', 'randomness_end') 100 | randomness_step = train_config.getfloat('train', 'randomness_step') 101 | 102 | # configure trainer and explorer 103 | model = policy.get_model() 104 | memory = ReplayMemory(capacity) 105 | batch_size = train_config.getint('trainer', 'batch_size') 106 | max_agents = train_config.getint('trainer', 'max_agents') 107 | 
trainer = Trainer(model, memory, device, batch_size) 108 | explorer = Explorer(env, robot, device, memory, policy.gamma, policy, max_agents) 109 | 110 | # imitation learning 111 | if args.resume: 112 | if not os.path.exists(rl_weight_file): 113 | logging.error('RL weights file does not exist; cannot resume training.') 114 | model.load_state_dict(torch.load(rl_weight_file)) 115 | rl_weight_file = os.path.join(args.output_dir, 'resumed_rl_model.pth') 116 | logging.info('Resuming training with RL trained weights.') 117 | elif os.path.exists(il_weight_file): 118 | model.load_state_dict(torch.load(il_weight_file)) 119 | logging.info('Loading imitation learning trained weights.') 120 | else: 121 | il_episodes = train_config.getint('imitation_learning', 'il_episodes') 122 | il_policy = train_config.get('imitation_learning', 'il_policy') 123 | il_epochs = train_config.getint('imitation_learning', 'il_epochs') 124 | il_learning_rate = train_config.getfloat('imitation_learning', 'il_learning_rate') 125 | human_randomness = train_config.getfloat('imitation_learning', 'human_randomness') 126 | trainer.set_learning_rate(il_learning_rate) 127 | if robot.visible: 128 | safety_space = 0 129 | else: 130 | safety_space = train_config.getfloat('imitation_learning', 'safety_space') 131 | il_policy = policy_factory[il_policy]() 132 | il_policy.multiagent_training = policy.multiagent_training 133 | il_policy.safety_space = safety_space 134 | robot.set_policy(il_policy) 135 | explorer.run_k_episodes(il_episodes, 'train', update_memory=True, imitation_learning=True, max_epsilon=human_randomness) 136 | trainer.optimize_epoch(il_epochs) 137 | torch.save(model.state_dict(), il_weight_file) 138 | logging.info('Finished imitation learning and saved weights.') 139 | logging.info('Experience set size: %d/%d', len(memory), memory.capacity) 140 | explorer.update_target_model(model) 141 | 142 | # reinforcement learning 143 | policy.set_env(env) 144 | robot.set_policy(policy) 145 | robot.print_info() 146 | trainer.set_learning_rate(rl_learning_rate) 147 | # fill the memory pool with some RL experience 148 | if args.resume: 149 | robot.policy.set_epsilon(epsilon_end) 150 | explorer.run_k_episodes(100, 'train', update_memory=True, episode=0) 151 | logging.info('Experience set size: %d/%d', len(memory), memory.capacity) 152 | 153 | episode = 0 154 | max_randomness = randomness_start 155 | randomness_episodes = (randomness_end - randomness_start) / randomness_step + 1 156 | randomness_interval = round(train_episodes / randomness_episodes) 157 | print("\n\nStarting epsilon-greedy RL training with {} stages starting from epsilon = {} and going to epsilon = {}.".format(train_episodes, epsilon_start, epsilon_end)) 158 | print("Note: This includes curriculum training, with maximum pedestrian randomness starting at {} and going up to {}.\nMaximum pedestrian randomness increases by {} every {} RL training stages.".format(randomness_start, randomness_end, randomness_step, randomness_interval)) 159 | while episode < train_episodes: 160 | if args.resume: 161 | epsilon = epsilon_end 162 | else: 163 | if episode < epsilon_decay: 164 | epsilon = epsilon_start + (epsilon_end - epsilon_start) / epsilon_decay * episode 165 | else: 166 | epsilon = epsilon_end 167 | robot.policy.set_epsilon(epsilon) 168 | 169 | print("\n========================================") 170 | print("Stage {} of {}: epsilon-greedy = {}".format(episode + 1, train_episodes, epsilon)) 171 | print("========================================\n") 172 | 173 | 174 | # update the 
maximum randomness of humans 175 | if episode != 0 and episode % randomness_interval == 0: 176 | max_randomness += randomness_step 177 | 178 | # evaluate the model 179 | if episode % evaluation_interval == 0: 180 | explorer.run_k_episodes(env.case_size['val'], 'val', episode=episode, max_epsilon=0.5) 181 | 182 | # sample k episodes into memory and optimize over the generated memory 183 | explorer.run_k_episodes(sample_episodes, 'train', update_memory=True, episode=episode, max_epsilon=max_randomness) 184 | trainer.optimize_batch(train_batches) 185 | episode += 1 186 | 187 | if episode % target_update_interval == 0: 188 | explorer.update_target_model(model) 189 | 190 | if episode != 0 and episode % checkpoint_interval == 0: 191 | torch.save(model.state_dict(), rl_weight_file) 192 | 193 | # final test 194 | explorer.run_k_episodes(env.case_size['test'], 'test', episode=episode, max_epsilon=max_randomness) 195 | 196 | 197 | if __name__ == '__main__': 198 | main() 199 | -------------------------------------------------------------------------------- /control/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sarapohland/stranger-danger/84a6bad96de4f117d6192b5111eba5531e766e26/control/utils/__init__.py -------------------------------------------------------------------------------- /control/utils/explorer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import time 4 | import copy 5 | import torch 6 | import pickle 7 | import logging 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from simulation.envs.utils.info import * 12 | 13 | 14 | def average(input_list): 15 | if input_list: 16 | return sum(input_list) / len(input_list) 17 | else: 18 | return 0 19 | 20 | def curvature(p0, p1, p2): 21 | area = _get_area(p0, p1, p2) 22 | d0 = _get_dist(p0, p1) 23 | d1 = _get_dist(p1, p2) 24 | d2 = _get_dist(p2, p0) 25 | curv = 4*area/(d0*d1*d2) 26 | return 0 if math.isnan(curv) else curv 27 | 28 | def _get_area(p0, p1, p2): 29 | return (p1[0] - p0[0])*(p2[1] - p0[1]) - (p1[1] - p0[1])*(p2[0] - p0[0]) 30 | 31 | def _get_dist(p0, p1): 32 | return np.sqrt((p0[0] - p1[0])**2 + (p0[1] - p1[1])**2) 33 | 34 | def collision_blame(h_pos, h_vel, r_pos, r_vel): 35 | w = np.array(h_pos) - np.array(r_pos) 36 | w = w / np.linalg.norm(w) 37 | robot_blame = r_vel[0] * w[0] + r_vel[1] * w[1] 38 | human_blame = -(h_vel[0] * w[0] + h_vel[1] * w[1]) 39 | return robot_blame - human_blame 40 | 41 | def discomfort_dist(num_hum): 42 | m = (0.1 - 0.45) / (20 - 6) 43 | b = 0.1 - 20 * m 44 | y = m * num_hum + b 45 | return np.clip(y, 0.1, 0.45) 46 | 47 | 48 | class Explorer(object): 49 | def __init__(self, env, robot, device, memory=None, gamma=None, target_policy=None, max_agents=None, stats_file=None): 50 | self.env = env 51 | self.robot = robot 52 | self.device = device 53 | self.memory = memory 54 | self.gamma = gamma 55 | self.target_policy = target_policy 56 | self.max_agents = max_agents 57 | self.target_model = None 58 | self.stats_file = stats_file 59 | 60 | def update_target_model(self, target_model): 61 | self.target_model = copy.deepcopy(target_model) 62 | 63 | def run_k_episodes(self, k, phase, update_memory=False, episode=None, 64 | imitation_learning=False, print_failure=False, max_epsilon=0, estimate_eps=False): 65 | self.robot.policy.set_phase(phase) 66 | 67 | success_times = [] 68 | timeout_times = [] 69 | collision_times = [] 70 | 71 | min_dist = [] 72 | 
min_dist_danger = [] 73 | cumulative_rewards = [] 74 | 75 | timeout_cases = [] 76 | collision_cases = [] 77 | # collision_positions = [] 78 | 79 | avg_robot_vel = [] 80 | avg_robot_acc = [] 81 | avg_robot_jer = [] 82 | avg_human_vel = [] 83 | avg_human_acc = [] 84 | avg_human_jer = [] 85 | 86 | success = 0 87 | collision = 0 88 | timeout = 0 89 | too_close = 0 90 | 91 | stats = {'test_case':[],'scenario':[],'perpetual':[],'num_humans':[],'open_space':[],'navigation_time':[], 92 | 'result':[],'num_collisions':[],'collision_times':[],'collision_positions':[],'collision_blames':[], 93 | 'inference_time_mean':[],'inference_time_std':[],'inference_time_min':[],'inference_time_max':[], 94 | 'min_dist_mean':[],'min_dist_std':[],'min_dist_min':[], 95 | 'vel_mean':[],'vel_std':[],'acc_mean':[],'acc_std':[],'jerk_mean':[],'jerk_std':[], 96 | 'curvature_mean': [], 'curvature_std': [], 97 | 'time_intruded':[],'intruded_min_dist_mean':[],'intruded_min_dist_std':[], 98 | 'intruded_vel_mean':[],'intruded_vel_std':[],'intruded_acc_mean':[], 99 | 'intruded_acc_std':[],'intruded_jerk_mean':[],'intruded_jerk_std':[]} 100 | 101 | # Some printouts to help explain the following tqdm loading bar 102 | if imitation_learning: 103 | print("Generating data with ORCA for imitation learning ({} episodes with a maximum pedestrian randomness of {}).".format(k, max_epsilon)) 104 | elif phase == 'train': 105 | print("Generating data for training ({} episodes with a maximum pedestrian randomness of {}).".format(k, max_epsilon)) 106 | elif phase == 'val': 107 | print("Validating on {} episodes with a maximum pedestrian randomness of {}.".format(k, max_epsilon)) 108 | elif phase == 'test': 109 | print("Testing on {} episodes with a maximum pedestrian randomness of {}.".format(k, max_epsilon)) 110 | 111 | from tqdm import tqdm 112 | for trial in tqdm(range(k)): 113 | ob = self.env.reset(phase, max_epsilon=max_epsilon) 114 | 115 | done = False 116 | result = None 117 | states = [] 118 | actions = [] 119 | rewards = [] 120 | epsilons = [] 121 | robot_pos = [] 122 | robot_vel = [] 123 | human_vel = [] 124 | perf_time = [] 125 | dmins = [] 126 | intruded_dmins = [] 127 | times_intruded = 0 128 | absolute_minimum_distance = np.inf 129 | vel_intruded = [] 130 | episode_collision_times = [] 131 | episode_collision_pos = [] 132 | episode_collision_blames = [] 133 | 134 | new_eps = None 135 | eps = None 136 | exp_alpha = 0.8 137 | 138 | while not done: 139 | perf_start = time.perf_counter() 140 | new_eps = self.env.get_epsilons(estimate_eps) 141 | if eps is None: 142 | eps = new_eps 143 | else: 144 | eps = exp_alpha*eps + (1 - exp_alpha)*new_eps 145 | action = self.robot.act(ob, eps) 146 | perf_end = time.perf_counter() 147 | perf_time.append(perf_end - perf_start) 148 | 149 | ob, reward, done, info = self.env.step(action) 150 | states.append(self.robot.policy.last_state) 151 | actions.append(action) 152 | rewards.append(reward) 153 | epsilons.append(eps) 154 | 155 | robot_pos.append([self.robot.px, self.robot.py]) 156 | robot_vel.append([self.robot.vx, self.robot.vy]) 157 | for human in self.env.humans: 158 | human_vel.append([human.vx, human.vy]) 159 | 160 | if isinstance(info, Danger): 161 | too_close += 1 162 | min_dist_danger.append(info.min_dist) 163 | 164 | if isinstance(info, HumanCollision): 165 | h_pos = info.human.get_position() 166 | h_vel = info.human.get_velocity() 167 | r_pos = self.robot.get_position() 168 | r_vel = self.robot.get_velocity() 169 | 170 | episode_collision_times.append(self.env.global_time) 171 | 
episode_collision_pos.append([round(coord,4) for coord in info.coll_pos]) 172 | episode_collision_blames.append(collision_blame(h_pos, h_vel, r_pos, r_vel)) 173 | 174 | dmins.append(self.env.dmin) 175 | absolute_minimum_distance = min(absolute_minimum_distance, self.env.dmin) 176 | 177 | # record intrusions into personal space 178 | disc_dist = discomfort_dist(len(self.env.humans)) 179 | if self.env.dmin < disc_dist: 180 | times_intruded += 1 181 | intruded_dmins.append(self.env.dmin) 182 | vel_intruded.append([self.robot.vx, self.robot.vy]) 183 | 184 | num_collisions = len(episode_collision_times) 185 | 186 | if num_collisions > 0: 187 | collision += 1 188 | collision_cases.append(trial) 189 | collision_times.append(episode_collision_times) 190 | result = 'HumanCollision' 191 | 192 | else: 193 | if isinstance(info, ReachGoal): 194 | success += 1 195 | success_times.append(self.env.global_time) 196 | result = 'ReachGoal' 197 | elif isinstance(info, HumanCollision): 198 | collision += 1 199 | collision_cases.append(trial) 200 | collision_times.append(self.env.global_time) 201 | result = 'HumanCollision' 202 | elif isinstance(info, Timeout): 203 | timeout += 1 204 | timeout_cases.append(trial) 205 | timeout_times.append(self.env.time_limit) 206 | result = 'Timeout' 207 | else: 208 | raise ValueError('Invalid end signal from environment') 209 | 210 | timeStep = self.env.time_step 211 | 212 | # Robot navigation metrics 213 | curves = [] 214 | for i in range(len(robot_pos)-4): 215 | p0 = robot_pos[i] 216 | p1 = robot_pos[i+2] 217 | p2 = robot_pos[i+4] 218 | curves.append(np.abs(curvature(p0, p1, p2))) 219 | 220 | robot_vel = np.array(robot_vel) 221 | robotDF = pd.DataFrame(robot_vel, columns=['vx', 'vy']) 222 | robotDF[['ax', 'ay']] = robotDF[['vx', 'vy']].diff(4)#/timeStep 223 | robotDF[['jx', 'jy']] = robotDF[['ax', 'ay']].diff(4)#/timeStep 224 | robotDF['vel'] = np.sqrt(np.square(robotDF[['vx', 'vy']]).sum(axis=1)) 225 | robotDF['acc'] = np.sqrt(np.square(robotDF[['ax', 'ay']]).sum(axis=1)) 226 | robotDF['jer'] = np.sqrt(np.square(robotDF[['jx', 'jy']]).sum(axis=1)) 227 | avg_robot_vel.append(np.mean(robotDF['vel'])) 228 | avg_robot_acc.append(np.mean(robotDF['acc'])) 229 | avg_robot_jer.append(np.mean(robotDF['jer'])) 230 | 231 | # Human navigation metrics 232 | if human_vel: 233 | human_vel = np.array(human_vel) 234 | humanDF = pd.DataFrame(human_vel, columns=['vx', 'vy']) 235 | humanDF[['ax', 'ay']] = humanDF[['vx', 'vy']].diff(4)#/timeStep 236 | humanDF[['jx', 'jy']] = humanDF[['ax', 'ay']].diff(4)#/timeStep 237 | humanDF['vel'] = np.sqrt(np.square(humanDF[['vx', 'vy']]).sum(axis=1)) 238 | humanDF['acc'] = np.sqrt(np.square(humanDF[['ax', 'ay']]).sum(axis=1)) 239 | humanDF['jer'] = np.sqrt(np.square(humanDF[['jx', 'jy']]).sum(axis=1)) 240 | avg_human_vel.append(np.mean(humanDF['vel'])) 241 | avg_human_acc.append(np.mean(humanDF['acc'])) 242 | avg_human_jer.append(np.mean(humanDF['jer'])) 243 | 244 | if len(vel_intruded) > 0: 245 | vel_intruded = np.array(vel_intruded) 246 | intrudedDF = pd.DataFrame(vel_intruded, columns=['vx', 'vy']) 247 | intrudedDF[['ax', 'ay']] = intrudedDF[['vx', 'vy']].diff(4)#/timeStep 248 | intrudedDF[['jx', 'jy']] = intrudedDF[['ax', 'ay']].diff(4)#/timeStep 249 | intrudedDF['vel'] = np.sqrt(np.square(intrudedDF[['vx', 'vy']]).sum(axis=1)) 250 | intrudedDF['acc'] = np.sqrt(np.square(intrudedDF[['ax', 'ay']]).sum(axis=1)) 251 | intrudedDF['jer'] = np.sqrt(np.square(intrudedDF[['jx', 'jy']]).sum(axis=1)) 252 | intrude_list = [] 253 | 
intrude_list.append(np.mean(intrudedDF['vel'])) # intruded_vel_mean 254 | intrude_list.append(np.std(intrudedDF['vel'])) # intruded_vel_std 255 | intrude_list.append(np.mean(intrudedDF['acc'])) # intruded_acc_mean 256 | intrude_list.append(np.std(intrudedDF['acc'])) # intruded_acc_std 257 | intrude_list.append(np.mean(intrudedDF['jer'])) # intruded_jerk_mean 258 | intrude_list.append(np.std(intrudedDF['jer'])) # intruded_jerk_std 259 | else: 260 | intrude_list = [np.nan for _ in range(6)] 261 | 262 | # Record some metrics into the stats file 263 | stats['test_case'].append(trial) 264 | stats['scenario'].append(self.env.scenario) 265 | stats['perpetual'].append(self.env.perpetual) 266 | stats['num_humans'].append(len(self.env.humans)) 267 | stats['open_space'].append(self.env.open_space) 268 | stats['navigation_time'].append(self.env.global_time) 269 | stats['result'].append(result) 270 | stats['num_collisions'].append(num_collisions) 271 | stats['collision_times'].append(episode_collision_times) 272 | stats['collision_positions'].append(episode_collision_pos) 273 | stats['collision_blames'].append(episode_collision_blames) 274 | stats['inference_time_mean'].append(np.mean(perf_time)) 275 | stats['inference_time_std'].append(np.std(perf_time)) 276 | stats['inference_time_min'].append(np.min(perf_time)) 277 | stats['inference_time_max'].append(np.max(perf_time)) 278 | stats['min_dist_mean'].append(np.mean(dmins)) 279 | stats['min_dist_std'].append(np.std(dmins)) 280 | stats['min_dist_min'].append(absolute_minimum_distance) 281 | stats['curvature_mean'].append(np.mean(curves)) 282 | stats['curvature_std'].append(np.std(curves)) 283 | stats['vel_mean'].append(np.mean(robotDF['vel'])) 284 | stats['vel_std'].append(np.std(robotDF['vel'])) 285 | stats['acc_mean'].append(np.mean(robotDF['acc'])) 286 | stats['acc_std'].append(np.std(robotDF['acc'])) 287 | stats['jerk_mean'].append(np.mean(robotDF['jer'])) 288 | stats['jerk_std'].append(np.std(robotDF['jer'])) 289 | stats['time_intruded'].append(timeStep*times_intruded) 290 | stats['intruded_min_dist_mean'].append(np.mean(intruded_dmins)) 291 | stats['intruded_min_dist_std'].append(np.std(intruded_dmins)) 292 | stats['intruded_vel_mean'].append(intrude_list[0]) 293 | stats['intruded_vel_std'].append(intrude_list[1]) 294 | stats['intruded_acc_mean'].append(intrude_list[2]) 295 | stats['intruded_acc_std'].append(intrude_list[3]) 296 | stats['intruded_jerk_mean'].append(intrude_list[4]) 297 | stats['intruded_jerk_std'].append(intrude_list[5]) 298 | 299 | if self.env.dmin != float('inf'): 300 | min_dist.append(self.env.dmin) 301 | 302 | if update_memory: 303 | if isinstance(info, ReachGoal) or isinstance(info, HumanCollision): 304 | # only add positive(success) or negative(collision) experience in experience set 305 | self.update_memory(states, actions, rewards, epsilons, imitation_learning) 306 | 307 | cumulative_rewards.append(sum([pow(self.gamma, t * self.robot.time_step * self.robot.v_pref) 308 | * reward for t, reward in enumerate(rewards)])) 309 | 310 | stats_df = pd.DataFrame(stats) 311 | if self.stats_file is not None: 312 | stats_df.to_csv(self.stats_file, index=False) 313 | 314 | success_rate = success / k 315 | collision_rate = collision / k 316 | timeout_rate = timeout / k 317 | assert success + collision + timeout == k 318 | avg_nav_time = sum(success_times) / len(success_times) if success_times else self.env.time_limit 319 | 320 | extra_info = '' if episode is None else 'in episode {} '.format(episode) 321 | logging.info('{:<5} {} Success 
rate: {:.2f}, Collision rate: {:.2f}, Timeout rate: {:.2f}, Nav time: {:.2f}'. 322 | format(phase.upper(), extra_info, success_rate, collision_rate, timeout_rate, avg_nav_time)) 323 | 324 | if phase in ['val', 'test']: 325 | if min_dist: 326 | logging.info('Average minimum distance between robot and pedestrian: %.2f', average(min_dist)) 327 | else: 328 | logging.info('Average minimum distance between robot and pedestrian: %.2f', -1) 329 | 330 | if min_dist_danger: 331 | logging.info('Average minimum distance when robot is too close: %.2f', average(min_dist_danger)) 332 | else: 333 | logging.info('Average minimum distance when robot is too close: %.2f', -1) 334 | 335 | avg_avg_robot_vel = sum(avg_robot_vel) / len(avg_robot_vel) 336 | avg_avg_robot_acc = sum(avg_robot_acc) / len(avg_robot_acc) 337 | avg_avg_robot_jer = sum(avg_robot_jer) / len(avg_robot_jer) 338 | 339 | logging.info('Avg robot speed: {:.2f}, Avg robot accel: {:.2f}, Avg robot jerk: {:.2f}'. 340 | format(avg_avg_robot_vel, avg_avg_robot_acc, avg_avg_robot_jer)) 341 | 342 | if avg_human_vel: 343 | avg_avg_human_vel = sum(avg_human_vel) / len(avg_human_vel) 344 | avg_avg_human_acc = sum(avg_human_acc) / len(avg_human_acc) 345 | avg_avg_human_jer = sum(avg_human_jer) / len(avg_human_jer) 346 | else: 347 | avg_avg_human_vel = -1 348 | avg_avg_human_acc = -1 349 | avg_avg_human_jer = -1 350 | 351 | logging.info('Avg pedestrian speed: {:.2f}, Avg pedestrian accel: {:.2f}, Avg pedestrian jerk: {:.2f}'. 352 | format(avg_avg_human_vel, avg_avg_human_acc, avg_avg_human_jer)) 353 | 354 | if print_failure: 355 | logging.info('Collision cases: ' + ' '.join([str(x) for x in collision_cases])) 356 | logging.info('Timeout cases: ' + ' '.join([str(x) for x in timeout_cases])) 357 | # logging.info('Collision locations: ' + ' '.join([str([round(x[0], 2), round(x[1], 2)]) for x in collision_positions])) 358 | 359 | def update_memory(self, states, actions, rewards, epsilons, imitation_learning=False): 360 | if self.memory is None or self.gamma is None: 361 | raise ValueError('Memory or gamma value is not set!') 362 | 363 | for i, state in enumerate(states): 364 | reward = rewards[i] 365 | epsilon = epsilons[i] 366 | 367 | # VALUE UPDATE 368 | if imitation_learning: 369 | # define the value of states in IL as cumulative discounted rewards, which is the same in RL 370 | state = self.target_policy.transform(state, epsilon) 371 | value = sum([pow(self.gamma, max(t - i, 0) * self.robot.time_step * self.robot.v_pref) * reward 372 | * (1 if t >= i else 0) for t, reward in enumerate(rewards)]) 373 | else: 374 | if i == len(states) - 1: 375 | value = reward 376 | else: 377 | next_state = states[i + 1] 378 | gamma_bar = pow(self.gamma, self.robot.time_step * self.robot.v_pref) 379 | value = reward + gamma_bar * self.target_model(next_state.unsqueeze(0)).data.item() 380 | value = torch.Tensor([value]).to(self.device) 381 | 382 | # transform state of different human_num into fixed-size tensor 383 | if self.max_agents is not None and self.max_agents > 0: 384 | if len(state.size()) == 1: 385 | human_num = 1 386 | feature_size = state.size()[0] 387 | else: 388 | human_num, feature_size = state.size() 389 | if human_num != self.max_agents: 390 | padding = torch.zeros((self.max_agents - human_num, feature_size)) 391 | state = torch.cat([state, padding]) 392 | self.memory.push((state, value)) 393 | -------------------------------------------------------------------------------- /control/utils/memory.py: 
-------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | 3 | 4 | class ReplayMemory(Dataset): 5 | def __init__(self, capacity): 6 | self.capacity = capacity 7 | self.memory = list() 8 | self.position = 0 9 | 10 | def push(self, item): 11 | # replace old experience with new experience 12 | if len(self.memory) < self.position + 1: 13 | self.memory.append(item) 14 | else: 15 | self.memory[self.position] = item 16 | self.position = (self.position + 1) % self.capacity 17 | 18 | def is_full(self): 19 | return len(self.memory) == self.capacity 20 | 21 | def __getitem__(self, item): 22 | return self.memory[item] 23 | 24 | def __len__(self): 25 | return len(self.memory) 26 | 27 | def clear(self): 28 | self.memory = list() 29 | -------------------------------------------------------------------------------- /control/utils/plot.py: -------------------------------------------------------------------------------- 1 | import re 2 | import argparse 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def running_mean(x, n): 8 | cumsum = np.cumsum(np.insert(x, 0, 0)) 9 | return (cumsum[n:] - cumsum[:-n]) / float(n) 10 | 11 | 12 | def main(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('model_dir', type=str) 15 | parser.add_argument('--plot_sr', default=False, action='store_true') 16 | parser.add_argument('--plot_cr', default=False, action='store_true') 17 | parser.add_argument('--plot_tr', default=False, action='store_true') 18 | parser.add_argument('--window_size', type=int, default=200) 19 | args = parser.parse_args() 20 | 21 | ax1_legends = ['Training', 'Validation'] 22 | 23 | # Read output log file 24 | with open(args.model_dir + '/output.log', 'r') as file: 25 | log = file.read() 26 | 27 | val_pattern = r"VAL in episode (?P\d+) Success rate: (?P[0-1].\d+), " \ 28 | r"Collision rate: (?P[0-1].\d+), Timeout rate: (?P[0-1].\d+), Nav time: (?P