├── .gitignore ├── LICENSE ├── README.md ├── environment.yml ├── examples ├── mountain_car │ ├── config_mountaincar.py │ ├── run_mountain_car_multiple.py │ └── run_mountaincar.py ├── pendulum │ ├── config_pendulum.py │ ├── run_pendulum.py │ └── run_pendulum_multiple.py └── process_control │ ├── config_process_control.py │ ├── run_process_control.py │ └── run_processc_control_multiple.py ├── images ├── Article_results.png ├── Cost_runs_MountainCarContinuous-v0.png ├── Cost_runs_Pendulum-v0.png ├── anim_mountain_car.gif ├── anim_pendulum.gif ├── anim_process_control_actionschange.gif ├── anim_process_control_base.gif ├── anim_process_control_timevarying.gif ├── control_anim_MountainCarContinuous-v0.gif ├── control_anim_Pendulum-v0.gif ├── gif_total.gif ├── history_mountain_car.png ├── history_pendulum.png ├── model_mountain_car.png ├── model_pendulum.png ├── mountaincar_constraints-2023-05-13_13.23.24_adjusted.gif ├── mountaincar_constraints-2023-05-13_13.23.24_adjusted.mkv └── process_control_base_history.png └── rl_gp_mpc ├── __init__.py ├── config_classes ├── actions_config.py ├── controller_config.py ├── memory_config.py ├── model_config.py ├── observation_config.py ├── reward_config.py ├── total_config.py ├── training_config.py ├── utils │ └── functions_process_config.py └── visu_config.py ├── control_objects ├── __init__.py ├── actions_mappers │ ├── abstract_action_mapper.py │ ├── action_init_functions.py │ ├── derivative_action_mapper.py │ └── normalization_action_mapper.py ├── controllers │ ├── abstract_controller.py │ ├── gp_mpc_controller.py │ └── iteration_info_class.py ├── memories │ └── gp_memory.py ├── models │ ├── abstract_model.py │ └── gp_model.py ├── observations_states_mappers │ ├── abstract_observation_state_mapper.py │ └── normalization_observation_state_mapper.py ├── states_reward_mappers │ ├── abstract_state_reward_mapper.py │ └── setpoint_distance_reward_mapper.py └── utils │ ├── data_utils.py │ └── pytorch_utils.py ├── envs └── process_control.py ├── run_env_function.py └── visu_objects ├── dynamic_2d_graph.py ├── static_2d_graph.py ├── static_3d_graph.py ├── utils.py └── visu_object.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | **/folder_save/ 3 | **/__pycache__/ 4 | launch.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 SimonRennotte 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control 2 | ![control animation](https://github.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/blob/master/images/gif_total.gif?) 3 | 4 | 5 |

6 | 7 |

8 | 9 |

10 | 11 |

12 | 13 | ## Overview 14 | Unofficial implementation of the paper [Data-Efficient Reinforcement Learning with Probabilistic Model Predictive Control](https://arxiv.org/pdf/1706.06491v1.pdf) with Pytorch and GPyTorch. 15 | 16 | ### Abstract of the paper 17 | Trial-and-error based reinforcement learning (RL) has seen rapid advancements in recent times, especially with the advent of deep neural networks. 18 | However, the majority of autonomous RL algorithms either rely on engineered features or a large number of interactions with the environment. 19 | Such a large number of interactions may be impractical in many real-world applications. 20 | For example, robots are subject to wear and tear and, hence, millions of interactions may change or damage the system. 21 | Moreover, practical systems have limitations in the form of the maximum torque that can be safely applied. 22 | To reduce the number of system interactions while naturally handling constraints, we propose a model-based RL framework based on Model Predictive Control (MPC). 23 | In particular, we propose to learn a probabilistic transition model using Gaussian Processes (GPs) to incorporate model uncertainties into long-term predictions, thereby, 24 | reducing the impact of model errors. We then use MPC to find a control sequence that minimises the expected long-term cost. 25 | We provide theoretical guarantees for the first-order optimality in the GP-based transition models with deterministic approximate inference for long-term planning. 26 | The proposed framework demonstrates superior data efficiency and learning rates compared to the current state of the art. 27 | 28 | --- 29 | 30 | ## Table of contents 31 | * [Usage](#Usage) 32 | * [Installation](#installation) 33 | * [How to run](#run) 34 | * [Examples](#examples) 35 | * [Pendulum-v0](#pendulum-v0) 36 | * [MountainCarContinuous-v0](#mountaincarcontinuous-v0) 37 | * [ProcessControl](#process_control) 38 | * [Advanced functionalities](#advanced_functionalities) 39 | * [States constraints](#states_constraints) 40 | * [Actions change limitation](#actions_change_limitation) 41 | * [Time varying model](#time_varying_model) 42 | * [Resources](#resources) 43 | * [Brief explanation of the method](#brief-explanation) 44 | * [Why is this paper important](#why-is-this-paper-important) 45 | * [Remarks](#remarks) 46 | * [Differences from the paper](#differences) 47 | * [Limitations](#limitations) 48 | * [Talks/Tutorials](#talks-tutorials) 49 | * [Papers](#papers) 50 | * [Textbooks](#textbooks) 51 | * [Projects](#projects) 52 | 53 | 54 | 55 | ## Usage 56 | 57 | 58 | 59 | ### Installation 60 | #### Dependencies 61 | numpy, gym, pytorch, gpytorch, matplotlib, scikit-learn, ffmpeg 62 | 63 | #### Run the pendulum example 64 | Download [anaconda](https://www.anaconda.com/products/individual) 65 | Open an anaconda prompt window: 66 | ``` 67 | git clone https://github.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control 68 | cd Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control 69 | conda env create -f environment.yml 70 | conda activate gp_rl_env 71 | python examples/pendulum/run_pendulum.py 72 | ``` 73 | To apply the method on different gym environments, refer to the /examples folder. 74 | Note that the control_config object must be defined so that it works for the environment. 75 | Some parameters depend on the dimensionality of actions and observations.
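As a rough sketch of what this looks like in practice, the pattern below mirrors examples/pendulum/run_pendulum.py; the environment id and the config module are placeholders for your own environment and configuration, not files that exist in the repository:

```
import gym

from rl_gp_mpc.config_classes.visu_config import VisuConfig
from rl_gp_mpc.run_env_function import run_env
# Placeholder: copy one of the examples/*/config_*.py files and adapt the list
# lengths to the observation/action dimensions of your environment.
# The example scripts also append the repository root to sys.path before these imports.
from config_my_env import get_config

env = gym.make('MyEnv-v0')  # placeholder environment id
control_config = get_config(len_horizon=15, num_repeat_actions=1)
visu_config = VisuConfig()
costs = run_env(env, control_config, visu_config,
                random_actions_init=10, num_steps=150, verbose=True)
```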
76 | 77 | 78 | 79 | ## Examples 80 | In each experiment, two types of plots are available to observe and comprehend the control process: 81 | 82 | - 2D Plots: 83 | - The top graph displays the states, along with the predicted states and uncertainty from a specified number of previous time steps. The value of "n" is mentioned in the legend. 84 | - The middle graph depicts the actions taken during control. 85 | - The bottom graph illustrates the real cost alongside the predicted trajectory cost, which is the mean of future predicted costs, along with its associated uncertainty. 86 | 87 | - 3D Plots: 88 | - These plots showcase the Gaussian process models and the stored data points. 89 | - Each graph in the top row represents the change in states for the next step based on the current states and actions. 90 | - The indices on the x and y axes denote the states or actions being represented. For example, an index of 3 corresponds to the action for the pendulum. Actions have higher indices than states. 91 | - Since it's not feasible to display every input of the Gaussian process on the 3D graph, the axes are chosen to represent the two inputs (state or action) with the smallest lengthscales. Hence, the x-y axes may differ for each graph. 92 | - The graphs in the bottom row demonstrate the predicted uncertainty, while the data points represent prediction errors. Points stored in the memory of the Gaussian process model are depicted in green, while points that were not used for Gaussian process predictions due to their similarity to existing points in memory are shown in black. 93 | 94 | During the control process, a dynamic graph, similar to the 2D plot described earlier, provides real-time visualization of the evolving states, actions, and costs. It also displays the predicted states, actions, and costs computed by the model for the Model Predictive Control (MPC). The predicted future states, actions, and costs are represented by dashed lines, accompanied by their confidence intervals (2 standard deviations). 95 | 96 | 97 | 98 | ### Pendulum-v0 99 | The following figure shows the mean cost over 10 runs: 100 | 101 |

102 | 103 |

104 | 105 | We can see that the model makes it possible to control the environment in fewer than a hundred interactions. 106 | As a comparison, the state-of-the-art model-free reinforcement learning algorithms in https://github.com/quantumiracle/SOTA-RL-Algorithms solve the problem in more than 15 episodes of 200 interactions with the environment. 107 | 108 | The following animation shows an example of control. 109 | 110 |

111 | 112 |

113 | 114 | And the Gaussian process models and points in memory: 115 | 116 |

117 | 118 |

119 | 120 | 121 |
122 | 123 | ### MountainCarContinuous-v0 124 | 125 | The mountain car problem is different in that controlling the environment requires planning over a larger number of time steps. To keep the horizon manageable, the action-repeat parameter has been set to 5: in the example shown, one control time step corresponds to 5 environment time steps during which the action is held constant. 126 | 127 | The mean costs over 10 runs can be seen in the following figure: 128 | 129 |

130 | 131 |

132 | 133 | As for the pendulum, optimal control is obtained in very few steps compared to state-of-the-art model-free reinforcement learning agents. 134 | 135 |

136 | 137 |

138 | 139 |

140 | 141 |

142 | 143 | 144 |
145 | 146 | ### Process Control 147 | 148 | To assess control performance in the presence of noise, multiple actions, and time-varying parameters, a custom gym environment has been created. 149 | This environment simulates a straightforward process control scenario involving the regulation of a level and concentration within a tank. 150 | 151 | For detailed information about the environment, please refer to the file located in the /envs/ folder. 152 | 153 | The control is represented in the following animation: 154 | 155 |

156 | 157 |

158 | 159 |
160 | 161 | ## Advanced functionalities 162 | 163 | 164 | 165 | ### MountainCarContinuous-v0 with states constraints 166 | 167 | The predicted future states can be accessed within the object during each control iteration. 168 | This means the future distribution can be used to set constraints on the states. 169 | In this case, the penalties have been added for states that fall outside an allowed region. 170 | 171 | To illustrate control with constraints, an example is provided below using the mountain car scenario. 172 | The graph displays the permissible boundaries indicated by dotted lines. 173 | 174 | The following constraints have been added: 175 | - The car is not allowed to reach the top but must stay on the clif. 176 | - The maximum speed of the car is limited 177 | - The car can't go too much left 178 | 179 |

180 | 181 |

182 | 183 | The control is reached with few violations of the constraints while still optimizing the cost function. 184 | 185 |
186 | 187 | ### Process control: Limiting the action changes 188 | 189 | By changing the way that the optimized actions are defined, it is possible to limit the changes of actions. 190 | 191 | The following animation represents an example of this functionality for the process control environment. 192 | 193 |

194 | 195 |

196 | 197 |
198 | 199 | ### Process control: Time varying parameters and model 200 | 201 | In the previous examples, the model assumed a static environment without any changes over time. 202 | 203 | However, by introducing an extra input to the model that represents the iteration time, we can assign more significance to recent data points. This enables the model to adapt to changes in the environment as time progresses. 204 | 205 | The learned time lengthscale provides insight into the pace at which the environment evolves. 206 | 207 | To illustrate this functionality, an animation is presented for the process control environment. 208 | In this animation, the parameters changes every 200 timesteps. 209 | 210 |

211 | 212 |

213 | 214 |
215 | 216 | ## Resources 217 | 218 | 219 | 220 | ### Brief explanation of the method 221 | The approach uses a model to control the environment. 222 | This family of methods is called Model Predictive Control (MPC). 223 | At each interaction with the real environment, the optimal action is obtained through an iterative approach. 224 | The model is used to evaluate certain actions over a fixed time horizon by simulating the environment. 225 | This simulation is used several times with different actions at each interaction with the real world to find the optimal actions in the time horizon window. 226 | The first control of the time horizon is then used for the next action in the real world. 227 | In traditional control theory, the model is a mathematical model obtained from expert knowledge. 228 | Here, the model is a Gaussian process that learns from observed data. 229 | 230 | Gaussian processes are used to predict the change of states as a function of states and actions. 231 | The predictions are a multidimensional gaussian distribution, which allows to get the uncertainty of these predictions. 232 | Gaussian processes are defined by a mean and covariance function, and store previous points (states(t), actions(t), (states(t+1) - states(t))) in memory. 233 | To compute new predictions, the covariance between the new points and the points stored in memory is calculated, 234 | which allows, with a little mathematics, to get the predicted distribution. 235 | Conceptually, Gaussian processes can be seen as if they were looking at adjacent points in memory to compute predictions at new points. 236 | Depending on the distance between the new point and the points stored in memory, the uncertainty will be greater or smaller. 237 | In our case, for each state, one Gaussian process is used which has n (number of states) + m (number of actions) inputs, 238 | and 1 output used to predict the variation of that state. 239 | 240 | In this paper, the uncertainties propagate during trajectory calculations which allows to calculate the uncertainty of the loss in the window of the simulation horizon. 241 | This makes it possible to explore more efficiently by rewarding future states where the uncertainty of the loss is high. 242 | It can also be used to get a real-time idea of the model's certainty about the future. 243 | Uncertainty can also be used to impose security constraints. 244 | This can be done by prohibiting visits to states where the uncertainty is too high by imposing constraints on the lower or upper limit of the state confidence interval. 245 | This method is already used for safe Bayesian optimization. 246 | For example, it has been used [to optimize UAV controllers to avoid crashes during optimization.](https://www.youtube.com/watch?v=GiqNQdzc5TI) 247 | 248 | This approach allows learning fast enough to enable online learning from scratch, 249 | which opens up many possibilities for Reinforcement Learning in new applications with some more research. 250 | 251 | 252 | 253 | ### Why is this paper important? 254 | Currently, real-world applications of model-free reinforcement learning algorithms are limited due to the number of interactions they require with the environment. 255 | 256 | This method shows that for the applications on which it can be used, the same results as for state-of-the-art model-free algorithms (to the extent of my knowledge) can be obtained with approximately 20 times less interaction with the environment. 
257 | 258 | Understanding the reasons for this increased efficiency would open the search for algorithms with the same improvement in sample efficiency but without the limitations of this method. 259 | 260 | For example, the future rewards (or costs) are predicted as a distribution. 261 | By maximizing the upper confidence bound of future rewards, future states with high reward uncertainty are encouraged, allowing for effective exploration. 262 | 263 | Maximizing future state uncertainty could also be used to explore environments without rewards. 264 | 265 | If future research removes the limitations of this method, this type of data efficiency could be used for real-world applications where real-time learning is required, and thus open many new applications for reinforcement learning. 266 | 267 | 268 | 269 | ### Remarks 270 | 271 | 272 | 273 | #### Implementation differences from the paper 274 | 275 | Compared to the implementation in the paper, the scripts have been designed to perform the control over a long time without any reset, which means: 276 | - The function optimized in the MPC is the lower confidence bound of the expected long-term cost, to reward exploration and avoid getting stuck in a local minimum. 277 | - The environment is not reset; learning is done in one go. Thus, hyperparameter training cannot be done between trials. The learning of the hyperparameters and the storage of the visualizations are performed in a parallel process at regular time intervals in order to minimize the computation time at each control iteration. 278 | - The actions are optimized with L-BFGS. 279 | - An option has been added to decide whether to include a point in the model memory depending on the prediction error at that point and the predicted uncertainty, to avoid having too many points in memory. Only points with a predicted uncertainty or a prediction error greater than a threshold are stored in memory. 280 | - An option has been added to include the time of observation as an input to the Gaussian process models. 281 | This can be useful when the environment changes over time, as the model will learn to rely more on recent points than on older points in memory. 282 | 283 | 284 | 285 | #### Limitations 286 | 287 | - The cost function must be defined as a squared distance function of the states and actions from a reference point. 288 | - The length of the prediction horizon of the MPC will impact computation times. This can be a problem when the dimensionality of the observation space and/or action space is also high. 289 | - The dimension of the input and output of the Gaussian process must stay low, which limits application to cases with low-dimensional states and actions. 290 | - If too many points are stored in the memory of the Gaussian process, the computation time per iteration might become too high. 291 | - The current implementation will not work for gym environments with discrete states or actions. 292 | - No guarantee is given for the time per iteration. 293 | - Actions must have an effect on the observation of the next observed step. Delays are not supported in the model. Observations must unequivocally describe the system state.
294 | 295 | ### Talks/Tutorials 296 | 297 | Gaussian processes: https://www.youtube.com/watch?v=92-98SYOdlY&list=PL93aLKqThq4hbACqDja5QFuFKDcNtkPXz&index=2 298 | 299 | Presentation of PILCO by Marc Deisenroth: https://www.youtube.com/watch?v=AVdx2hbcsfI (method that uses the same gaussian process model, but without a MPC controller) 300 | 301 | Safe Bayesian optimization: https://www.youtube.com/watch?v=sMfPRLtrob4 302 | 303 | 304 | 305 | ### Papers 306 | 307 | Original paper: https://deepai.org/publication/data-efficient-reinforcement-learning-with-probabilistic-model-predictive-control 308 | 309 | PILCO paper that describes the moment matching approximation used for states uncertainty propagation: https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=6654139 310 | 311 | Marc Deisenroth thesis: https://deisenroth.cc/pdf/thesis.pdf 312 | 313 | 314 | 315 | ### Textbooks 316 | 317 | http://www.gaussianprocess.org/gpml/ 318 | 319 | 320 | 321 | ### Projects 322 | 323 | https://github.com/nrontsis/PILCO 324 | 325 | 326 | 327 | ## Contact me 328 | You can contact me on Linkedin: https://www.linkedin.com/in/simon-rennotte-96aa04169/ 329 | or by email: simon.rennotte@protonmail.com 330 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: gp_rl_env 2 | channels: 3 | - defaults 4 | - pytorch 5 | dependencies: 6 | - python=3.10 7 | - pip 8 | - numpy=1.24.3 9 | - pytorch=2.0.0 10 | - cpuonly 11 | - matplotlib=3.6 12 | - scikit-learn=1.2.2 13 | - imageio=2.26.0 14 | - ffmpeg 15 | - pip: 16 | - gym==0.17.3 17 | - gpytorch 18 | - imageio[ffmpeg] 19 | -------------------------------------------------------------------------------- /examples/mountain_car/config_mountaincar.py: -------------------------------------------------------------------------------- 1 | from rl_gp_mpc.config_classes.total_config import Config 2 | from rl_gp_mpc.config_classes.controller_config import ControllerConfig 3 | from rl_gp_mpc.config_classes.actions_config import ActionsConfig 4 | from rl_gp_mpc.config_classes.reward_config import RewardConfig 5 | from rl_gp_mpc.config_classes.observation_config import ObservationConfig 6 | from rl_gp_mpc.config_classes.memory_config import MemoryConfig 7 | from rl_gp_mpc.config_classes.model_config import ModelConfig 8 | from rl_gp_mpc.config_classes.training_config import TrainingConfig 9 | 10 | 11 | def get_config(len_horizon=10, num_repeat_actions=5, include_time_model=False,): 12 | observation_config = ObservationConfig( 13 | obs_var_norm = [1e-6, 1e-6] 14 | ) 15 | 16 | reward_config = RewardConfig( 17 | target_state_norm=[1, 0.5], 18 | 19 | weight_state=[1, 0], 20 | weight_state_terminal=[5, 0], 21 | 22 | target_action_norm=[0.5], 23 | weight_action=[0.05], 24 | 25 | exploration_factor=1, 26 | 27 | use_constraints=False, 28 | state_min=[0.2, -2], 29 | state_max=[0.925, 0.85], 30 | area_multiplier=1, 31 | 32 | clip_lower_bound_cost_to_0=False, 33 | ) 34 | 35 | actions_config = ActionsConfig( 36 | limit_action_change=False, 37 | max_change_action_norm=[0.3] 38 | ) 39 | 40 | model_config = ModelConfig( 41 | gp_init = { 42 | "noise_covar.noise": [1e-5, 1e-5], 43 | "base_kernel.lengthscale": [0.5, 0.5], 44 | "outputscale": [5e-2, 5e-2] 45 | }, 46 | min_std_noise=1e-3, 47 | max_std_noise=1e-2, 48 | min_outputscale=1e-5, 49 | max_outputscale=0.95, 50 | min_lengthscale=4e-3, 51 | max_lengthscale=25.0, 52 | min_lengthscale_time=10, 53 | 
max_lengthscale_time=10000, 54 | init_lengthscale_time=100, 55 | include_time_model=include_time_model, 56 | ) 57 | 58 | memory_config = MemoryConfig( 59 | check_errors_for_storage=True, 60 | min_error_prediction_state_for_memory=[3e-3, 3e-3], 61 | min_prediction_state_std_for_memory=[3e-3, 3e-3], 62 | points_batch_memory=1500 63 | ) 64 | 65 | training_config = TrainingConfig( 66 | lr_train=7e-3, 67 | iter_train=20, 68 | training_frequency=60, 69 | clip_grad_value=1e-3, 70 | print_train=False, 71 | step_print_train=5 72 | ) 73 | 74 | controller_config = ControllerConfig( 75 | len_horizon=len_horizon, 76 | actions_optimizer_params = { 77 | "disp": None, "maxcor": 8, "ftol": 1e-18, "gtol": 1e-18, "eps": 1e-2, "maxfun": 8, 78 | "maxiter": 8, "iprint": -1, "maxls": 8, "finite_diff_rel_step": None 79 | }, 80 | init_from_previous_actions=True, 81 | restarts_optim=2, 82 | optimize=True, 83 | num_repeat_actions=num_repeat_actions 84 | ) 85 | 86 | config = Config( 87 | observation_config=observation_config, 88 | reward_config=reward_config, 89 | actions_config=actions_config, 90 | model_config=model_config, 91 | memory_config=memory_config, 92 | training_config=training_config, 93 | controller_config=controller_config, 94 | ) 95 | return config 96 | 97 | 98 | -------------------------------------------------------------------------------- /examples/mountain_car/run_mountain_car_multiple.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | file_path = os.path.abspath(__file__) 4 | sys.path.append(os.path.join(os.path.dirname(file_path), '../../')) 5 | 6 | import gym 7 | 8 | from rl_gp_mpc.config_classes.visu_config import VisuConfig 9 | from config_mountaincar import get_config 10 | 11 | from rl_gp_mpc.run_env_function import run_env_multiple 12 | 13 | if __name__ == '__main__': 14 | num_runs = 20 15 | random_actions_init=10 16 | num_repeat_actions=5 17 | num_steps=500 18 | len_horizon = 12 19 | verbose = False 20 | 21 | env_name = 'MountainCarContinuous-v0' 22 | env = gym.make(env_name) 23 | control_config = get_config(len_horizon=len_horizon, num_repeat_actions=num_repeat_actions) 24 | 25 | visu_config = VisuConfig(render_live_plot_2d=False, render_env=True, save_render_env=True) 26 | 27 | run_env_multiple(env, env_name, control_config, visu_config, num_runs, random_actions_init=random_actions_init, num_steps=num_steps, verbose=verbose) 28 | -------------------------------------------------------------------------------- /examples/mountain_car/run_mountaincar.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | file_path = os.path.abspath(__file__) 4 | sys.path.append(os.path.join(os.path.dirname(file_path), '../../')) 5 | 6 | import gym 7 | 8 | from rl_gp_mpc.config_classes.visu_config import VisuConfig 9 | from rl_gp_mpc.run_env_function import run_env 10 | from config_mountaincar import get_config 11 | 12 | 13 | def run_mountain_car(num_steps=500, random_actions_init=20, num_repeat_actions=5): 14 | len_horizon = 12 15 | verbose = True 16 | 17 | env_name = 'MountainCarContinuous-v0' 18 | env = gym.make(env_name) 19 | control_config = get_config(len_horizon=len_horizon, num_repeat_actions=num_repeat_actions) 20 | 21 | visu_config = VisuConfig(render_live_plot_2d=True, render_env=True, save_render_env=True) 22 | costs = run_env(env, control_config, visu_config, random_actions_init=random_actions_init, num_steps=num_steps, verbose=verbose) 23 | return costs 24 | 25 | 26 
| if __name__ == '__main__': 27 | costs = run_mountain_car(num_steps=500, num_repeat_actions=5) 28 | -------------------------------------------------------------------------------- /examples/pendulum/config_pendulum.py: -------------------------------------------------------------------------------- 1 | from rl_gp_mpc.config_classes.total_config import Config 2 | from rl_gp_mpc.config_classes.controller_config import ControllerConfig 3 | from rl_gp_mpc.config_classes.actions_config import ActionsConfig 4 | from rl_gp_mpc.config_classes.reward_config import RewardConfig 5 | from rl_gp_mpc.config_classes.observation_config import ObservationConfig 6 | from rl_gp_mpc.config_classes.memory_config import MemoryConfig 7 | from rl_gp_mpc.config_classes.model_config import ModelConfig 8 | from rl_gp_mpc.config_classes.training_config import TrainingConfig 9 | 10 | 11 | def get_config(len_horizon=15, include_time_model=False, num_repeat_actions=1): 12 | observation_config = ObservationConfig( 13 | obs_var_norm = [1e-6, 1e-6, 1e-6] 14 | ) 15 | 16 | reward_config = RewardConfig( 17 | target_state_norm=[1, 0.5, 0.5], 18 | 19 | weight_state=[1, 0.1, 0.1], 20 | weight_state_terminal=[5, 2, 2], 21 | 22 | target_action_norm=[0.5], 23 | weight_action=[1e-3], 24 | 25 | exploration_factor=1, 26 | 27 | use_constraints=False, 28 | state_min=[-3, -3, -3], 29 | state_max=[3, 3, 3], 30 | area_multiplier=1, 31 | 32 | clip_lower_bound_cost_to_0=False, 33 | ) 34 | 35 | actions_config = ActionsConfig( 36 | limit_action_change=False, 37 | max_change_action_norm=[0.3] 38 | ) 39 | 40 | model_config = ModelConfig( 41 | gp_init = { 42 | "noise_covar.noise": [1e-5, 1e-5, 1e-5], # variance = (std)² 43 | "base_kernel.lengthscale": [0.5, 0.5, 0.5], 44 | "outputscale": [5e-2, 5e-2, 5e-2] 45 | }, 46 | # [[0.75, 0.75, 0.75, 0.75], 47 | # [0.75, 0.75, 0.75, 0.75], 48 | # [0.75, 0.75, 0.75, 0.75]] 49 | min_std_noise=1e-3, 50 | max_std_noise=1e-2, 51 | min_outputscale=1e-2, 52 | max_outputscale=0.95, 53 | min_lengthscale=4e-3, 54 | max_lengthscale=10.0, 55 | min_lengthscale_time=10, 56 | max_lengthscale_time=10000, 57 | init_lengthscale_time=100, 58 | include_time_model=include_time_model 59 | ) 60 | 61 | memory_config = MemoryConfig( 62 | check_errors_for_storage=True, 63 | min_error_prediction_state_for_memory=[3e-4, 3e-4, 3e-4], 64 | min_prediction_state_std_for_memory=[3e-3, 3e-3, 3e-3], 65 | points_batch_memory=1500 66 | ) 67 | 68 | training_config = TrainingConfig( 69 | lr_train=7e-3, 70 | iter_train=15, 71 | training_frequency=25, 72 | clip_grad_value=1e-3, 73 | print_train=False, 74 | step_print_train=5 75 | ) 76 | 77 | controller_config = ControllerConfig( 78 | len_horizon=len_horizon, 79 | actions_optimizer_params = { 80 | "disp": None, "maxcor": 4, "ftol": 1e-15, "gtol": 1e-15, "eps": 1e-2, "maxfun": 4, 81 | "maxiter": 4, "iprint": -1, "maxls": 4, "finite_diff_rel_step": None 82 | }, 83 | num_repeat_actions=num_repeat_actions 84 | ) 85 | 86 | config = Config( 87 | observation_config=observation_config, 88 | reward_config=reward_config, 89 | actions_config=actions_config, 90 | model_config=model_config, 91 | memory_config=memory_config, 92 | training_config=training_config, 93 | controller_config=controller_config, 94 | ) 95 | 96 | return config 97 | 98 | 99 | -------------------------------------------------------------------------------- /examples/pendulum/run_pendulum.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | file_path = 
os.path.abspath(__file__) 4 | sys.path.append(os.path.join(os.path.dirname(file_path), '../../')) 5 | 6 | import gym 7 | 8 | from rl_gp_mpc.config_classes.visu_config import VisuConfig 9 | from rl_gp_mpc.run_env_function import run_env 10 | from config_pendulum import get_config 11 | 12 | 13 | def run_pendulum(): 14 | num_steps = 150 15 | random_actions_init = 10 16 | num_repeat_actions = 1 17 | len_horizon = 15 18 | verbose = True 19 | 20 | env_name = 'Pendulum-v0' 21 | env = gym.make(env_name) 22 | control_config = get_config(len_horizon=len_horizon, num_repeat_actions=num_repeat_actions) 23 | 24 | visu_config = VisuConfig() 25 | costs = run_env(env, control_config, visu_config, random_actions_init=random_actions_init, num_steps=num_steps, verbose=verbose) 26 | return costs 27 | 28 | 29 | if __name__ == '__main__': 30 | costs = run_pendulum() 31 | -------------------------------------------------------------------------------- /examples/pendulum/run_pendulum_multiple.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | file_path = os.path.abspath(__file__) 4 | sys.path.append(os.path.join(os.path.dirname(file_path), '../../')) 5 | 6 | import gym 7 | 8 | from rl_gp_mpc.config_classes.visu_config import VisuConfig 9 | from rl_gp_mpc.run_env_function import run_env_multiple 10 | from config_pendulum import get_config 11 | 12 | 13 | if __name__ == '__main__': 14 | num_runs = 20 15 | 16 | num_steps = 300 17 | random_actions_init = 10 18 | num_repeat_actions = 1 19 | len_horizon = 12 20 | verbose = True 21 | 22 | env_name = 'Pendulum-v0' 23 | env = gym.make(env_name) 24 | control_config = get_config(len_horizon=len_horizon, num_repeat_actions=num_repeat_actions) 25 | 26 | visu_config = VisuConfig(render_live_plot_2d=False, render_env=True, save_render_env=True) 27 | 28 | run_env_multiple(env, env_name, control_config, visu_config, num_runs, random_actions_init=random_actions_init, num_steps=num_steps, verbose=verbose) 29 | -------------------------------------------------------------------------------- /examples/process_control/config_process_control.py: -------------------------------------------------------------------------------- 1 | from rl_gp_mpc.config_classes.total_config import Config 2 | from rl_gp_mpc.config_classes.controller_config import ControllerConfig 3 | from rl_gp_mpc.config_classes.actions_config import ActionsConfig 4 | from rl_gp_mpc.config_classes.reward_config import RewardConfig 5 | from rl_gp_mpc.config_classes.observation_config import ObservationConfig 6 | from rl_gp_mpc.config_classes.memory_config import MemoryConfig 7 | from rl_gp_mpc.config_classes.model_config import ModelConfig 8 | from rl_gp_mpc.config_classes.training_config import TrainingConfig 9 | 10 | 11 | def get_config(len_horizon=4, include_time_model=True, num_repeat_actions=10): 12 | observation_config = ObservationConfig( 13 | obs_var_norm = [1e-6, 1e-6] 14 | ) 15 | 16 | reward_config = RewardConfig( 17 | target_state_norm=[0.5, 0.5], 18 | 19 | weight_state=[1, 1], 20 | weight_state_terminal=[1, 1], 21 | 22 | target_action_norm=[0, 0], 23 | weight_action=[1e-4, 1e-4], 24 | 25 | exploration_factor=1, 26 | 27 | use_constraints=False, 28 | state_min=[0.1, 0.3], 29 | state_max=[0.9, 0.8], 30 | area_multiplier=1, 31 | 32 | clip_lower_bound_cost_to_0=False, 33 | ) 34 | 35 | actions_config = ActionsConfig( 36 | limit_action_change=False, 37 | max_change_action_norm=[0.1, 0.2] 38 | ) 39 | 40 | model_config = ModelConfig( 41 | gp_init = { 42 | 
"noise_covar.noise": [1e-5, 1e-5], 43 | "base_kernel.lengthscale": [0.25, 0.25], 44 | "outputscale": [5e-2, 5e-2] 45 | }, 46 | init_lengthscale_time=100, 47 | min_std_noise=1e-3, 48 | max_std_noise=3e-1, 49 | min_outputscale=1e-5, 50 | max_outputscale=0.95, 51 | min_lengthscale=5e-2, 52 | max_lengthscale=25.0, 53 | include_time_model=include_time_model, 54 | min_lengthscale_time=5, 55 | max_lengthscale_time=1000 56 | ) 57 | 58 | memory_config = MemoryConfig( 59 | check_errors_for_storage=True, 60 | min_error_prediction_state_for_memory=[1e-5, 1e-5], 61 | min_prediction_state_std_for_memory=[3e-3, 3e-3], 62 | points_batch_memory=1500 63 | ) 64 | 65 | training_config = TrainingConfig( 66 | lr_train=7e-3, 67 | iter_train=15, 68 | training_frequency=15, 69 | clip_grad_value=1e-3, 70 | print_train=False, 71 | step_print_train=5 72 | ) 73 | 74 | controller_config = ControllerConfig( 75 | len_horizon=len_horizon, 76 | actions_optimizer_params = { 77 | "disp": None, "maxcor": 15, "ftol": 1e-99, "gtol": 1e-99, "eps": 1e-2, "maxfun": 15, 78 | "maxiter": 15, "iprint": -1, "maxls": 15, "finite_diff_rel_step": None 79 | }, 80 | init_from_previous_actions=True, 81 | restarts_optim=2, 82 | optimize=True, 83 | num_repeat_actions=num_repeat_actions 84 | ) 85 | 86 | config = Config( 87 | observation_config=observation_config, 88 | reward_config=reward_config, 89 | actions_config=actions_config, 90 | model_config=model_config, 91 | memory_config=memory_config, 92 | training_config=training_config, 93 | controller_config=controller_config, 94 | ) 95 | return config 96 | 97 | 98 | -------------------------------------------------------------------------------- /examples/process_control/run_process_control.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | file_path = os.path.abspath(__file__) 4 | sys.path.append(os.path.join(os.path.dirname(file_path), '../../')) 5 | 6 | from rl_gp_mpc.config_classes.visu_config import VisuConfig 7 | from rl_gp_mpc.envs.process_control import ProcessControl 8 | from rl_gp_mpc.run_env_function import run_env 9 | from config_process_control import get_config 10 | 11 | 12 | def run_process_control(): 13 | # choose the configuration file to load the corresponding env 14 | # open(os.path.join('params', 'main_parameters_mountain_car.json')) 15 | # open(os.path.join('params', 'main_parameters_my_env.json')) 16 | num_steps = 500 17 | random_actions_init = 100 18 | num_repeat_actions = 5 19 | include_time_model = False 20 | len_horizon = 5 21 | verbose = True 22 | 23 | env = ProcessControl( 24 | dt=1, 25 | s_range=(20, 30), 26 | fi_range=(0.15, 0.3), ci_range=(0.15, 0.2), 27 | cr_range=(0.8, 1.0), 28 | noise_l_prop_range=(5e-3, 1e-2), noise_co_prop_range=(5e-3, 1e-2), 29 | sp_l_range=(0.4, 0.6), sp_co_range=(0.4, 0.6), 30 | change_params=False, period_change=200 31 | ) 32 | control_config = get_config(len_horizon=len_horizon, include_time_model=include_time_model, num_repeat_actions=num_repeat_actions) 33 | 34 | visu_config = VisuConfig(render_live_plot_2d=True, render_env=False, save_render_env=False, save_live_plot_2d=True) 35 | costs = run_env(env, control_config, visu_config, random_actions_init=random_actions_init, num_steps=num_steps, verbose=verbose) 36 | return costs 37 | 38 | if __name__ == '__main__': 39 | costs = run_process_control() 40 | -------------------------------------------------------------------------------- /examples/process_control/run_processc_control_multiple.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | file_path = os.path.abspath(__file__) 4 | sys.path.append(os.path.join(os.path.dirname(file_path), '../../')) 5 | 6 | from rl_gp_mpc.envs.process_control import ProcessControl 7 | 8 | from rl_gp_mpc.config_classes.visu_config import VisuConfig 9 | from config_process_control import get_config 10 | 11 | from rl_gp_mpc.run_env_function import run_env_multiple 12 | 13 | if __name__ == '__main__': 14 | num_runs = 10 15 | 16 | num_steps = 1000 17 | random_actions_init = 10 18 | num_repeat_actions = 10 19 | include_time_model = True 20 | len_horizon = 5 21 | verbose = True 22 | 23 | env = ProcessControl( 24 | dt=1, 25 | s_range=(20, 30), 26 | fi_range=(0.2, 0.3), ci_range=(0.1, 0.2), 27 | cr_range=(0.9, 1.0), 28 | noise_l_prop_range=(3e-3, 1e-2), noise_co_prop_range=(3e-3, 1e-2), 29 | sp_l_range=(0.4, 0.6), sp_co_range=(0.4, 0.6), 30 | change_params=True, period_change=500 31 | ) 32 | env_name = 'process_control' 33 | control_config = get_config(len_horizon=len_horizon, include_time_model=include_time_model, num_repeat_actions=num_repeat_actions) 34 | 35 | visu_config = VisuConfig(render_live_plot_2d=False, render_env=True, save_render_env=True) 36 | 37 | run_env_multiple(env, env_name, control_config, visu_config, num_runs, random_actions_init=random_actions_init, num_steps=num_steps, verbose=verbose) 38 | -------------------------------------------------------------------------------- /images/Article_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/Article_results.png -------------------------------------------------------------------------------- /images/Cost_runs_MountainCarContinuous-v0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/Cost_runs_MountainCarContinuous-v0.png -------------------------------------------------------------------------------- /images/Cost_runs_Pendulum-v0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/Cost_runs_Pendulum-v0.png -------------------------------------------------------------------------------- /images/anim_mountain_car.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/anim_mountain_car.gif -------------------------------------------------------------------------------- /images/anim_pendulum.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/anim_pendulum.gif -------------------------------------------------------------------------------- 
/images/anim_process_control_actionschange.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/anim_process_control_actionschange.gif -------------------------------------------------------------------------------- /images/anim_process_control_base.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/anim_process_control_base.gif -------------------------------------------------------------------------------- /images/anim_process_control_timevarying.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/anim_process_control_timevarying.gif -------------------------------------------------------------------------------- /images/control_anim_MountainCarContinuous-v0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/control_anim_MountainCarContinuous-v0.gif -------------------------------------------------------------------------------- /images/control_anim_Pendulum-v0.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/control_anim_Pendulum-v0.gif -------------------------------------------------------------------------------- /images/gif_total.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/gif_total.gif -------------------------------------------------------------------------------- /images/history_mountain_car.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/history_mountain_car.png -------------------------------------------------------------------------------- /images/history_pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/history_pendulum.png -------------------------------------------------------------------------------- /images/model_mountain_car.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/model_mountain_car.png -------------------------------------------------------------------------------- /images/model_pendulum.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/model_pendulum.png -------------------------------------------------------------------------------- /images/mountaincar_constraints-2023-05-13_13.23.24_adjusted.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/mountaincar_constraints-2023-05-13_13.23.24_adjusted.gif -------------------------------------------------------------------------------- /images/mountaincar_constraints-2023-05-13_13.23.24_adjusted.mkv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/mountaincar_constraints-2023-05-13_13.23.24_adjusted.mkv -------------------------------------------------------------------------------- /images/process_control_base_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/images/process_control_base_history.png -------------------------------------------------------------------------------- /rl_gp_mpc/__init__.py: -------------------------------------------------------------------------------- 1 | from .control_objects.controllers.gp_mpc_controller import GpMpcController 2 | from .visu_objects.visu_object import ControlVisualizations -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/actions_config.py: -------------------------------------------------------------------------------- 1 | from .utils.functions_process_config import convert_config_lists_to_tensor 2 | 3 | 4 | class ActionsConfig: 5 | def __init__( 6 | self, 7 | limit_action_change:bool=False, 8 | max_change_action_norm:"list[float]"=[0.05], 9 | ): 10 | """ 11 | limit_action_change: if set to true, the actions will stay close to the action of the previous time step 12 | max_change_action_norm: determines the maximum change of the normalized action if limit_action_change is set to true 13 | """ 14 | self.limit_action_change = limit_action_change 15 | self.max_change_action_norm = max_change_action_norm 16 | 17 | self = convert_config_lists_to_tensor(self) -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/controller_config.py: -------------------------------------------------------------------------------- 1 | class ControllerConfig: 2 | def __init__(self, 3 | len_horizon:int=15, 4 | actions_optimizer_params:dict={ 5 | "disp": None, "maxcor": 30, "ftol": 1e-99, "gtol": 1e-99, "eps": 1e-2, "maxfun": 30, 6 | "maxiter": 30, "iprint": -1, 
"maxls": 30, "finite_diff_rel_step": None 7 | }, 8 | init_from_previous_actions:bool=True, 9 | restarts_optim:int=1, 10 | optimize:bool=True, 11 | num_repeat_actions:int=1 12 | ): 13 | """ 14 | len_horizon: number of timesteps used by the mpc to find the optimal action. The effective number of predicted timtesteps will be len_horizon * num_repeat_actions 15 | actions_optimizer_params: parameters of the optimizer used. Refer to the scipy documentation for more information about the parameters 16 | init_from_previous_actions: if set to true, the actions of the optimizer will be initialized with actions of the previous timestep, shifted by 1 17 | restarts_optim: number of restarts for the optimization. Can help escape local mimimum of the actions if init_from_previous_actions is set to true 18 | optimize: if set to false, the optimal actions will be taken at random. Used for debugging only 19 | num_repeat_actions: The actions will be repeated this number of time. It allows to predict more timesteps in advance while keeping the same len_horizon number. It can also be used to be more resilient to noise with the model. The model only predicts the change after all actions have been repeated. 20 | """ 21 | self.len_horizon = len_horizon 22 | self.actions_optimizer_params = actions_optimizer_params 23 | self.init_from_previous_actions = init_from_previous_actions 24 | self.restarts_optim=restarts_optim 25 | self.optimize = optimize 26 | self.num_repeat_actions=num_repeat_actions -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/memory_config.py: -------------------------------------------------------------------------------- 1 | from .utils.functions_process_config import convert_config_lists_to_tensor 2 | 3 | 4 | class MemoryConfig: 5 | def __init__(self, 6 | check_errors_for_storage:bool=True, 7 | min_error_prediction_state_for_memory:"list[float]"=[3e-4, 3e-4, 3e-4], 8 | min_prediction_state_std_for_memory:"list[float]"=[3e-3, 3e-3, 3e-3], 9 | points_batch_memory:int=1500 10 | ): 11 | """ 12 | check_errors_for_storage: If true, when adding a new point in memory, it will be checked if it is worth adding it to the model memory depending on the prediction by checking the error or uncertainty 13 | min_error_prediction_state_for_memory: if check_errors_for_storage is true, a point will only be used by the model if the error is above this threshold (any) 14 | min_prediction_state_std_for_memory: if check_errors_for_storage is true, a point will only be used by the model if the predicted standard deviation is above this threshold (any) 15 | """ 16 | self.check_errors_for_storage = check_errors_for_storage 17 | self.min_error_prediction_state_for_memory = min_error_prediction_state_for_memory 18 | self.min_prediction_state_std_for_memory = min_prediction_state_std_for_memory 19 | self.points_batch_memory = points_batch_memory 20 | 21 | self = convert_config_lists_to_tensor(self) 22 | -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/model_config.py: -------------------------------------------------------------------------------- 1 | from .utils.functions_process_config import convert_dict_lists_to_dict_tensor, extend_dim, extend_dim_lengthscale_time 2 | 3 | 4 | class ModelConfig: 5 | def __init__( 6 | self, 7 | gp_init:dict = { 8 | "noise_covar.noise": [1e-4, 1e-4, 1e-4], # variance = (std)² 9 | "base_kernel.lengthscale": [[0.75, 0.75, 0.75, 0.75], 10 | [0.75, 0.75, 0.75, 0.75], 11 | [0.75, 
0.75, 0.75, 0.75]], 12 | "outputscale": [5e-2, 5e-2, 5e-2] 13 | }, 14 | init_lengthscale_time:float=100, 15 | min_std_noise:float=1e-3, 16 | max_std_noise:float=3e-1, 17 | min_outputscale:float=1e-5, 18 | max_outputscale:float=0.95, 19 | min_lengthscale:float=4e-3, 20 | max_lengthscale:float=25.0, 21 | min_lengthscale_time:float=10, 22 | max_lengthscale_time:float=10000, 23 | include_time_model:bool=False, 24 | ): 25 | """ 26 | include_time_model: if set to true, the time of the observation (index of control) will be an additional contextual input to the model, which allows weighting recent points more in case the environment changes over time 27 | gp_init: dict containing the initialization of the gp model parameters. The first dimension is the number of observations, since there is 1 model per state/observation. Lengthscale dimensions are (No, No+Na). No: number of observations, Na: number of actions 28 | All parameters are for normalized observations except for the time lengthscales 29 | The noise parameter represents the noise variance. Take the sqrt to get the corresponding std. 30 | """ 31 | self.include_time_model = include_time_model 32 | 33 | self.min_std_noise = min_std_noise 34 | self.max_std_noise = max_std_noise 35 | self.min_outputscale = min_outputscale 36 | self.max_outputscale = max_outputscale 37 | self.min_lengthscale = min_lengthscale 38 | self.max_lengthscale = max_lengthscale 39 | self.min_lengthscale_time = min_lengthscale_time 40 | self.max_lengthscale_time = max_lengthscale_time 41 | self.init_lengthscale_time = init_lengthscale_time 42 | self.include_time_model = include_time_model 43 | 44 | self.gp_init = convert_dict_lists_to_dict_tensor(gp_init) 45 | 46 | def extend_dimensions_params(self, dim_state, dim_input): 47 | self.min_std_noise = extend_dim(self.min_std_noise, dim=(dim_state,)) 48 | self.max_std_noise = extend_dim(self.max_std_noise, dim=(dim_state,)) 49 | self.min_outputscale = extend_dim(self.min_outputscale, dim=(dim_state,)) 50 | self.max_outputscale = extend_dim(self.max_outputscale, dim=(dim_state,)) 51 | self.gp_init["noise_covar.noise"] = extend_dim(self.gp_init["noise_covar.noise"], dim=(dim_state,)) 52 | self.gp_init["outputscale"] = extend_dim(self.gp_init["outputscale"], dim=(dim_state,)) 53 | 54 | if self.include_time_model: 55 | self.min_lengthscale = extend_dim_lengthscale_time(self.min_lengthscale, self.min_lengthscale_time, dim_state, dim_input) 56 | self.max_lengthscale = extend_dim_lengthscale_time(self.max_lengthscale, self.max_lengthscale_time, dim_state, dim_input) 57 | 58 | self.gp_init["base_kernel.lengthscale"] = extend_dim_lengthscale_time( 59 | lengthscale=self.gp_init["base_kernel.lengthscale"], 60 | lengthscale_time=self.init_lengthscale_time, 61 | num_models=dim_state, 62 | num_inputs=dim_input, 63 | ) 64 | else: 65 | self.min_lengthscale = extend_dim(self.min_lengthscale, dim=(dim_state, dim_input)) 66 | self.max_lengthscale = extend_dim(self.max_lengthscale, dim=(dim_state, dim_input)) 67 | self.gp_init["base_kernel.lengthscale"] = extend_dim(self.gp_init["base_kernel.lengthscale"], dim=(dim_state, dim_input)) -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/observation_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class ObservationConfig: 4 | def __init__( 5 | self, 6 | obs_var_norm:"list[float]"=[1e-6, 1e-6, 1e-6] 7 | ): 8 | """ 9 | obs_var_norm: uncertainty of the observation used by 
the model 10 | """ 11 | self.obs_var_norm = torch.diag(torch.Tensor(obs_var_norm)) 12 | 13 | -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/reward_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .utils.functions_process_config import convert_config_lists_to_tensor 3 | 4 | class RewardConfig: 5 | def __init__( 6 | self, 7 | target_state_norm:"list[float]"=[1, 0.5, 0.5], 8 | weight_state:"list[float]"=[1, 0.1, 0.1], 9 | weight_state_terminal:"list[float]"=[10, 5, 5], 10 | target_action_norm:"list[float]"=[0.5], 11 | weight_action:"list[float]"=[0.05], 12 | exploration_factor:float=3, 13 | use_constraints:bool=False, 14 | state_min:"list[float]"=[-0.1, 0.05, 0.05], 15 | state_max:"list[float]"=[1.1, 0.95, 0.925], 16 | area_multiplier:float=1, 17 | clip_lower_bound_cost_to_0:bool=False 18 | ): 19 | """ 20 | target_state_norm: target states. The cost relative to observations will be 0 if the normalized observations match this. Dim=No 21 | weight_state: the weight attributed to each observation in the cost. Dim=No 22 | weight_state_terminal: the weight attributed to each observation at the end of the prediction horizon. Dim=No 23 | target_action_norm: target actions. The cost relative to actions will be 0 if the normalized actions match this. Dim=Na 24 | weight_action: the weight attributed to each action in the cost. Dim=Na 25 | exploration_factor: bonus attributed to the uncertainty of the predicted cost function. 26 | use_constraints: if set to True, constraints on the states will be used in the cost function 27 | state_min: minimum bound of the constraints for the normalized observation if use_constraints is True. Dim=No 28 | state_max: maximum bound of the constraints for the normalized observation if use_constraints is True. Dim=No 29 | area_multiplier: when including constraints in the cost function, the area of the predicted state distribution outside the allowed region is used as a penalty. This parameter multiplies that area to adjust the effect of constraints on the cost 30 | clip_lower_bound_cost_to_0: if set to True, the cost will be clipped to 0.
Should always be False 31 | 32 | Na: dimension of the env actions 33 | No: dimension of the env observations 34 | """ 35 | self.target_state_norm = target_state_norm 36 | self.weight_state = weight_state 37 | self.weight_state_terminal = weight_state_terminal 38 | self.target_action_norm = target_action_norm 39 | self.weight_action = weight_action 40 | self.exploration_factor = exploration_factor 41 | 42 | self.use_constraints=use_constraints 43 | self.state_min=state_min 44 | self.state_max=state_max 45 | self.area_multiplier=area_multiplier 46 | 47 | self.clip_lower_bound_cost_to_0 = clip_lower_bound_cost_to_0 48 | # Computed after init 49 | self.target_state_action_norm = None 50 | self.weight_matrix_cost = None 51 | self.weight_matrix_cost_terminal = None 52 | 53 | self = convert_config_lists_to_tensor(self) 54 | self = combine_weight_matrix(self) 55 | self.target_state_action_norm = torch.cat((self.target_state_norm, self.target_action_norm)) 56 | 57 | 58 | def combine_weight_matrix(reward_config:RewardConfig): 59 | reward_config.weight_matrix_cost = \ 60 | torch.block_diag( 61 | torch.diag(reward_config.weight_state), 62 | torch.diag(reward_config.weight_action)) 63 | reward_config.weight_matrix_cost_terminal = torch.diag(reward_config.weight_state_terminal) 64 | return reward_config -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/total_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .controller_config import ControllerConfig 3 | from .actions_config import ActionsConfig 4 | from .model_config import ModelConfig 5 | from .training_config import TrainingConfig 6 | from .reward_config import RewardConfig 7 | from .observation_config import ObservationConfig 8 | from .memory_config import MemoryConfig 9 | 10 | 11 | torch.set_default_tensor_type(torch.DoubleTensor) 12 | 13 | 14 | class Config: 15 | def __init__( 16 | self, 17 | observation_config: ObservationConfig=ObservationConfig(), 18 | reward_config:RewardConfig=RewardConfig(), 19 | actions_config:ActionsConfig=ActionsConfig(), 20 | model_config:ModelConfig=ModelConfig(), 21 | memory_config:MemoryConfig=MemoryConfig(), 22 | training_config:TrainingConfig=TrainingConfig(), 23 | controller_config:ControllerConfig=ControllerConfig() 24 | ): 25 | self.observation = observation_config 26 | self.reward = reward_config 27 | self.actions =actions_config 28 | self.model = model_config 29 | self.memory = memory_config 30 | self.training = training_config 31 | self.controller = controller_config 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/training_config.py: -------------------------------------------------------------------------------- 1 | class TrainingConfig: 2 | def __init__( 3 | self, 4 | lr_train:float=7e-3, 5 | iter_train:int=15, 6 | training_frequency:int=25, 7 | clip_grad_value:float=1e-3, 8 | print_train:bool=False, 9 | step_print_train:int=5 10 | ): 11 | """ 12 | lr_train: learning rate when training the hyperparameters of the model 13 | iter_train: number of iteration when training the model 14 | training_frequency: the model will be trained periodically. The interval in number of control iteration is this parameter 15 | clip_grad_value: if the gradient is above this number, it will be clipped to that number. 
Allows for better stability 16 | print_train: if set to true, will print the training loop information 17 | step_print_train: if print_train is true, will print the info every step_print_train of the training iterations 18 | """ 19 | self.lr_train = lr_train 20 | self.iter_train = iter_train 21 | self.training_frequency = training_frequency 22 | self.clip_grad_value = clip_grad_value 23 | self.print_train = print_train 24 | self.step_print_train = step_print_train -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/utils/functions_process_config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def convert_config_lists_to_tensor(self): 5 | for attr, value in self.__dict__.items(): 6 | if isinstance(value, list): 7 | setattr(self, attr, torch.Tensor(value)) 8 | return self 9 | 10 | 11 | def convert_dict_lists_to_dict_tensor(dict_list: "dict[list]"): 12 | for attr, value in dict_list.items(): 13 | if isinstance(value, list): 14 | dict_list[attr] = torch.Tensor(value) 15 | return dict_list 16 | 17 | 18 | def extend_dim_lengthscale_time(lengthscale, lengthscale_time:float, num_models:int, num_inputs:int): 19 | lengthscales = torch.empty((num_models, num_inputs)) 20 | if isinstance(lengthscale, float) or isinstance(lengthscale, int) or lengthscale.dim() != 1: 21 | lengthscales[:, :-1] = lengthscale 22 | lengthscales[:, -1] = lengthscale_time 23 | else: 24 | lengthscales[:, :-1] = lengthscale[None].t().repeat((1, num_inputs - 1)) 25 | lengthscales[:, -1] = lengthscale_time 26 | return lengthscales 27 | 28 | 29 | def extend_dim(orig_tensor, dim:int): 30 | orig_tensor = orig_tensor if isinstance(orig_tensor, torch.Tensor) else torch.tensor(orig_tensor) 31 | ones_multiply = torch.ones(dim) 32 | 33 | if orig_tensor.ndim < ones_multiply.ndim: 34 | orig_tensor = orig_tensor.unsqueeze(-1) 35 | 36 | return orig_tensor * ones_multiply 37 | 38 | -------------------------------------------------------------------------------- /rl_gp_mpc/config_classes/visu_config.py: -------------------------------------------------------------------------------- 1 | class VisuConfig: 2 | def __init__( 3 | self, 4 | save_render_env:bool=True, 5 | render_live_plot_2d:bool=True, 6 | render_env:bool=True, 7 | save_live_plot_2d:bool=False 8 | ): 9 | """ 10 | save_render_env: if set to true and render_env is true, will save the env animation 11 | render_live_plot_2d: if true, will show the dynamic 2d graph update in real time 12 | render_env: if set to true, will show the env 13 | save_live_plot_2d: if set to true and render_live_plot_2d is true, will save the 2d graph animation 14 | 15 | """ 16 | self.save_render_env = save_render_env 17 | self.render_live_plot_2d = render_live_plot_2d 18 | self.render_env = render_env 19 | self.save_live_plot_2d = save_live_plot_2d -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SimonRennotte/Data-Efficient-Reinforcement-Learning-with-Probabilistic-Model-Predictive-Control/e73e805688792b4d63fcee08f4eb06a7fab46a11/rl_gp_mpc/control_objects/__init__.py -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/actions_mappers/abstract_action_mapper.py: -------------------------------------------------------------------------------- 1 | from 
typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from rl_gp_mpc.config_classes.actions_config import ActionsConfig 7 | 8 | 9 | class AbstractActionMapper: 10 | ''' 11 | Class use to map actions to space between 0 and 1. 12 | The optimiser constraints the embed_action to be between 0 and 1. 13 | By using this class, the action can be constrainted in interesting ways. 14 | It is used by DerivativeActionsMapper to limit the rate of change of actions. 15 | Another application would be to use a autoencoder to allow the mpc controller to work with high dimension actions 16 | by optimizing in the embed space. 17 | ''' 18 | def __init__(self, action_low:Union[np.ndarray, torch.Tensor], action_high:Union[np.ndarray, torch.Tensor], len_horizon: int, config:ActionsConfig): 19 | self.config = config 20 | self.action_low = torch.Tensor(action_low) 21 | self.action_high = torch.Tensor(action_high) 22 | self.dim_action = len(action_low) 23 | self.len_horizon = len_horizon 24 | self.n_iter_ctrl = 0 25 | 26 | def transform_action_raw_to_action_model(self, action_raw: Union[np.ndarray, torch.Tensor]) -> torch.Tensor: 27 | raise NotImplementedError 28 | 29 | def transform_action_model_to_action_raw(self, action_model: Union[np.ndarray, torch.Tensor], update_internals:bool=False) -> torch.Tensor: 30 | raise NotImplementedError 31 | 32 | def transform_action_mpc_to_action_model(self, action_mpc: np.ndarray) -> torch.Tensor: 33 | raise NotImplementedError 34 | 35 | def transform_action_mpc_to_action_raw(self, action_mpc: Union[np.ndarray, torch.Tensor], update_internals:bool=False) -> torch.Tensor: 36 | action_model = self.transform_action_mpc_to_action_model(action_mpc) 37 | action_raw = self.transform_action_model_to_action_raw(action_model, update_internals=update_internals) 38 | return action_raw 39 | 40 | def norm_action(self, action: Union[np.ndarray, torch.Tensor]) -> torch.Tensor: 41 | return (torch.Tensor(action) - self.action_low) / (self.action_high - self.action_low) 42 | 43 | def denorm_action(self, normed_action: Union[np.ndarray, torch.Tensor], update_internals=False) -> torch.Tensor: 44 | # Update internals is True when the action will be applied to the environment 45 | if update_internals: 46 | self.n_iter_ctrl += 1 47 | return torch.Tensor(normed_action) * (self.action_high - self.action_low) + self.action_low -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/actions_mappers/action_init_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def generate_mpc_action_init_random(len_horizon, dim_action): 5 | return np.random.uniform(low=0, high=1, size=(len_horizon, dim_action)).reshape(-1) 6 | 7 | 8 | def generate_mpc_action_init_frompreviousiter(actions_mpc, dim_action): 9 | actions_mpc[:-dim_action] = actions_mpc[dim_action:] 10 | return actions_mpc 11 | 12 | 13 | def get_init_action_change(len_horizon, max_change_action_norm): 14 | return np.dot(np.expand_dims(np.random.uniform(low=-1, high=1, size=(len_horizon)), 1), np.expand_dims(max_change_action_norm, 0)) 15 | 16 | 17 | def get_init_action(len_horizon, num_actions): 18 | return np.random.uniform(low=0, high=1, size=(len_horizon, num_actions)) -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/actions_mappers/derivative_action_mapper.py: -------------------------------------------------------------------------------- 1 | from 
typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from rl_gp_mpc.config_classes.actions_config import ActionsConfig 7 | from rl_gp_mpc.control_objects.utils.pytorch_utils import Clamp 8 | from .abstract_action_mapper import AbstractActionMapper 9 | 10 | class DerivativeActionMapper(AbstractActionMapper): 11 | def __init__(self, action_low:Union[np.ndarray, torch.Tensor], action_high:Union[np.ndarray, torch.Tensor], len_horizon: int, config:ActionsConfig): 12 | super().__init__(action_low, action_high, len_horizon, config) 13 | self.action_model_previous_iter = torch.rand(self.dim_action) 14 | self.bounds = [(0, 1)] * self.dim_action * len_horizon 15 | self.clamp_class = Clamp() 16 | 17 | def transform_action_raw_to_action_model(self, action_raw: Union[np.ndarray, torch.Tensor]) -> torch.Tensor: 18 | action_model = self.norm_action(action_raw) 19 | return action_model 20 | 21 | def transform_action_model_to_action_raw(self, action_model: Union[np.ndarray, torch.Tensor], update_internals:bool=False) -> torch.Tensor: 22 | if update_internals: 23 | self.action_model_previous_iter = action_model[0] 24 | 25 | action_raw = self.denorm_action(action_model, update_internals=update_internals) 26 | return action_raw 27 | 28 | def transform_action_mpc_to_action_model(self, action_mpc: torch.Tensor) -> torch.Tensor: 29 | actions_mpc_2d = torch.atleast_2d(action_mpc.reshape(self.len_horizon, -1)) 30 | actions_mpc_2d = actions_mpc_2d * 2 * self.config.max_change_action_norm - self.config.max_change_action_norm 31 | actions_mpc_2d[0] = actions_mpc_2d[0] + self.action_model_previous_iter 32 | actions_model = torch.cumsum(actions_mpc_2d, dim=0) 33 | clamp = self.clamp_class.apply 34 | actions_model = clamp(actions_model, 0, 1) 35 | return actions_model -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/actions_mappers/normalization_action_mapper.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from rl_gp_mpc.config_classes.actions_config import ActionsConfig 7 | from .abstract_action_mapper import AbstractActionMapper 8 | 9 | 10 | class NormalizationActionMapper(AbstractActionMapper): 11 | def __init__(self, action_low:Union[np.ndarray, torch.Tensor], action_high:Union[np.ndarray, torch.Tensor], len_horizon: int, config:ActionsConfig): 12 | super().__init__(action_low, action_high, len_horizon, config) 13 | self.bounds = [(0, 1)] * self.dim_action * len_horizon 14 | 15 | def transform_action_raw_to_action_model(self, action_raw: Union[np.ndarray, torch.Tensor]) -> torch.Tensor: 16 | return self.norm_action(action_raw) 17 | 18 | def transform_action_model_to_action_raw(self, action_model: Union[np.ndarray, torch.Tensor], update_internals:bool=False) -> torch.Tensor: 19 | return self.denorm_action(action_model, update_internals=update_internals) 20 | 21 | def transform_action_mpc_to_action_model(self, action_mpc: np.ndarray) -> torch.Tensor: 22 | actions = torch.atleast_2d(action_mpc.reshape(self.len_horizon, -1)) 23 | return actions -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/controllers/abstract_controller.py: -------------------------------------------------------------------------------- 1 | from rl_gp_mpc.config_classes.total_config import Config 2 | 3 | 4 | class BaseControllerObject: 5 | def __init__(self, config: Config): 6 | raise 
NotImplementedError 7 | 8 | def add_memory(self, obs, action, obs_new, reward, **kwargs): 9 | raise NotImplementedError() 10 | 11 | def get_action(self, obs_mu, obs_var=None): 12 | raise NotImplementedError() 13 | 14 | def get_action_random(self, obs_mu, obs_var=None): 15 | raise NotImplementedError() 16 | 17 | def train(self): 18 | raise NotImplementedError() -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/controllers/gp_mpc_controller.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | 3 | import numpy as np 4 | import torch 5 | from scipy.optimize import minimize 6 | 7 | from rl_gp_mpc.control_objects.actions_mappers.action_init_functions import generate_mpc_action_init_random, generate_mpc_action_init_frompreviousiter 8 | from rl_gp_mpc.config_classes.total_config import Config 9 | from rl_gp_mpc.control_objects.observations_states_mappers.normalization_observation_state_mapper import NormalizationObservationStateMapper 10 | from rl_gp_mpc.control_objects.actions_mappers.derivative_action_mapper import DerivativeActionMapper 11 | from rl_gp_mpc.control_objects.actions_mappers.normalization_action_mapper import NormalizationActionMapper 12 | from rl_gp_mpc.control_objects.models.gp_model import GpStateTransitionModel 13 | from rl_gp_mpc.control_objects.states_reward_mappers.setpoint_distance_reward_mapper import SetpointStateRewardMapper 14 | from rl_gp_mpc.control_objects.memories.gp_memory import Memory 15 | from rl_gp_mpc.control_objects.utils.pytorch_utils import Clamp 16 | 17 | from .abstract_controller import BaseControllerObject 18 | from .iteration_info_class import IterationInformation 19 | 20 | 21 | class GpMpcController(BaseControllerObject): 22 | def __init__(self, observation_low:np.array, observation_high:np.array, action_low:np.array, action_high:np.array, config:Config): 23 | self.config = config 24 | self.observation_state_mapper = NormalizationObservationStateMapper(config=config.observation, observation_low=observation_low, observation_high=observation_high) 25 | 26 | if self.config.actions.limit_action_change: 27 | self.actions_mapper = DerivativeActionMapper(config=self.config.actions, action_low=action_low, action_high=action_high, len_horizon=self.config.controller.len_horizon) 28 | else: 29 | self.actions_mapper = NormalizationActionMapper(config=self.config.actions, action_low=action_low, action_high=action_high, len_horizon=self.config.controller.len_horizon) 30 | 31 | self.transition_model = GpStateTransitionModel(config=self.config.model, 32 | dim_state=self.observation_state_mapper.dim_observation, 33 | dim_action=self.actions_mapper.dim_action) 34 | self.state_reward_mapper = SetpointStateRewardMapper(config=self.config.reward) 35 | 36 | self.memory = Memory(self.config.memory, 37 | dim_input=self.transition_model.dim_input, dim_state=self.transition_model.dim_state, 38 | include_time_model=self.transition_model.config.include_time_model, 39 | step_model=self.config.controller.num_repeat_actions) 40 | 41 | self.actions_mpc_previous_iter = None 42 | self.clamp_lcb_class = Clamp() 43 | 44 | self.iter_ctrl = 0 45 | 46 | self.num_cores_main = multiprocessing.cpu_count() 47 | self.ctx = multiprocessing.get_context('spawn') 48 | self.queue_train = self.ctx.Queue() 49 | 50 | self.info_iters = {} 51 | 52 | def get_action(self, obs_mu:np.array, obs_var:np.array=None, random:bool=False): 53 | """ 54 | Get the optimal action given the observation 
by optimizing 55 | the actions of the simulated trajectory with the gaussian process models such that the lower confidence bound of 56 | the mean cost of the trajectory is minimized. 57 | Only the first action of the prediction window is returned. 58 | 59 | Args: 60 | obs_mu (numpy.array): unnormalized observation from the gym environment. dim=(Ns) 61 | obs_var (numpy.array): unnormalized variance of the observation from the gym environment. dim=(Ns, Ns). 62 | default=None. If it is set to None, 63 | the observation noise from the json parameters will be used for every iteration. 64 | Ns is the dimension of states in the gym environment. 65 | 66 | Returns: 67 | next_action_raw (numpy.array): action to use in the gym environment. 68 | It is denormalized, so it can be used directly. 69 | dim=(Na), where Ns is the dimension of the action_space 70 | 71 | """ 72 | # Check for parallel process that are open but not alive at each iteration to retrieve the results and close them 73 | self.check_and_close_processes() 74 | if (self.iter_ctrl % self.config.controller.num_repeat_actions) == 0: 75 | self.memory.prepare_for_model() 76 | with torch.no_grad(): 77 | state_mu, state_var = self.observation_state_mapper.get_state(obs=obs_mu, obs_var=obs_var, update_internals=True) 78 | 79 | if random: 80 | actions_model = self._get_random_actions(state_mu, state_var) 81 | else: 82 | actions_model = self._get_optimal_actions(state_mu, state_var) 83 | actions_raw = self.actions_mapper.transform_action_model_to_action_raw(actions_model, update_internals=True) 84 | next_action_raw = actions_raw[0] 85 | 86 | # all the information fo the trajectory are stored in the object when using the function compute_mean_lcb_trajectory 87 | with torch.no_grad(): 88 | reward, reward_var = self.state_reward_mapper.get_reward(state_mu, state_var, actions_model[0]) 89 | states_std_pred = torch.diagonal(self.states_var_pred, dim1=-2, dim2=-1).sqrt() 90 | indexes_predicted = np.arange(self.iter_ctrl, self.iter_ctrl + self.config.controller.len_horizon * self.config.controller.num_repeat_actions, self.config.controller.num_repeat_actions) 91 | self.iter_info = IterationInformation( 92 | iteration=self.iter_ctrl, 93 | state=self.states_mu_pred[0], 94 | cost=-reward.item(), 95 | cost_std=reward_var.sqrt().item(), 96 | mean_predicted_cost=np.min([-self.rewards_trajectory.mean().item(), 3]), 97 | mean_predicted_cost_std=self.rewards_traj_var.sqrt().mean().item(), 98 | lower_bound_mean_predicted_cost=self.cost_traj_mean_lcb.item(), 99 | predicted_idxs=indexes_predicted, 100 | predicted_states=self.states_mu_pred, 101 | predicted_states_std=states_std_pred, 102 | predicted_actions=actions_model, 103 | predicted_costs=-self.rewards_trajectory, 104 | predicted_costs_std=self.rewards_traj_var.sqrt(), 105 | ) 106 | self.store_iter_info(self.iter_info) 107 | self.past_action = next_action_raw 108 | else: 109 | next_action_raw = self.past_action 110 | 111 | self.iter_ctrl += 1 112 | return np.array(next_action_raw) 113 | 114 | def _get_optimal_actions(self, state_mu:torch.Tensor, state_var:torch.Tensor) -> torch.Tensor: 115 | x_mem, y_mem = self.memory.get() 116 | with torch.no_grad(): 117 | self.transition_model.prepare_inference(x_mem, y_mem) 118 | 119 | # The optimize function from the scipy library. 120 | # It is used to get the optimal actions_norm in the prediction window 121 | # that minimizes the lower bound of the predicted cost. 
The jacobian is used, 122 | # otherwise the computation times would be 5 to 10x slower (for the tests I used) 123 | opt_fun = np.inf 124 | actions_mpc_optim = None 125 | for idx_restart in range(self.config.controller.restarts_optim): 126 | # only init from previous actions for the first restart, if it's available in the object and specified to be used in the config 127 | if self.config.controller.init_from_previous_actions and (self.actions_mpc_previous_iter is not None) and (idx_restart == 0): 128 | actions_mpc_init = generate_mpc_action_init_frompreviousiter(self.actions_mpc_previous_iter, dim_action=self.actions_mapper.dim_action) 129 | else: 130 | actions_mpc_init = generate_mpc_action_init_random(len_horizon=self.config.controller.len_horizon, dim_action=self.actions_mapper.dim_action) 131 | 132 | if self.config.controller.optimize: 133 | iter_res = minimize(fun=self.compute_mean_lcb_trajectory, 134 | x0=actions_mpc_init, 135 | jac=True, 136 | args=(state_mu, state_var), 137 | method='L-BFGS-B', 138 | bounds=self.actions_mapper.bounds, 139 | options=self.config.controller.actions_optimizer_params) 140 | actions_mpc_restart = iter_res.x 141 | func_val = iter_res.fun 142 | else: 143 | actions_mpc_restart = generate_mpc_action_init_random(len_horizon=self.config.controller.len_horizon, dim_action=self.actions_mapper.dim_action) 144 | func_val, grad_val = self.compute_mean_lcb_trajectory(actions_mpc_restart, state_mu, state_var) 145 | 146 | if func_val < opt_fun or (actions_mpc_optim is None and np.isnan(func_val)): 147 | opt_fun = func_val 148 | actions_mpc_optim = actions_mpc_restart 149 | 150 | self.actions_mpc_previous_iter = actions_mpc_optim.copy() 151 | actions_mpc_optim = torch.Tensor(actions_mpc_optim) 152 | actions_model_optim = self.actions_mapper.transform_action_mpc_to_action_model(actions_mpc_optim) 153 | return actions_model_optim 154 | 155 | def _get_random_actions(self, state_mu:torch.Tensor, state_var:torch.Tensor) -> torch.Tensor: 156 | actions_mpc = generate_mpc_action_init_random(len_horizon=self.config.controller.len_horizon, dim_action=self.actions_mapper.dim_action) 157 | actions_model = self.actions_mapper.transform_action_mpc_to_action_model(torch.Tensor(actions_mpc)) 158 | 159 | x_mem, y_mem = self.memory.get() 160 | with torch.no_grad(): 161 | self.transition_model.prepare_inference(x_mem, y_mem) 162 | func_val, grad_val = self.compute_mean_lcb_trajectory(actions_mpc, state_mu, state_var) # store the trajectory info in the object 163 | return actions_model 164 | 165 | def add_memory(self, obs: np.array, action: np.array, obs_new:np.array, reward:float, predicted_state:np.array=None, predicted_state_std:np.array=None): 166 | """ 167 | Add an observation, action and observation after applying the action to the memory that is used 168 | by the gaussian process models. 169 | At regular number of points interval (self.training_frequency), 170 | the training process of the gaussian process models will be launched to optimize the hyper-parameters. 171 | 172 | Args: 173 | obs: non-normalized observation. Dim=(Ns,) 174 | action: non-normalized action. Dim=(Ns,) 175 | obs_new: non-normalized observation obtained after applying the action on the observation. 176 | Dim=(Ns,) 177 | reward: reward obtained from the gym env. Unused at the moment. 178 | The cost given state and action is computed instead. 179 | predicted_state: 180 | if check_storage is True and predicted_state is not None, 181 | the prediction error for that point will be computed. 
182 | and the point will only be stored in memory if the 183 | prediction error is larger than self.error_pred_memory. Dim=(Ns,) 184 | 185 | predicted_state_std: 186 | If check_storage is true, and predicted_state_std is not None, the point will only be 187 | stored in memory if it is larger than self.error_pred_memory. Dim=(Ns,) 188 | 189 | where Ns: dimension of states, Na: dimension of actions 190 | """ 191 | state_mu, state_var = self.observation_state_mapper.get_state(obs=obs, update_internals=False) 192 | state_mu_new, state_var_new = self.observation_state_mapper.get_state(obs=obs_new, update_internals=False) 193 | action_model = self.actions_mapper.transform_action_raw_to_action_model(action) 194 | 195 | self.memory.add(state_mu, action_model, state_mu_new, reward, iter_ctrl=self.iter_ctrl-1, predicted_state=predicted_state, predicted_state_std=predicted_state_std) 196 | 197 | if self.iter_ctrl % self.config.training.training_frequency == 0 and \ 198 | not ('p_train' in self.__dict__ and not self.p_train._closed): 199 | self.start_training_process() 200 | 201 | def start_training_process(self): 202 | saved_state = self.transition_model.save_state() 203 | saved_state.to_arrays() 204 | self.p_train = self.ctx.Process(target=self.transition_model.train, 205 | args=( 206 | self.queue_train, 207 | saved_state, 208 | self.config.training.lr_train, 209 | self.config.training.iter_train, 210 | self.config.training.clip_grad_value, 211 | self.config.training.print_train, 212 | self.config.training.step_print_train) 213 | ) 214 | self.p_train.start() 215 | 216 | def check_and_close_processes(self): 217 | """ 218 | Check active parallel processes, wait for their resolution, get the parameters and close them 219 | """ 220 | if 'p_train' in self.__dict__ and not self.p_train._closed and not (self.p_train.is_alive()): 221 | params_dict_list = self.queue_train.get() 222 | self.p_train.join() 223 | for model_idx in range(len(self.transition_model.models)): 224 | self.transition_model.models[model_idx].initialize(**params_dict_list[model_idx]) 225 | self.p_train.close() 226 | x_mem, y_mem = self.memory.get() 227 | self.transition_model.prepare_inference(x_mem, y_mem) 228 | 229 | def compute_mean_lcb_trajectory(self, actions_mpc:np.array, obs_mu:torch.Tensor, obs_var:torch.Tensor): 230 | """ 231 | Compute the mean lower bound cost of a trajectory given the actions of the trajectory 232 | and initial state distribution. The gaussian process models are used to predict the evolution of 233 | states (mean and variance). Then the cost is computed for each predicted state and the mean is returned. 234 | The partial derivatives of the mean lower bound cost with respect to the actions are also returned. 235 | They are computed automatically with autograd from pytorch. 236 | This function is called multiple times by an optimizer to find the optimal actions. 237 | 238 | Args: 239 | actions_mpc: actions to apply for the simulated trajectory. 240 | It is a flat 1d array so that this function can be used by the minimize function of the scipy library. 241 | It is reshaped and transformed into a tensor inside. 242 | If self.config.actions.limit_action_change is true, each element of the array contains the relative 243 | change with respect to the previous iteration, so that the change can be bounded by 244 | the optimizer. dim=(Nh x Na,) 245 | where Nh is the len of the horizon and Na the dimension of actions 246 | 247 | obs_mu: mean value of the inital state distribution. 
248 | dim=(Ns,) where Ns is the dimension of state 249 | 250 | obs_var: covariance matrix of the inital state distribution. 251 | dim=(Ns, Ns) where Ns is the dimension of state 252 | 253 | Returns: 254 | mean_cost_traj_lcb.item() (float): lower bound of the mean cost distribution 255 | of the predicted trajectory. 256 | 257 | 258 | gradients_dcost_dactions.detach().numpy() (numpy.array): 259 | Derivative of the lower bound of the mean cost 260 | distribution with respect to each of the mpc actions in the 261 | prediction horizon. Dim=(Nh * Na,) 262 | where Nh is the len of the horizon and Ma the dimension of actions 263 | """ 264 | # reshape actions from flat 1d numpy array into 2d tensor 265 | actions_mpc = torch.Tensor(actions_mpc) 266 | actions_mpc.requires_grad = True 267 | actions_model = self.actions_mapper.transform_action_mpc_to_action_model(action_mpc=actions_mpc) 268 | states_mu_pred, states_var_pred = self.transition_model.predict_trajectory(actions_model, obs_mu, obs_var, len_horizon=self.config.controller.len_horizon, current_time_idx=self.iter_ctrl) 269 | rewards_traj, rewards_traj_var = self.state_reward_mapper.get_rewards_trajectory(states_mu_pred, states_var_pred, actions_model) 270 | rewards_traj_ucb = rewards_traj + self.config.reward.exploration_factor * torch.sqrt(rewards_traj_var) 271 | 272 | if self.config.reward.clip_lower_bound_cost_to_0: 273 | clamp = self.clamp_lcb_class.apply 274 | rewards_traj_ucb = clamp(rewards_traj_ucb, float('-inf'), 0) 275 | mean_reward_traj_ucb = rewards_traj_ucb.mean() 276 | mean_cost_traj_ucb = -mean_reward_traj_ucb 277 | gradients_dcost_dactions = torch.autograd.grad(mean_cost_traj_ucb, actions_mpc, retain_graph=False)[0] 278 | 279 | self.cost_traj_mean_lcb = mean_reward_traj_ucb.detach() 280 | self.states_mu_pred = states_mu_pred.detach() 281 | self.rewards_trajectory = rewards_traj.detach() 282 | self.rewards_traj_var = rewards_traj_var.detach() 283 | self.states_var_pred = states_var_pred.detach() 284 | 285 | return mean_cost_traj_ucb.item(), gradients_dcost_dactions.detach().numpy() 286 | 287 | def compute_cost_unnormalized(self, obs:np.array, action:np.array, obs_var:np.array=None)-> "tuple[float]": 288 | """ 289 | Compute the cost on un-normalized state and actions. 290 | Takes in numpy array and returns numpy array. 291 | Meant to be used to compute the cost outside the object. 292 | Args: 293 | obs (numpy.array): state (or observation). shape=(Ns,) 294 | action (numpy.array): action. Shape=(Na,) 295 | obs_var (numpy.array): state (or observation) variance. Default=None. 
shape=(Ns, Ns) 296 | If set to None, the observation constant stored inside the object will be used 297 | 298 | Returns: 299 | cost_mu (float): Mean of the cost 300 | cost_var (float): variance of the cost 301 | """ 302 | state_mu, state_var = self.observation_state_mapper.get_state(obs=obs, obs_var=obs_var, update_internals=False) 303 | action_model = self.actions_mapper.transform_action_raw_to_action_model(action) 304 | reward_mu, reward_var = self.state_reward_mapper.get_reward(state_mu, state_var, action_model) 305 | return -reward_mu.item(), reward_var.item() 306 | 307 | def get_iter_info(self): 308 | return self.iter_info 309 | 310 | def store_iter_info(self, iter_info: IterationInformation): 311 | iter_info_dict = iter_info.__dict__ 312 | for key in iter_info_dict.keys(): 313 | if not key in self.info_iters: 314 | self.info_iters[key] = [iter_info_dict[key]] 315 | else: 316 | self.info_iters[key].append(iter_info_dict[key]) 317 | 318 | -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/controllers/iteration_info_class.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | NUM_DECIMALS_REPR = 3 5 | 6 | class IterationInformation: 7 | def __init__(self, 8 | iteration:int, 9 | state:float, 10 | cost:float, 11 | cost_std:float, 12 | mean_predicted_cost:float, 13 | mean_predicted_cost_std:float, 14 | lower_bound_mean_predicted_cost:float, 15 | predicted_idxs:np.array, 16 | predicted_states:np.array, 17 | predicted_states_std:np.array, 18 | predicted_actions:np.array, 19 | predicted_costs:np.array, 20 | predicted_costs_std:np.array, 21 | ): 22 | self.iteration=iteration 23 | self.state=state 24 | self.cost=cost 25 | self.cost_std=cost_std 26 | self.mean_predicted_cost=mean_predicted_cost 27 | self.mean_predicted_cost_std=mean_predicted_cost_std 28 | self.lower_bound_mean_predicted_cost=lower_bound_mean_predicted_cost 29 | self.predicted_idxs=predicted_idxs 30 | self.predicted_states=predicted_states 31 | self.predicted_states_std=predicted_states_std 32 | self.predicted_actions=predicted_actions 33 | self.predicted_costs=predicted_costs 34 | self.predicted_costs_std=predicted_costs_std 35 | 36 | def to_arrays(self): 37 | for key in self.__dict__.keys(): 38 | if isinstance(self.__dict__[key], torch.Tensor): 39 | self.__setattr__(key, np.array(self.__dict__[key])) 40 | 41 | def to_tensors(self): 42 | for key in self.__dict__.keys(): 43 | if isinstance(self.__dict__[key], np.array): 44 | self.__setattr__(key, torch.Tensor(self.__dict__[key])) 45 | 46 | def __str__(self): 47 | np.set_printoptions(precision=NUM_DECIMALS_REPR, suppress=True) 48 | iter_info_dict = self.__dict__ 49 | str_rep = "\n" 50 | for key, item in iter_info_dict.items(): 51 | #if key in ['predicted_states', 'predicted_states_std']: 52 | # continue 53 | if isinstance(item, np.ndarray) or isinstance(item, torch.Tensor): 54 | item = np.array2string(np.array(item), threshold=np.inf, max_line_width=np.inf, separator=',').replace('\n', '') # np.round( ), NUM_DECIMALS_REPR)# .tolist() 55 | else: 56 | item = np.round(item, NUM_DECIMALS_REPR) 57 | str_rep += f"{key}: {item}\n" 58 | return str_rep -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/memories/gp_memory.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from rl_gp_mpc.config_classes.memory_config 
import MemoryConfig 5 | from rl_gp_mpc.control_objects.utils.data_utils import form_model_input 6 | 7 | 8 | class Memory: 9 | def __init__(self, config:MemoryConfig, dim_input, dim_state, include_time_model=False, step_model=1): 10 | self.config = config 11 | self.include_time_model = include_time_model 12 | self.dim_input = dim_input 13 | self.dim_state = dim_state 14 | self.step_model = step_model 15 | 16 | self.inputs = torch.empty(config.points_batch_memory, dim_input) 17 | self.states_next = torch.empty(config.points_batch_memory, dim_state) 18 | self.rewards = torch.empty(config.points_batch_memory) 19 | self.iter_ctrls = torch.empty(config.points_batch_memory) 20 | self.errors = torch.empty(config.points_batch_memory, dim_state) 21 | self.stds = torch.empty(config.points_batch_memory, dim_state) 22 | 23 | self.model_inputs = torch.empty(config.points_batch_memory, dim_input) 24 | self.model_targets = torch.empty(config.points_batch_memory, dim_state) 25 | 26 | self.active_data_mask = np.empty(config.points_batch_memory, dtype=bool) 27 | self.len_mem = 0 28 | self.len_mem_last_processed = 0 29 | self.len_mem_model = 0 30 | 31 | def add(self, state, action_model, state_next, reward, iter_ctrl=0, **kwargs): 32 | # Add all points to memory but must use the "prepare" function to form the model inputs 33 | # it is needed if the model predicts multiple steps ahead 34 | if len(self.inputs) < (self.len_mem + 1): 35 | self.inputs = torch.cat(self.inputs, torch.empty(self.config.points_batch_memory, self.dim_input)) 36 | self.states_next = torch.cat(self.states_next, torch.empty(self.config.points_batch_memory, self.dim_state)) 37 | self.rewards = torch.cat(self.rewards, torch.empty(self.config.points_batch_memory)) 38 | self.iter_ctrls = torch.cat(self.rewards, torch.empty(self.config.points_batch_memory)) 39 | self.errors = torch.cat(self.errors, torch.empty(self.config.points_batch_memory, self.dim_state)) 40 | self.stds = torch.cat(self.stds, torch.empty(self.config.points_batch_memory, self.dim_state)) 41 | 42 | x = form_model_input(state=state, action_model=action_model, time_idx=iter_ctrl, include_time_model=self.include_time_model, dim_input=self.dim_input) 43 | self.inputs[self.len_mem] = x 44 | self.states_next[self.len_mem] = state_next 45 | self.rewards[self.len_mem] = reward 46 | self.iter_ctrls[self.len_mem] = iter_ctrl 47 | 48 | store_gp_mem = True 49 | if self.config.check_errors_for_storage: 50 | if 'predicted_state' in kwargs.keys() and kwargs['predicted_state'] is not None: 51 | error_prediction = torch.abs(kwargs['predicted_state'] - state_next) 52 | store_gp_mem = torch.any(error_prediction > self.config.min_error_prediction_state_for_memory) 53 | self.errors[self.len_mem] = error_prediction 54 | else: 55 | self.errors[self.len_mem] = np.nan 56 | 57 | if 'predicted_state_std' in kwargs.keys() and kwargs['predicted_state_std'] is not None: 58 | store_gp_mem = store_gp_mem and (torch.any(kwargs['predicted_state_std'] > self.config.min_prediction_state_std_for_memory)) 59 | self.stds[self.len_mem] = kwargs['predicted_state_std'] 60 | else: 61 | self.stds[self.len_mem] = np.nan 62 | 63 | self.active_data_mask[self.len_mem] = store_gp_mem 64 | self.len_mem += 1 65 | 66 | def prepare_for_model(self): 67 | # form the model inputs from the memory. 
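# e.g. with step_model=2 (the controller's num_repeat_actions), raw-memory index i becomes the model input (s_i, a_i) and the target s_{i+2} - s_i, as formed in get_memory_by_index below (s_i / a_i denote the state / action at control step i)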
68 | # It is defined in this way to allow to have a model that predicts multiple steps ahead 69 | if len(self.model_inputs) < (self.len_mem_model + 1): 70 | self.model_inputs = torch.cat(self.model_inputs, torch.empty(self.config.points_batch_memory, self.model_inputs.shape[1])) 71 | self.model_targets = torch.cat(self.model_targets, torch.empty(self.config.points_batch_memory, self.model_targets.shape[1])) 72 | 73 | idxs_add_memory = self.get_indexes_to_process() 74 | mask_add_memory = self.active_data_mask[idxs_add_memory] 75 | idxs_add_memory = idxs_add_memory[mask_add_memory] 76 | len_to_add = len(idxs_add_memory) 77 | 78 | new_model_inputs, new_model_targets = self.get_memory_by_index(idxs_add_memory) 79 | self.model_inputs[self.len_mem_model:self.len_mem_model+len_to_add] = new_model_inputs 80 | self.model_targets[self.len_mem_model:self.len_mem_model+len_to_add] = new_model_targets 81 | 82 | self.len_mem_model += len_to_add 83 | self.len_mem_last_processed = self.len_mem 84 | 85 | def get_memory_total(self): 86 | indexes= self.get_indexes_processed() 87 | inputs, targets = self.get_memory_by_index(indexes) 88 | return inputs, targets 89 | 90 | def get_memory_by_index(self, indexes): 91 | inputs = self.inputs[indexes] 92 | targets = self.states_next[indexes + self.step_model - 1] - self.inputs[indexes, :self.dim_state] 93 | return inputs, targets 94 | 95 | def get_indexes_to_process(self): 96 | return np.arange(self.len_mem_last_processed, self.len_mem, self.step_model) 97 | 98 | def get_indexes_processed(self): 99 | return np.arange(0, self.len_mem_last_processed, self.step_model) 100 | 101 | def get_mask_model_inputs(self): 102 | indexes = self.get_indexes_processed() 103 | return self.active_data_mask[indexes] 104 | 105 | def get(self): 106 | if self.len_mem_model > 0: 107 | x_mem = self.model_inputs[:self.len_mem_model] 108 | y_mem = self.model_targets[:self.len_mem_model] 109 | else: # If no datapoints, initialize with dummy 0 variable to avoid errors 110 | x_mem = torch.zeros((1, self.dim_input)) 111 | y_mem = torch.zeros((1, self.dim_state)) 112 | return x_mem, y_mem -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/models/abstract_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from rl_gp_mpc.config_classes.model_config import ModelConfig 4 | 5 | class AbstractStateTransitionModel: 6 | def __init__(self, config:ModelConfig, dim_state, dim_action): 7 | self.config = config 8 | self.dim_state = dim_state 9 | self.dim_action = dim_action 10 | self.dim_input = self.dim_state + self.dim_action 11 | 12 | def predict_trajectory(self, input:torch.Tensor, input_var:torch.Tensor) -> "tuple[torch.Tensor]": 13 | raise NotImplementedError 14 | 15 | def predict_next_state(self, input: torch.Tensor, input_var: torch.Tensor) -> "tuple[torch.Tensor]": 16 | raise NotImplementedError 17 | 18 | def prepare_inference(self, x, y): 19 | raise NotImplementedError 20 | 21 | def train(self, x, y): 22 | raise NotImplementedError 23 | 24 | def save_state(self): 25 | raise NotImplementedError 26 | 27 | def load_state(self, saved_state): 28 | raise NotImplementedError 29 | -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/models/gp_model.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import time 3 | 4 | import torch 5 | import gpytorch 6 | import numpy as 
np 7 | 8 | from rl_gp_mpc.config_classes.model_config import ModelConfig 9 | 10 | from .abstract_model import AbstractStateTransitionModel 11 | 12 | 13 | class SavedState: 14 | def __init__(self, inputs:torch.Tensor, states_change: torch.Tensor, parameters:"list[dict]", constraints_hyperparams:dict, models: "list[gpytorch.models.ExactGP]"=None): 15 | self.inputs = inputs 16 | self.states_change = states_change 17 | self.parameters = parameters 18 | self.constraints_hyperparams = constraints_hyperparams 19 | if models is not None: 20 | self.state_dicts_models = [model.state_dict() for model in models] 21 | else: 22 | self.state_dicts_models = None 23 | 24 | def to_arrays(self): 25 | # convert tensors to arrays so that they can be sent to the multiprocessing process for training without errors 26 | self.inputs = self.inputs.numpy() 27 | self.states_change = self.states_change.numpy() 28 | self.parameters = [{k: v.numpy() for k, v in parameter.items()} for parameter in self.parameters] 29 | self.constraints_hyperparams = {k: v.numpy() if isinstance(v, torch.Tensor) else v for k, v in self.constraints_hyperparams.items()} 30 | 31 | def to_tensors(self): 32 | # used to reconstruct tensors inside the multiprocess process for training 33 | self.inputs = torch.Tensor(self.inputs) 34 | self.states_change = torch.Tensor(self.states_change) 35 | self.parameters = [{k: torch.Tensor(v) for k, v in parameter.items()} for parameter in self.parameters] 36 | self.constraints_hyperparams = {k: torch.Tensor(v) if isinstance(v, np.ndarray) else v for k, v in self.constraints_hyperparams.items()} 37 | 38 | 39 | class GpStateTransitionModel(AbstractStateTransitionModel): 40 | def __init__(self, config: ModelConfig, dim_state, dim_action): 41 | super().__init__(config, dim_state, dim_action) 42 | 43 | if self.config.include_time_model: 44 | self.dim_input += 1 45 | 46 | self.config.extend_dimensions_params(dim_state=self.dim_state, dim_input=self.dim_input) 47 | 48 | self.models = create_models( 49 | gp_init_dict=self.config.gp_init, 50 | constraints_gp=self.config.__dict__, 51 | inputs=None, 52 | targets=None, 53 | num_models=self.dim_state, 54 | num_inputs=self.dim_input 55 | ) 56 | 57 | for idx_model in range(len(self.models)): 58 | self.models[idx_model].eval() 59 | 60 | def predict_trajectory(self, actions: torch.Tensor, obs_mu: torch.Tensor, obs_var: torch.Tensor, len_horizon: int, current_time_idx: int) -> "tuple[torch.Tensor]": 61 | """ 62 | Compute the future predicted states distribution for the simulated trajectory given the 63 | current initial state (or observation) distribution (obs_mu and obs_var) and planned actions 64 | It also returns the costs, the variance of the costs, and the lower confidence bound of the cost 65 | along the trajectory 66 | 67 | Args: 68 | actions: actions to apply for the simulated trajectory. dim=(Nh, Na) 69 | where Nh is the len of the horizon and Na the dimension of actions 70 | 71 | obs_mu: mean value of the inital state distribution. 72 | dim=(Ns,) where Ns is the dimension of state 73 | 74 | obs_var: variance matrix of the inital state distribution. 75 | dim=(Ns, Ns) where Ns is the dimension of state 76 | 77 | Returns: 78 | states_mu_pred: predicted states of the trajectory. 79 | The first element contains the initial state. 80 | Dim=(Nh + 1, Ns) 81 | 82 | states_var_pred: covariance matrix of the predicted states of the trajectory. 83 | The first element contains the initial state. 
84 | Dim=(Nh + 1, Ns, Ns) 85 | """ 86 | dim_state = self.dim_state 87 | dim_input_model = self.dim_input 88 | dim_action = self.dim_action 89 | states_mu_pred = torch.empty((len_horizon + 1, len(obs_mu))) 90 | states_var_pred = torch.empty((len_horizon + 1, dim_state, dim_state)) 91 | states_mu_pred[0] = obs_mu 92 | states_var_pred[0] = obs_var 93 | state_dim = obs_mu.shape[0] 94 | # Input of predict_next_state_change is not a state, but the concatenation of state and action 95 | for idx_time in range(1, len_horizon + 1): 96 | input_var = torch.zeros((dim_input_model, dim_input_model)) 97 | input_var[:state_dim, :state_dim] = states_var_pred[idx_time - 1] 98 | input_mean = torch.empty((dim_input_model,)) 99 | input_mean[:dim_state] = states_mu_pred[idx_time - 1] 100 | input_mean[dim_state:(dim_state + dim_action)] = actions[idx_time - 1] 101 | if self.config.include_time_model: 102 | input_mean[-1] = current_time_idx + idx_time - 1 103 | state_change, state_change_var, v = self.predict_next_state_change(input_mean, input_var) 104 | # use torch.clamp(states_mu_pred[idx_time], 0, 1) ? 105 | states_mu_pred[idx_time] = states_mu_pred[idx_time - 1] + state_change 106 | states_var_pred[idx_time] = state_change_var + states_var_pred[idx_time - 1] + \ 107 | input_var[:states_var_pred.shape[1]] @ v + \ 108 | v.t() @ input_var[:states_var_pred.shape[1]].t() 109 | 110 | return states_mu_pred, states_var_pred 111 | 112 | def predict_next_state_change(self, input_mu: torch.Tensor, input_var: torch.Tensor): 113 | """ 114 | Approximate GP regression at noisy inputs via moment matching 115 | IN: mean (m) (row vector) and (s) variance of the state 116 | OUT: mean (M) (row vector), variance (S) of the action and inv(s)*input-ouputcovariance 117 | Function inspired from 118 | https://github.com/nrontsis/PILCO/blob/6a962c8e4172f9e7f29ed6e373c4be2dd4b69cb7/pilco/models/mgpr.py#L81, 119 | reinterpreted from tensorflow to pytorch. 120 | Must be called after self.prepare_inference 121 | Args: 122 | input_mu: mean value of the input distribution. Dim=(Ns + Na,) 123 | 124 | input_var: covariance matrix of the input distribution. Dim=(Ns + Na, Ns + Na) 125 | 126 | Returns: 127 | M.t() (torch.Tensor): mean value of the predicted change distribution. Dim=(Ns,) 128 | 129 | S (torch.Tensor): covariance matrix of the predicted change distribution. Dim=(Ns, Ns) 130 | 131 | V.t() (torch.Tensor): Dim=(Ns, Ns + Na) 132 | 133 | where Ns: dimension of state, Na: dimension of action 134 | """ 135 | dim_state = self.dim_state 136 | dim_input_model = self.dim_input 137 | input_var = input_var[None, None, :, :].repeat([dim_state, dim_state, 1, 1]) 138 | inp = (self.x_mem - input_mu)[None, :, :].repeat([dim_state, 1, 1]) 139 | 140 | iN = inp @ self.iL 141 | B = self.iL @ input_var[0, ...] 
@ self.iL + torch.eye(dim_input_model) 142 | 143 | # Redefine iN as in^T and t --> t^T 144 | # B is symmetric, so it is equivalent 145 | iN_t = torch.transpose(iN, -1, -2) 146 | t = torch.transpose(torch.linalg.solve(B, iN_t), -1, -2) 147 | 148 | lb = torch.exp(-torch.sum(iN * t, -1) / 2) * self.beta 149 | tiL = t @ self.iL 150 | c = self.variances / torch.sqrt(torch.det(B)) 151 | 152 | M = (torch.sum(lb, -1) * c)[:, None] 153 | V = torch.matmul(torch.transpose(tiL.conj(), -1, -2), lb[:, :, None])[..., 0] * c[:, None] 154 | 155 | # Calculate S: Predictive Covariance 156 | R = torch.matmul(input_var, torch.diag_embed( 157 | 1 / torch.square(self.lengthscales[None, :, :]) + 158 | 1 / torch.square(self.lengthscales[:, None, :]) 159 | )) + torch.eye(dim_input_model) 160 | 161 | X = inp[None, :, :, :] / torch.square(self.lengthscales[:, None, None, :]) 162 | X2 = -inp[:, None, :, :] / torch.square(self.lengthscales[None, :, None, :]) 163 | Q = torch.linalg.solve(R, input_var) / 2 164 | Xs = torch.sum(X @ Q * X, -1) 165 | X2s = torch.sum(X2 @ Q * X2, -1) 166 | maha = -2 * torch.matmul(torch.matmul(X, Q), torch.transpose(X2.conj(), -1, -2)) + Xs[:, :, :, None] + X2s[:, :, None, :] 167 | 168 | k = torch.log(self.variances)[:, None] - torch.sum(torch.square(iN), -1) / 2 169 | L = torch.exp(k[:, None, :, None] + k[None, :, None, :] + maha) 170 | temp = self.beta[:, None, None, :].repeat([1, dim_state, 1, 1]) @ L 171 | S = (temp @ self.beta[None, :, :, None].repeat([dim_state, 1, 1, 1]))[:, :, 0, 0] 172 | 173 | diagL = torch.Tensor.permute(torch.diagonal(torch.Tensor.permute(L, dims=(3, 2, 1, 0)), dim1=-2, dim2=-1), 174 | dims=(2, 1, 0)) 175 | S = S - torch.diag_embed(torch.sum(torch.mul(self.iK, diagL), [1, 2])) 176 | S = S / torch.sqrt(torch.det(R)) 177 | S = S + torch.diag_embed(self.variances) 178 | S = S - M @ torch.transpose(M, -1, -2) 179 | 180 | return M.t(), S, V.t() 181 | 182 | def prepare_inference(self, inputs: torch.Tensor, state_changes: torch.Tensor): 183 | # compute all the parameters needed for inference that only depend on the memory, and not on the input, such that they can be computed in advance. 184 | # So they are not computed inside the mpc for each iteration 185 | self.x_mem = inputs 186 | self.y_mem = state_changes 187 | self.iK, self.beta = calculate_factorizations(inputs, state_changes, self.models) 188 | # self.x_mem = inputs[:self.beta.shape[1]] 189 | self.lengthscales = torch.stack([model.covar_module.base_kernel.lengthscale[0] for model in self.models]) 190 | self.variances = torch.stack([model.covar_module.outputscale for model in self.models]) 191 | self.iL = torch.diag_embed(1 / self.lengthscales) 192 | 193 | @staticmethod 194 | def train(queue:multiprocessing.Queue, 195 | saved_state:SavedState, 196 | lr_train:float, num_iter_train:int, clip_grad_value:float, print_train:bool=False, step_print_train:int=25): 197 | """ 198 | # , saved_state:SavedState , 199 | Train the gaussian process models hyper-parameters such that the marginal-log likelihood 200 | for the predictions of the points in memory is minimized. 201 | This function is launched in parallel of the main process, which is why a queue is used to tranfer 202 | information back to the main process and why the gaussian process models are reconstructed 203 | using the points in memory and hyper-parameters (the objects cant be sent directly as argument). 
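The best hyper-parameters found are put on the queue as a list with one dict per model, e.g. {'covar_module.base_kernel.lengthscale': ..., 'covar_module.outputscale': ..., 'likelihood.noise': ...}; the main process loads them back with model.initialize(**params) in check_and_close_processes.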
204 | If an error occurs, returns the parameters sent as init values 205 | (hyper-parameters obtained by the previous training process) 206 | Args: 207 | queue: queue object used to transfer information to the main process 208 | saved_state: SavedState, contains all the information to reconstruct the models 209 | lr_train: learning rate of the training 210 | num_iter_train: number of iteration for the training optimizer 211 | clip_grad_value: value at which the gradient are clipped, so that the training is more stable 212 | print_train: weither to print the information during training. default=False 213 | step_print_train: If print_train is True, only print the information every step_print_train iteration 214 | """ 215 | #torch.set_num_threads(1) 216 | start_time = time.time() 217 | saved_state.to_tensors() 218 | # create models, which is necessary since this function is used in a parallel process 219 | # that do not share memory with the principal process 220 | models = create_models(saved_state.parameters, saved_state.constraints_hyperparams, saved_state.inputs, saved_state.states_change) 221 | best_outputscales = [model.covar_module.outputscale.detach() for model in models] 222 | best_noises = [model.likelihood.noise.detach() for model in models] 223 | best_lengthscales = [model.covar_module.base_kernel.lengthscale.detach() for model in models] 224 | previous_losses = torch.empty(len(models)) 225 | 226 | for model_idx in range(len(models)): 227 | output = models[model_idx](models[model_idx].train_inputs[0]) 228 | mll = gpytorch.mlls.ExactMarginalLogLikelihood(models[model_idx].likelihood, models[model_idx]) 229 | previous_losses[model_idx] = -mll(output, models[model_idx].train_targets) 230 | 231 | best_losses = previous_losses.detach().clone() 232 | # Random initialization of the parameters showed better performance than 233 | # just taking the value from the previous iteration as init values. 
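# Each hyper-parameter is re-drawn uniformly between the lower and upper bounds of its gpytorch constraint (see the three initialization blocks below).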
234 | # If parameters found at the end do not better performance than previous iter, 235 | # return previous parameters 236 | for model_idx in range(len(models)): 237 | models[model_idx].covar_module.outputscale = \ 238 | models[model_idx].covar_module.raw_outputscale_constraint.lower_bound + \ 239 | torch.rand(models[model_idx].covar_module.outputscale.shape) * \ 240 | (models[model_idx].covar_module.raw_outputscale_constraint.upper_bound - \ 241 | models[model_idx].covar_module.raw_outputscale_constraint.lower_bound) 242 | 243 | models[model_idx].covar_module.base_kernel.lengthscale = \ 244 | models[model_idx].covar_module.base_kernel.raw_lengthscale_constraint.lower_bound + \ 245 | torch.rand(models[model_idx].covar_module.base_kernel.lengthscale.shape) * \ 246 | (models[model_idx].covar_module.base_kernel.raw_lengthscale_constraint.upper_bound - \ 247 | models[model_idx].covar_module.base_kernel.raw_lengthscale_constraint.lower_bound) 248 | 249 | models[model_idx].likelihood.noise = \ 250 | models[model_idx].likelihood.noise_covar.raw_noise_constraint.lower_bound + \ 251 | torch.rand(models[model_idx].likelihood.noise.shape) * \ 252 | (models[model_idx].likelihood.noise_covar.raw_noise_constraint.upper_bound - 253 | models[model_idx].likelihood.noise_covar.raw_noise_constraint.lower_bound) 254 | mll = gpytorch.mlls.ExactMarginalLogLikelihood(models[model_idx].likelihood, models[model_idx]) 255 | models[model_idx].train() 256 | models[model_idx].likelihood.train() 257 | optimizer = torch.optim.LBFGS([ 258 | {'params': models[model_idx].parameters()}, # Includes GaussianLikelihood parameters 259 | ], lr=lr_train, line_search_fn='strong_wolfe') 260 | try: 261 | for i in range(num_iter_train): 262 | def closure(): 263 | optimizer.zero_grad() 264 | # Output from model 265 | output = models[model_idx](models[model_idx].train_inputs[0]) 266 | # Calc loss and backprop gradients 267 | loss = -mll(output, models[model_idx].train_targets) 268 | torch.nn.utils.clip_grad_value_(models[model_idx].parameters(), clip_grad_value) 269 | loss.backward() 270 | if print_train: 271 | if i % step_print_train == 0: 272 | print( 273 | 'Iter %d/%d - Loss: %.5f output_scale: %.5f lengthscale: %s noise: %.5f' % ( 274 | i + 1, num_iter_train, loss.item(), 275 | models[model_idx].covar_module.outputscale.item(), 276 | str(models[ 277 | model_idx].covar_module.base_kernel.lengthscale.detach().numpy()), 278 | pow(models[model_idx].likelihood.noise.item(), 0.5) 279 | )) 280 | return loss 281 | 282 | loss = optimizer.step(closure) 283 | if loss < best_losses[model_idx]: 284 | best_losses[model_idx] = loss.item() 285 | best_lengthscales[model_idx] = models[model_idx].covar_module.base_kernel.lengthscale 286 | best_noises[model_idx] = models[model_idx].likelihood.noise 287 | best_outputscales[model_idx] = models[model_idx].covar_module.outputscale 288 | 289 | except Exception as e: 290 | print(e) 291 | 292 | print( 293 | 'training process - model %d - time train %f - output_scale: %s - lengthscales: %s - noise: %s' % ( 294 | model_idx, time.time() - start_time, str(best_outputscales[model_idx].detach().numpy()), 295 | str(best_lengthscales[model_idx].detach().numpy()), 296 | str(best_noises[model_idx].detach().numpy()))) 297 | 298 | print('training process - previous marginal log likelihood: %s - new marginal log likelihood: %s' % 299 | (str(previous_losses.detach().numpy()), str(best_losses.detach().numpy()))) 300 | params_dict_list = [] 301 | for model_idx in range(len(models)): 302 | params_dict_list.append({ 303 | 
'covar_module.base_kernel.lengthscale': best_lengthscales[model_idx].detach().numpy(), 304 | 'covar_module.outputscale': best_outputscales[model_idx].detach().numpy(), 305 | 'likelihood.noise': best_noises[model_idx].detach().numpy()}) 306 | queue.put(params_dict_list) 307 | 308 | def save_state(self) -> SavedState: 309 | saved_state = SavedState( 310 | inputs=self.x_mem, 311 | states_change=self.y_mem, 312 | parameters=[model.state_dict() for model in self.models], 313 | constraints_hyperparams=self.config.__dict__ 314 | ) 315 | return saved_state 316 | 317 | 318 | def create_models(gp_init_dict, constraints_gp, inputs:torch.Tensor=None, targets:torch.Tensor=None, num_models=None, num_inputs=None) -> "list[ExactGPModelMonoTask]": 319 | """ 320 | Define gaussian process models used for predicting state transition, 321 | using constraints and init values for (outputscale, noise, lengthscale). 322 | 323 | Args: 324 | train_inputs (torch.Tensor or None): Input values in the memory of the gps 325 | train_targets (torch.Tensor or None): target values in the memory of the gps. 326 | Represent the change in state values 327 | gp_init_dict (dict or list of dict): Value of the hyper-parameters of the gaussian processes. 328 | constraints_gp (dict): See the ReadMe about parameters for information about keys 329 | num_models (int or None): Must be provided when train_inputs or train_targets are None. 330 | The number of models should be equal to the dimension of state, 331 | so that the transition for each state can be predicted with a different gp. 332 | Default=None 333 | num_inputs (int or None): Must be provided when train_inputs or train_targets are None. 334 | The number of inputs should be equal to the sum of the dimension of state 335 | and dimension of action. Default=None 336 | include_time (bool): If True, gp will have one additional input corresponding to the time of the observation. 337 | This is usefull if the env change with time, 338 | as more recent points will be trusted more than past points 339 | (time closer to the point to make inference at). 
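Example (illustrative, mirroring the call in GpStateTransitionModel.__init__, where config is a ModelConfig and dim_state / dim_input are the state and model-input dimensions): models = create_models(gp_init_dict=config.gp_init, constraints_gp=config.__dict__, inputs=None, targets=None, num_models=dim_state, num_inputs=dim_input)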
340 | It is to be specified only if 341 | 342 | Returns: 343 | models (list of gpytorch.models.ExactGP): models containing the parameters, memory, 344 | constraints of the gps and functions for exact predictions 345 | """ 346 | if inputs is not None and targets is not None: 347 | num_models = len(targets[0]) 348 | models = [ExactGPModelMonoTask(inputs, targets[:, idx_model], len(inputs[0])) 349 | for idx_model in range(num_models)] 350 | else: 351 | if num_models is None or num_inputs is None: 352 | raise(ValueError('If train_inputs or train_targets are None, num_models and num_inputs must be defined')) 353 | else: 354 | models = [ExactGPModelMonoTask(None, None, num_inputs) for _ in range(num_models)] 355 | 356 | for idx_model in range(num_models): 357 | if constraints_gp is not None: 358 | if "min_std_noise" in constraints_gp.keys() and "max_std_noise" in constraints_gp.keys(): 359 | min_var_noise_model = torch.pow(constraints_gp['min_std_noise'][idx_model], 2) 360 | max_var_noise_model = torch.pow(constraints_gp['max_std_noise'][idx_model], 2) 361 | models[idx_model].likelihood.noise_covar.register_constraint("raw_noise", 362 | gpytorch.constraints.Interval(lower_bound=min_var_noise_model, upper_bound=max_var_noise_model)) 363 | 364 | if "min_outputscale" in constraints_gp.keys(): 365 | min_outputscale_model = constraints_gp['min_outputscale'][idx_model] 366 | max_outputscale_model = constraints_gp['max_outputscale'][idx_model] 367 | models[idx_model].covar_module.register_constraint("raw_outputscale", 368 | gpytorch.constraints.Interval(lower_bound=min_outputscale_model, upper_bound=max_outputscale_model)) 369 | 370 | if "min_lengthscale" in constraints_gp.keys(): 371 | min_lengthscale_model = constraints_gp['min_lengthscale'][idx_model] 372 | max_lengthscale_model = constraints_gp['max_lengthscale'][idx_model] 373 | models[idx_model].covar_module.base_kernel.register_constraint("raw_lengthscale", 374 | gpytorch.constraints.Interval(lower_bound=min_lengthscale_model, upper_bound=max_lengthscale_model)) 375 | 376 | if type(gp_init_dict) == list: 377 | models[idx_model].load_state_dict(gp_init_dict[idx_model]) 378 | else: 379 | hypers = {'base_kernel.lengthscale': gp_init_dict['base_kernel.lengthscale'][idx_model], 380 | 'outputscale': gp_init_dict['outputscale'][idx_model]} 381 | hypers_likelihood = {'noise_covar.noise': gp_init_dict['noise_covar.noise'][idx_model]} 382 | models[idx_model].likelihood.initialize(**hypers_likelihood) 383 | models[idx_model].covar_module.initialize(**hypers) 384 | return models 385 | 386 | 387 | class ExactGPModelMonoTask(gpytorch.models.ExactGP): 388 | def __init__(self, train_x, train_y, dim_input): 389 | likelihood = gpytorch.likelihoods.GaussianLikelihood() 390 | super(ExactGPModelMonoTask, self).__init__(train_x, train_y, likelihood) 391 | self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel(ard_num_dims=dim_input)) 392 | self.mean_module = gpytorch.means.ZeroMean() 393 | 394 | def forward(self, x): 395 | mean_x = self.mean_module(x) 396 | covar_x = self.covar_module(x) 397 | return gpytorch.distributions.MultivariateNormal(mean_x, covar_x) 398 | 399 | 400 | def calculate_factorizations(x:torch.Tensor, y:torch.Tensor, models: "list[ExactGPModelMonoTask]"): 401 | """ 402 | Compute iK and beta using the points in memory, which are needed to make predictions with the gaussian processes. 
403 | These two variables only depends on data in memory, and not on input distribution, 404 | so they separated from other computation such that they can be computed outside the optimisation function, 405 | which is computed multiple times at each iteration 406 | 407 | Function inspired from 408 | https://github.com/nrontsis/PILCO/blob/6a962c8e4172f9e7f29ed6e373c4be2dd4b69cb7/pilco/models/mgpr.py#L81, 409 | reimplemented from tensorflow to pytorch 410 | Args: 411 | x (torch.Tensor): matrix containing the states and actions. Dim=(Nm, Ns + Na + Nc) 412 | y (torch.Tensor): matrix containing the states change. Dim=(Nm, Ns) 413 | models (list of gpytorch.models.ExactGP): list containing the gp models used to predict each state change. 414 | Len=Ns 415 | Ns: number of states 416 | Na: number of actions 417 | Nc: number of context variables. At the moment, only 1 if time used 418 | Nm: number of points in memory 419 | 420 | Returns: 421 | iK (torch.Tensor): needed by the gaussian processes models to compute the predictions 422 | beta (torch.Tensor): needed by the gaussian processes models to compute the predictions 423 | 424 | """ 425 | K = torch.stack([model.covar_module(x).evaluate() for model in models]) 426 | batched_eye = torch.eye(K.shape[1]).repeat(K.shape[0], 1, 1) 427 | L = torch.linalg.cholesky(K + torch.stack([model.likelihood.noise for model in models])[:, None] * batched_eye) 428 | iK = torch.cholesky_solve(batched_eye, L) 429 | Y_ = (y.t())[:, :, None] 430 | beta = torch.cholesky_solve(Y_, L)[:, :, 0] 431 | return iK, beta 432 | -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/observations_states_mappers/abstract_observation_state_mapper.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from rl_gp_mpc.config_classes.observation_config import ObservationConfig 7 | 8 | class AbstractObservationStateMapper: 9 | def __init__(self, observation_low: Union[np.ndarray, torch.Tensor], observation_high: Union[np.ndarray, torch.Tensor], config:ObservationConfig): 10 | self.config=config 11 | self.obs_low = torch.Tensor(observation_low) 12 | self.obs_high = torch.Tensor(observation_high) 13 | self.var_norm_factor = torch.pow((self.obs_high - self.obs_low), 2) 14 | self.dim_observation = len(observation_low) 15 | self.dim_state = self.dim_observation 16 | 17 | def get_state(self, obs:Union[np.ndarray, torch.Tensor], obs_var:Union[np.ndarray, torch.Tensor], update_internals:bool) -> torch.Tensor: 18 | raise(NotImplementedError()) 19 | 20 | def get_obs(self): 21 | raise(NotImplementedError()) 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/observations_states_mappers/normalization_observation_state_mapper.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from rl_gp_mpc.config_classes.observation_config import ObservationConfig 7 | from .abstract_observation_state_mapper import AbstractObservationStateMapper 8 | 9 | 10 | class NormalizationObservationStateMapper(AbstractObservationStateMapper): 11 | def __init__(self, observation_low: Union[np.ndarray, torch.Tensor], observation_high: Union[np.ndarray, torch.Tensor], config:ObservationConfig): 12 | super().__init__(observation_low, observation_high, config) 13 | 14 | def 
get_state(self, obs: Union[np.ndarray, torch.Tensor], obs_var:Union[np.ndarray, torch.Tensor]=None, update_internals=False) -> torch.Tensor: 15 | state = (torch.Tensor(obs) - self.obs_low) / (self.obs_high - self.obs_low) 16 | 17 | if obs_var is not None: 18 | state_var = torch.Tensor(obs_var) / self.var_norm_factor 19 | else: 20 | state_var = self.config.obs_var_norm 21 | 22 | return state, state_var 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/states_reward_mappers/abstract_state_reward_mapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from rl_gp_mpc.config_classes.reward_config import RewardConfig 4 | 5 | class AbstractStateRewardMapper: 6 | def __init__(self, config:RewardConfig): 7 | self.config = config 8 | 9 | def get_reward(self, state_mu:torch.Tensor, state_var: torch.Tensor, action:torch.Tensor): 10 | raise NotImplementedError 11 | 12 | def get_reward_terminal(self, state_mu:torch.Tensor, state_var:torch.Tensor): 13 | raise NotImplementedError 14 | 15 | def get_rewards_trajectory(self, states_mu:torch.Tensor, states_var:torch.Tensor, actions:torch.Tensor) -> torch.Tensor: 16 | raise NotImplementedError 17 | 18 | -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/states_reward_mappers/setpoint_distance_reward_mapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from rl_gp_mpc.config_classes.reward_config import RewardConfig 4 | from .abstract_state_reward_mapper import AbstractStateRewardMapper 5 | from ..utils.pytorch_utils import normal_cdf 6 | 7 | 8 | class SetpointStateRewardMapper(AbstractStateRewardMapper): 9 | def __init__(self, config:RewardConfig): 10 | super().__init__(config) 11 | 12 | def get_reward(self, state_mu: torch.Tensor, state_var: torch.Tensor, action:torch.Tensor) -> float: 13 | """ 14 | Compute the quadratic cost of one state distribution or a trajectory of states distributions 15 | given the mean value and variance of states (observations), the weight matrix, and target state. 16 | The state, state_var and action must be normalized. 17 | If reading directly from the gym env observation, 18 | this can be done with the gym env action space and observation space. 19 | See an example of normalization in the add_points_memory function. 20 | Args: 21 | state_mu (torch.Tensor): normed mean value of the state or observation distribution 22 | (elements between 0 and 1). dim=(Ns) or dim=(Np, Ns) 23 | state_var (torch.Tensor): normed variance matrix of the state or observation distribution 24 | (elements between 0 and 1) 25 | dim=(Ns, Ns) or dim=(Np, Ns, Ns) 26 | action (torch.Tensor): normed actions. (elements between 0 and 1). 27 | dim=(Na) or dim=(Np, Na) 28 | 29 | Np: length of the prediction trajectory. (=self.len_horizon) 30 | Na: dimension of the gym environment actions 31 | Ns: dimension of the gym environment states 32 | 33 | Returns: 34 | cost_mu (torch.Tensor): mean value of the cost distribution. dim=(1) or dim=(Np) 35 | cost_var (torch.Tensor): variance of the cost distribution. 
dim=(1) or dim=(Np) 36 | """ 37 | error = torch.cat((state_mu, action), -1) - self.config.target_state_action_norm 38 | if state_var.ndim == 3: 39 | #error = torch.cat((state_mu, action), 1) - self.config.target_state_action_norm 40 | state_action_var = torch.cat(( 41 | torch.cat((state_var, torch.zeros((state_var.shape[0], state_var.shape[1], action.shape[1]))), 2), 42 | torch.zeros((state_var.shape[0], action.shape[1], state_var.shape[1] + action.shape[1]))), 1) 43 | else: 44 | #error = torch.cat((state_mu, action), 0) - self.config.target_state_action_norm 45 | state_action_var = torch.block_diag(state_var, torch.zeros((action.shape[0], action.shape[0]))) 46 | 47 | weight_matrix_cost = self.config.weight_matrix_cost 48 | cost_mu = torch.diagonal(torch.matmul(state_action_var, weight_matrix_cost), 49 | dim1=-1, dim2=-2).sum(-1) + \ 50 | torch.matmul(torch.matmul(error[..., None].transpose(-1, -2), weight_matrix_cost), 51 | error[..., None]).squeeze() 52 | TS = weight_matrix_cost @ state_action_var 53 | cost_var_term1 = torch.diagonal(2 * TS @ TS, dim1=-1, dim2=-2).sum(-1) 54 | cost_var_term2 = TS @ weight_matrix_cost 55 | cost_var_term3 = (4 * error[..., None].transpose(-1, -2) @ cost_var_term2 @ error[..., None]).squeeze() 56 | cost_var = cost_var_term1 + cost_var_term3 57 | 58 | if self.config.use_constraints: 59 | if state_mu.ndim == 2: 60 | penalty_min_constraint = torch.stack([normal_cdf(self.config.state_min, state_mu[time_idx], state_var[time_idx].diag()) for time_idx in range(state_mu.shape[0])]) 61 | penalty_max_constraint = torch.stack([1 - normal_cdf(self.config.state_max, state_mu[time_idx], state_var[time_idx].diag()) for time_idx in range(state_mu.shape[0])]) 62 | else: 63 | penalty_min_constraint = normal_cdf(self.config.state_min, state_mu, state_var.diag()) 64 | penalty_max_constraint = 1 - normal_cdf(self.config.state_max, state_mu, state_var.diag()) 65 | 66 | cost_mu = cost_mu + penalty_max_constraint.sum(-1) + penalty_min_constraint.sum(-1) 67 | 68 | return -cost_mu, cost_var 69 | 70 | def get_rewards(self, state_mu: torch.Tensor, state_var: torch.Tensor, action:torch.Tensor) -> float: 71 | """ 72 | Compute the quadratic cost of one state distribution or a trajectory of states distributions 73 | given the mean value and variance of states (observations), the weight matrix, and target state. 74 | The state, state_var and action must be normalized. 75 | If reading directly from the gym env observation, 76 | this can be done with the gym env action space and observation space. 77 | See an example of normalization in the add_points_memory function. 78 | Args: 79 | state_mu (torch.Tensor): normed mean value of the state or observation distribution 80 | (elements between 0 and 1). dim=(Ns) or dim=(Np, Ns) 81 | state_var (torch.Tensor): normed variance matrix of the state or observation distribution 82 | (elements between 0 and 1) 83 | dim=(Ns, Ns) or dim=(Np, Ns, Ns) 84 | action (torch.Tensor): normed actions. (elements between 0 and 1). 85 | dim=(Na) or dim=(Np, Na) 86 | 87 | Np: length of the prediction trajectory. (=self.len_horizon) 88 | Na: dimension of the gym environment actions 89 | Ns: dimension of the gym environment states 90 | 91 | Returns: 92 | reward_mu (torch.Tensor): mean value of the reward distribution. (=-cost) dim=(1) or dim=(Np) 93 | cost_var (torch.Tensor): variance of the cost distribution. 
dim=(1) or dim=(Np) 94 | """ 95 | 96 | error = torch.cat((state_mu, action), 1) - self.config.target_state_action_norm 97 | state_action_var = torch.cat(( 98 | torch.cat((state_var, torch.zeros((state_var.shape[0], state_var.shape[1], action.shape[1]))), 2), 99 | torch.zeros((state_var.shape[0], action.shape[1], state_var.shape[1] + action.shape[1]))), 1) 100 | 101 | cost_mu = torch.diagonal(torch.matmul(state_action_var, self.config.weight_matrix_cost), 102 | dim1=-1, dim2=-2).sum(-1) + \ 103 | torch.matmul(torch.matmul(error[..., None].transpose(-1, -2), self.config.weight_matrix_cost), 104 | error[..., None]).squeeze() 105 | TS = self.config.weight_matrix_cost @ state_action_var 106 | cost_var_term1 = torch.diagonal(2 * TS @ TS, dim1=-1, dim2=-2).sum(-1) 107 | cost_var_term2 = TS @ self.config.weight_matrix_cost 108 | cost_var_term3 = (4 * error[..., None].transpose(-1, -2) @ cost_var_term2 @ error[..., None]).squeeze() 109 | 110 | cost_var = cost_var_term1 + cost_var_term3 111 | if self.config.use_constraints: 112 | state_distribution = [torch.distributions.normal.Normal(state_mu[idx], state_var[idx]) for idx in 113 | range(state_mu.shape[0])] 114 | penalty_min_constraint = torch.stack([state_distribution[time_idx].cdf(torch.Tensor( 115 | self.config.state_min)) * self.config.area_multiplier for 116 | time_idx in range(state_mu.shape[0])]).diagonal(0, -1, -2).sum(-1) 117 | penalty_max_constraint = torch.stack([(1 - state_distribution[time_idx].cdf(torch.Tensor( 118 | self.config.state_max))) * self.config.area_multiplier for 119 | time_idx in range(state_mu.shape[0])]).diagonal(0, -1, -2).sum(-1) 120 | cost_mu = cost_mu + penalty_max_constraint + penalty_min_constraint 121 | 122 | return -cost_mu, cost_var 123 | 124 | def get_reward_terminal(self, state_mu:torch.Tensor, state_var:torch.Tensor) -> float: 125 | """ 126 | Compute the terminal cost of the prediction trajectory. 127 | Args: 128 | state_mu (torch.Tensor): mean value of the terminal state distribution. dim=(Ns) 129 | state_var (torch.Tensor): variance matrix of the terminal state distribution. dim=(Ns, Ns) 130 | 131 | Returns: 132 | reward_mu (torch.Tensor): mean value of the reward distribution. (=-cost) dim=(1) 133 | cost_var (torch.Tensor): variance of the cost distribution. 
dim=(1) 134 | """ 135 | error = state_mu - self.config.target_state_norm 136 | cost_mu = torch.trace(torch.matmul(state_var, self.config.weight_matrix_cost_terminal)) + \ 137 | torch.matmul(torch.matmul(error.t(), self.config.weight_matrix_cost_terminal), error) 138 | TS = self.config.weight_matrix_cost_terminal @ state_var 139 | cost_var_term1 = torch.trace(2 * TS @ TS) 140 | cost_var_term2 = 4 * error.t() @ TS @ self.config.weight_matrix_cost_terminal @ error 141 | cost_var = cost_var_term1 + cost_var_term2 142 | return -cost_mu, cost_var 143 | 144 | def get_rewards_trajectory(self, states_mu:torch.Tensor, states_var:torch.Tensor, actions:torch.Tensor) -> torch.Tensor: 145 | rewards_traj, rewards_traj_var = self.get_reward(states_mu[:-1], states_var[:-1], actions) 146 | cost_traj_final, rewards_traj_var_final = self.get_reward_terminal(states_mu[-1], states_var[-1]) 147 | rewards_traj = torch.cat((rewards_traj, cost_traj_final[None]), 0) 148 | rewards_traj_var = torch.cat((rewards_traj_var, rewards_traj_var_final[None]), 0) 149 | return rewards_traj, rewards_traj_var 150 | 151 | 152 | def compute_squared_dist_cost(error, state_action_var, weight_matrix_cost): 153 | cost_mu = torch.diagonal(torch.matmul(state_action_var, weight_matrix_cost), 154 | dim1=-1, dim2=-2).sum(-1) + \ 155 | torch.matmul(torch.matmul(error[..., None].transpose(-1, -2), weight_matrix_cost), 156 | error[..., None]).squeeze() 157 | TS = weight_matrix_cost @ state_action_var 158 | cost_var_term1 = torch.diagonal(2 * TS @ TS, dim1=-1, dim2=-2).sum(-1) 159 | cost_var_term2 = TS @ weight_matrix_cost 160 | cost_var_term3 = (4 * error[..., None].transpose(-1, -2) @ cost_var_term2 @ error[..., None]).squeeze() 161 | cost_var = cost_var_term1 + cost_var_term3 162 | return cost_mu, cost_var -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/utils/data_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def form_model_input(state, action_model, time_idx, include_time_model, dim_input): 5 | x = torch.empty(dim_input) 6 | x[:((state.shape[0] + action_model.shape[0]))] = torch.cat((state, action_model)) 7 | if include_time_model: 8 | x[-1] = time_idx 9 | return x -------------------------------------------------------------------------------- /rl_gp_mpc/control_objects/utils/pytorch_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt 3 | 4 | class Clamp(torch.autograd.Function): 5 | # This class allow the flow of gradient when at the boundary. Otherwise, when using the clamp function from pytorch, the action remains blocked at 0 or 1 6 | 7 | @staticmethod 8 | def forward(ctx, input, min, max): 9 | return input.clamp(min=min, max=max) 10 | 11 | @staticmethod 12 | def backward(ctx, grad_output): 13 | return grad_output.clone(), None, None 14 | 15 | 16 | def normal_cdf(x, mu, sigma): 17 | return 0.5 * (1 + torch.erf((x - mu) / (sigma * sqrt(2)))) -------------------------------------------------------------------------------- /rl_gp_mpc/envs/process_control.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym 3 | from gym import spaces 4 | 5 | 6 | class ProcessControl(gym.Env): 7 | """ 8 | Example of custom Environment that follows gym interface. 
9 | The additional methods: 10 | - define_params: used to vary the internal parameters 11 | of the env to check the robustness of the control to changes in the env. 12 | - get_obs: method that returns the observation given the state of the env. 13 | It is used to add noise to the measurement. 14 | For example, the state is the volume, but the level is measured. 15 | If change_params is set to True, the internal params will be changed every period_change iterations of the env 16 | Params are ranges because they are chosen at random between the two values provided. """ 17 | metadata = {'render.modes': []} 18 | 19 | def __init__(self, dt=1, s_range=(9, 11), fi_range=(0., 0.2), ci_range=(0, 0.2), 20 | cr_range=(0.5, 1), noise_l_prop_range=(1e-5, 1e-3), 21 | noise_co_prop_range=(1e-5, 1e-3), sp_l_range=(0.2, 0.8), 22 | sp_co_range=(0.2, 0.4), change_params=True, period_change=50): 23 | """ 24 | The environment simulates a tank with two input flows that have different concentrations of product. 25 | The flow fi with concentration ci is not controllable. 26 | The flow with concentration cr is controllable with action_1. 27 | action_0 controls the output flow 28 | state0 = level 29 | state1 = concentration of product 30 | 31 | s: cross-section area of the tank 32 | ci: concentration of product in flow fi 33 | cr: concentration of product of the flow controlled by action_1 34 | 35 | There is Gaussian noise on the measurement of the level 36 | noise_l_prop: noise on level 37 | noise_co: noise on concentration 38 | 39 | The parameters are taken randomly between the ranges provided. 40 | If change_params is set to True, the parameters are changed every period_change timesteps. 41 | This can be used to check how robust the method is to environment changes. 42 | dt: time between time steps 43 | """ 44 | super().__init__() 45 | self.name = 'processcontrol' 46 | # Definitions for gym env: action and observation space 47 | # They must be gym.spaces objects 48 | self.observation_space = spaces.Box( 49 | low=np.array([0, 0]), 50 | high=np.array([10, 1]), 51 | shape=(2,), dtype=np.float32) 52 | self.reward_range = (0, 1) 53 | self.action_space = spaces.Box( 54 | low=np.array([0, 0]), 55 | high=np.array([1, 1]), 56 | shape=(2,), dtype=np.float32) 57 | 58 | # definitions specific to my env 59 | self.dt = dt 60 | self.s_range = s_range 61 | self.fi_range = fi_range 62 | self.ci_range = ci_range 63 | self.cr_range = cr_range 64 | self.noise_l_prop_range = noise_l_prop_range 65 | self.noise_co_prop_range = noise_co_prop_range 66 | self.sp_l_range = sp_l_range 67 | self.sp_co_range = sp_co_range 68 | self.change_params = change_params 69 | self.period_change = period_change 70 | # initialization of parameters and states 71 | self.define_params() 72 | 73 | def define_params(self): 74 | """ 75 | Define the env parameters.
It is not specific to gym 76 | """ 77 | self.s = np.random.uniform(self.s_range[0], self.s_range[1]) 78 | self.fi = np.random.uniform(self.fi_range[0], self.fi_range[1]) 79 | self.ci = np.random.uniform(self.ci_range[0], self.ci_range[1]) 80 | self.cr = np.random.uniform(self.cr_range[0], self.cr_range[1]) 81 | self.noise_l_prop = np.exp(np.random.uniform( 82 | np.log(self.noise_l_prop_range[0]), 83 | np.log(self.noise_l_prop_range[1]))) 84 | self.noise_co_prop = np.exp(np.random.uniform( 85 | np.log(self.noise_co_prop_range[0]), 86 | np.log(self.noise_co_prop_range[1]))) 87 | self.sp_l = np.random.uniform(self.sp_l_range[0], self.sp_l_range[1]) 88 | self.sp_co = np.random.uniform(self.sp_co_range[0], self.sp_co_range[1]) 89 | 90 | if hasattr(self, 'v'): 91 | self.clip_parameters() 92 | 93 | print("New params value: s: {:.2f}, fi: {:.2f}, ci: {:.2f}, cr: {:.2f}, " 94 | "noise_l: {:.4f}, noise_co: {:.4f}, sp_l: {:.2f}, sp_co: {:.2f}".format( 95 | self.s, self.fi, self.ci, self.cr, self.noise_l_prop, self.noise_co_prop, self.sp_l, self.sp_co)) 96 | 97 | def step(self, action): 98 | dv = (self.fi + action[1] - action[0]) 99 | dr = (self.fi * self.ci + action[1] * self.cr - action[0] * self.r / (self.v + 1e-3)) 100 | self.v += dv * self.dt 101 | self.r += dr * self.dt 102 | self.iter += 1 103 | 104 | self.v = np.clip(self.v, 105 | self.observation_space.low[0] * self.s, 106 | self.observation_space.high[0] * self.s) 107 | self.r = np.clip(self.r, 108 | self.observation_space.low[1] * self.v, 109 | self.observation_space.high[1] * self.v) 110 | 111 | reward = - (pow(self.v / self.s - self.sp_l, 2) + pow(self.r / (self.v + 1e-6) - self.sp_co, 2)) 112 | done = 0 113 | info = {} 114 | if self.change_params and self.iter % self.period_change == 0: 115 | self.define_params() 116 | 117 | return self.get_obs(), reward, done, info 118 | 119 | def reset(self, min_prop=0.3, max_prop=0.7): 120 | # Reset the state of the environment to an initial state 121 | self.iter = 0 122 | ranges = (self.observation_space.high - self.observation_space.low) 123 | obs_sample = np.clip(self.observation_space.sample(), 124 | min_prop * ranges + self.observation_space.low, 125 | max_prop * ranges + self.observation_space.low) 126 | self.v = obs_sample[0] * self.s 127 | self.r = obs_sample[1] * self.v 128 | return self.get_obs() 129 | 130 | def get_obs(self): 131 | """ 132 | Get observation given the internal states. 
It is not specific to gym 133 | """ 134 | l_mes = self.v / self.s 135 | co_mes = self.r / (self.v + 1e-6) 136 | if self.noise_l_prop != 0: 137 | l_mes += np.random.normal(0, self.noise_l_prop * self.observation_space.high[0]) 138 | if self.noise_co_prop != 0: 139 | co_mes += np.random.normal(0, self.noise_co_prop * self.observation_space.high[1]) 140 | l_mes = np.clip(l_mes, self.observation_space.low[0], self.observation_space.high[0]) 141 | co_mes = np.clip(co_mes, self.observation_space.low[1], self.observation_space.high[1]) 142 | return np.array([l_mes, co_mes]) 143 | 144 | def render(self, mode='human', close=False): 145 | # Render the environment 146 | pass 147 | 148 | def clip_parameters(self, prop_level_max_after_reset=0.9): 149 | v_p = self.v 150 | self.v = np.clip(self.v, 151 | a_min=0, 152 | a_max=prop_level_max_after_reset * (self.s * self.observation_space.high[0]) 153 | ) 154 | self.r = self.r * self.v / v_p 155 | -------------------------------------------------------------------------------- /rl_gp_mpc/run_env_function.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | from gym.core import Env 5 | 6 | from rl_gp_mpc import GpMpcController 7 | from rl_gp_mpc import ControlVisualizations 8 | from rl_gp_mpc.config_classes.visu_config import VisuConfig 9 | from rl_gp_mpc.config_classes.total_config import Config 10 | 11 | NUM_DECIMALS_REPR = 3 12 | np.set_printoptions(precision=NUM_DECIMALS_REPR, suppress=True) 13 | 14 | def run_env(env: Env, control_config:Config, visu_config: VisuConfig, random_actions_init=10, num_steps=150, verbose=True): 15 | visu_obj = ControlVisualizations(env=env, num_steps=num_steps, control_config=control_config, visu_config=visu_config) 16 | 17 | ctrl_obj = GpMpcController(observation_low=env.observation_space.low, 18 | observation_high=env.observation_space.high, 19 | action_low=env.action_space.low, 20 | action_high=env.action_space.high, 21 | config=control_config) 22 | 23 | obs = env.reset() 24 | 25 | for idx_ctrl in range(num_steps): 26 | action_is_random = (idx_ctrl < random_actions_init) 27 | action = ctrl_obj.get_action(obs_mu=obs, random=action_is_random) 28 | 29 | iter_info = ctrl_obj.get_iter_info() 30 | 31 | cost, cost_var = ctrl_obj.compute_cost_unnormalized(obs, action) 32 | visu_obj.update(obs=obs, reward=-cost, action=action, env=env, iter_info=iter_info) 33 | 34 | obs_new, reward, done, info = env.step(action) 35 | 36 | ctrl_obj.add_memory(obs=obs, action=action, obs_new=obs_new, 37 | reward=-cost, 38 | predicted_state=iter_info.predicted_states[1], 39 | predicted_state_std=iter_info.predicted_states_std[1]) 40 | obs = obs_new 41 | if verbose: 42 | print(str(iter_info)) 43 | 44 | 45 | visu_obj.save(ctrl_obj) 46 | ctrl_obj.check_and_close_processes() 47 | env.__exit__() 48 | visu_obj.close() 49 | return visu_obj.get_costs() 50 | 51 | 52 | def run_env_multiple(env, env_name, control_config:Config, visu_config: VisuConfig, num_runs, random_actions_init=10, num_steps=150, verbose=True): 53 | costs_runs = [] 54 | for run_idx in range(num_runs): 55 | costs_iter = run_env(env, control_config, visu_config, random_actions_init, num_steps, verbose=verbose) 56 | costs_runs.append(costs_iter) 57 | time.sleep(1) 58 | 59 | costs_runs = np.array(costs_runs) 60 | 61 | costs_runs_mean = np.mean(costs_runs, axis=0) 62 | costs_runs_std = np.std(costs_runs, axis=0) 63 | 64 | x_axis = np.arange(len(costs_runs_mean)) 65 | fig, ax = 
plt.subplots(figsize=(10, 5)) 66 | ax.plot(x_axis, costs_runs_mean) 67 | ax.fill_between(x_axis, costs_runs_mean - costs_runs_std, costs_runs_mean + costs_runs_std, alpha=0.4) 68 | plt.title(f"Costs of multiples {env_name} runs") 69 | plt.ylabel("Cost") 70 | plt.xlabel("Env iteration") 71 | plt.savefig(f'multiple_runs_costs_{env_name}.png') 72 | plt.show() -------------------------------------------------------------------------------- /rl_gp_mpc/visu_objects/dynamic_2d_graph.py: -------------------------------------------------------------------------------- 1 | import time 2 | import threading 3 | import queue 4 | import multiprocessing 5 | 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import imageio 10 | 11 | from rl_gp_mpc.control_objects.controllers.iteration_info_class import IterationInformation 12 | 13 | # matplotlib.use('TkAgg') 14 | matplotlib.rc('font', size='6') 15 | ALPHA_CONFIDENCE_BOUNDS = 0.3 16 | FONTSIZE = 6 17 | 18 | MUL_STD_BOUNDS = 3 19 | FONTSIZE = 6 20 | 21 | 22 | class LivePlotParallel: 23 | def __init__(self, num_steps_total, dim_states, dim_actions, use_constraints=False, 24 | state_min=None, state_max=None, time_between_updates=0.75, use_thread=False, 25 | path_save="control_animation.gif", save=False): 26 | self.use_thread = use_thread 27 | 28 | if use_thread: 29 | self.pqueue = queue.Queue() 30 | self.graph_p = threading.Thread(target=anim_plots_2d_p, 31 | args=(self.pqueue, num_steps_total, dim_states, 32 | dim_actions, use_constraints, 33 | state_min, state_max, time_between_updates, path_save, save)) 34 | else: 35 | self.pqueue = multiprocessing.Queue() 36 | self.graph_p = multiprocessing.Process(target=anim_plots_2d_p, 37 | args=(self.pqueue, num_steps_total, dim_states, 38 | dim_actions, use_constraints, 39 | state_min, state_max, time_between_updates, path_save, save)) 40 | self.graph_p.start() 41 | 42 | def update(self, state, action, cost, iter_info:IterationInformation=None): 43 | if iter_info is None: 44 | self.pqueue.put([state, action, cost]) 45 | else: 46 | self.pqueue.put([state, action, cost, 47 | iter_info.mean_predicted_cost, 48 | iter_info.mean_predicted_cost_std, 49 | iter_info.predicted_states, 50 | iter_info.predicted_states_std, 51 | iter_info.predicted_actions, 52 | iter_info.predicted_costs, 53 | iter_info.predicted_costs_std, 54 | iter_info.predicted_idxs]) 55 | def close(self): 56 | self.pqueue.put(None) 57 | if self.use_thread: 58 | self.graph_p.join() 59 | else: 60 | self.pqueue.close() 61 | self.graph_p.join() 62 | self.graph_p.close() 63 | 64 | def __exit__(self): 65 | if 'graph_p' in self.__dict__ and self.graph_p.is_alive(): 66 | self.close() 67 | 68 | 69 | def anim_plots_2d_p(queue, num_steps_total, dim_states, dim_actions, use_constraints=False, state_min=None, state_max=None, time_between_updates=0.5, path_save="control_animation.gif", save=False): 70 | fig, axes = plt.subplots(nrows=3, figsize=(10, 6), sharex=True) 71 | axes[0].set_title('Normed states and predictions') 72 | axes[1].set_title('Normed actions') 73 | axes[2].set_title('Cost and horizon cost') 74 | plt.xlabel("Env steps") 75 | min_state = -0.03 76 | max_state = 1.03 77 | axes[0].set_ylim(min_state, max_state) 78 | axes[0].grid() 79 | axes[1].set_ylim(-0.03, 1.03) 80 | axes[1].grid() 81 | axes[2].set_xlim(0, num_steps_total) 82 | axes[2].grid() 83 | plt.tight_layout() 84 | 85 | states = np.empty((num_steps_total, dim_states)) 86 | actions = np.empty((num_steps_total, dim_actions)) 87 | costs = np.empty(num_steps_total) 88 | 
mean_cost_pred = np.empty_like(costs) 89 | mean_cost_std_pred = np.empty_like(mean_cost_pred) 90 | 91 | if use_constraints: 92 | if state_min is not None: 93 | for idx_state in range(dim_states): 94 | axes[0].axhline(y=state_min[idx_state], 95 | color=plt.get_cmap('tab20').colors[2 * idx_state], 96 | linestyle='-.') 97 | if state_max is not None: 98 | for idx_state in range(dim_states): 99 | axes[0].axhline(y=state_max[idx_state], 100 | color=plt.get_cmap('tab20').colors[2 * idx_state], 101 | linestyle='-.') 102 | lines_states = [axes[0].plot([], [], label='state' + str(state_idx), 103 | color=plt.get_cmap('tab20').colors[2 * state_idx]) for state_idx in range(dim_states)] 104 | lines_actions = [axes[1].step([], [], label='action' + str(action_idx), 105 | color=plt.get_cmap('tab20').colors[1 + 2 * action_idx], where='post') for action_idx in range(dim_actions)] 106 | 107 | line_cost = axes[2].plot([], [], label='cost', color='k') 108 | line_costs_mean_pred = axes[2].plot([], [], label='mean predicted cost', color='orange') 109 | 110 | lines_predicted_states = [axes[0].plot([], [], 111 | label='predicted_states' + str(state_idx), 112 | color=plt.get_cmap('tab20').colors[2 * state_idx], linestyle='dashed') 113 | for state_idx in range(dim_states)] 114 | lines_actions_pred = [axes[1].step([], [], label='predicted_action' + str(action_idx), 115 | color=plt.get_cmap('tab20').colors[1 + 2 * action_idx], linestyle='dashed', where='post') 116 | for action_idx in range(dim_actions)] 117 | line_predicted_costs = axes[2].plot([], [], label='predicted cost', color='k', linestyle='dashed') 118 | 119 | axes[0].legend(fontsize=FONTSIZE) 120 | axes[1].legend(fontsize=FONTSIZE) 121 | axes[2].legend(fontsize=FONTSIZE) 122 | fig.canvas.draw() 123 | plt.show(block=False) 124 | 125 | num_pts_show = 0 126 | last_update_time = time.time() 127 | exit_loop = False 128 | if save: 129 | writer = imageio.get_writer(path_save, mode='I') 130 | while True: 131 | if exit_loop: break 132 | if (time.time() - last_update_time) > time_between_updates: 133 | got_data = False 134 | last_update_time = time.time() 135 | while not (queue.empty()): 136 | msg = queue.get() # Read from the queue 137 | if msg is None: # Allows to gracefully end the process by send None, then join and close 138 | exit_loop = True 139 | continue 140 | obs_norm = np.array(msg[0]) 141 | action_norm = np.array(msg[1]) 142 | states[num_pts_show] = obs_norm 143 | actions[num_pts_show] = action_norm 144 | costs[num_pts_show] = msg[2] 145 | 146 | update_limits = False 147 | min_state_actual = np.min(states[num_pts_show]) 148 | if min_state_actual < min_state: 149 | min_state = min_state_actual 150 | update_limits = True 151 | 152 | max_state_actual = np.max(states[num_pts_show]) 153 | if max_state_actual > max_state: 154 | max_state = max_state_actual 155 | update_limits = True 156 | 157 | if update_limits: 158 | axes[0].set_ylim(min_state, max_state) 159 | 160 | if len(msg) > 3: 161 | mean_cost_pred[num_pts_show] = np.nan_to_num(msg[3], nan=-1, posinf=-1, neginf=-1) 162 | mean_cost_std_pred[num_pts_show] = np.nan_to_num(msg[4], copy=True, nan=99, posinf=99, neginf=99) 163 | else: 164 | if num_pts_show == 0: 165 | # Init values for mean cost pred and mean cost std pred at the beginning when the control 166 | # has not started yet and only random actions has been applied. 
167 | # Thus, the mean future cost and the std of the mean future cost has not been computed yet 168 | # and have not been provided to the graph object for the update 169 | mean_cost_pred[num_pts_show] = 0 170 | mean_cost_std_pred[num_pts_show] = 0 171 | else: 172 | # For when the information about prediction have not been provided to the graph object 173 | # for the update, for example at the init period when random actions are applied 174 | mean_cost_pred[num_pts_show] = mean_cost_pred[num_pts_show - 1] 175 | mean_cost_std_pred[num_pts_show] = mean_cost_std_pred[num_pts_show - 1] 176 | num_pts_show += 1 177 | got_data = True 178 | last_update_time = time.time() 179 | 180 | if got_data: 181 | for idx_axes in range(len(axes)): 182 | axes[idx_axes].collections.clear() 183 | 184 | idxs = np.arange(0, num_pts_show) 185 | for idx_axes in range(len(axes)): 186 | axes[idx_axes].collections.clear() 187 | 188 | for idx_state in range(len(obs_norm)): 189 | lines_states[idx_state][0].set_data(idxs, states[:num_pts_show, idx_state]) 190 | 191 | for idx_action in range(len(action_norm)): 192 | lines_actions[idx_action][0].set_data(idxs, actions[:num_pts_show, idx_action]) 193 | 194 | line_cost[0].set_data(idxs, costs[:num_pts_show]) 195 | 196 | line_costs_mean_pred[0].set_data(idxs, mean_cost_pred[:num_pts_show]) 197 | axes[2].fill_between(idxs, 198 | mean_cost_pred[:num_pts_show] - 199 | mean_cost_std_pred[:num_pts_show] * MUL_STD_BOUNDS, 200 | mean_cost_pred[:num_pts_show] + 201 | mean_cost_std_pred[:num_pts_show] * MUL_STD_BOUNDS, 202 | facecolor='orange', alpha=ALPHA_CONFIDENCE_BOUNDS, 203 | label='mean predicted ' + str(MUL_STD_BOUNDS) + ' std cost bounds') 204 | 205 | axes[2].set_ylim(0, np.max([np.max(mean_cost_pred[:num_pts_show]), 206 | np.max(costs[:num_pts_show])]) * 1.1) 207 | axes[2].set_ylim(0, np.max(line_cost[0].get_ydata()) * 1.1) 208 | 209 | # If predictions are not given in the last update, do not show them on the graph 210 | if len(msg) > 3: 211 | predicted_states = np.nan_to_num(msg[5], nan=-1, posinf=-1, neginf=-1) 212 | predicted_states_std = np.nan_to_num(msg[6], copy=False, nan=99, posinf=99, neginf=99) 213 | actions_pred = msg[7] 214 | predicted_costs = np.nan_to_num(msg[8], nan=-1, posinf=-1, neginf=-1) 215 | predicted_costs_std = np.nan_to_num(msg[9], nan=99, posinf=99, neginf=0) 216 | predicted_idxs = np.array(msg[10]) 217 | step_idx = (predicted_idxs[-1] - predicted_idxs[-2]) 218 | predicted_idxs_states = np.concatenate((predicted_idxs, [predicted_idxs[-1] + step_idx])) 219 | 220 | for idx_state in range(len(obs_norm)): 221 | future_states_show = predicted_states[:, idx_state] 222 | lines_predicted_states[idx_state][0].set_data(predicted_idxs_states, future_states_show) 223 | 224 | future_states_std_show = predicted_states_std[:, idx_state] 225 | axes[0].fill_between(predicted_idxs_states, 226 | future_states_show - future_states_std_show * MUL_STD_BOUNDS, 227 | future_states_show + future_states_std_show * MUL_STD_BOUNDS, 228 | facecolor=plt.get_cmap('tab20').colors[2 * idx_state], alpha=ALPHA_CONFIDENCE_BOUNDS, 229 | label='predicted ' + str(MUL_STD_BOUNDS) + ' std bounds state ' + str(idx_state)) 230 | 231 | for idx_action in range(len(action_norm)): 232 | future_actions_show = actions_pred[:, idx_action] 233 | lines_actions_pred[idx_action][0].set_data(predicted_idxs, future_actions_show) # 234 | 235 | future_costs_show = predicted_costs 236 | line_predicted_costs[0].set_data(predicted_idxs_states, future_costs_show) 237 | 238 | future_cost_std_show = 
predicted_costs_std 239 | axes[2].fill_between(predicted_idxs_states, 240 | future_costs_show - future_cost_std_show * MUL_STD_BOUNDS, 241 | future_costs_show + future_cost_std_show * MUL_STD_BOUNDS, 242 | facecolor='black', alpha=ALPHA_CONFIDENCE_BOUNDS, 243 | label='predicted ' + str(MUL_STD_BOUNDS) + ' std cost bounds') 244 | 245 | for ax in axes: 246 | ax.relim() 247 | ax.autoscale_view(True, True, True) 248 | 249 | fig.canvas.draw() 250 | if save: 251 | image = np.frombuffer(fig.canvas.tostring_rgb(), dtype='uint8') 252 | image = image.reshape(fig.canvas.get_width_height()[::-1] + (3,)) 253 | # Append the image to the animation 254 | writer.append_data(image) 255 | plt.pause(0.01) 256 | time.sleep(0.05) 257 | if save: writer.close() 258 | plt.close('all') 259 | 260 | 261 | class LivePlotSequential: 262 | ''' 263 | DEPRECATED CLASS, WILL NOT WORK 264 | ''' 265 | def __init__(self, num_steps_total, obs_space, action_space, step_pred, use_constraints=False, state_min=None, state_max=None): 266 | self.fig, self.axes = plt.subplots(nrows=3, figsize=(6, 5), sharex=True) 267 | self.axes[0].set_title('Normed states and predictions') 268 | self.axes[1].set_title('Normed actions') 269 | self.axes[2].set_title('Cost and horizon cost') 270 | plt.xlabel("Env steps") 271 | self.min_state = -0.03 272 | self.max_state = 1.03 273 | self.axes[0].set_ylim(self.min_state, self.max_state) 274 | self.axes[1].set_ylim(-0.03, 1.03) 275 | self.axes[2].set_xlim(0, num_steps_total) 276 | plt.tight_layout() 277 | 278 | self.step_pred = step_pred 279 | 280 | self.states = np.empty((num_steps_total, obs_space.shape[0])) 281 | self.actions = np.empty((num_steps_total, action_space.shape[0])) 282 | self.costs = np.empty(num_steps_total) 283 | 284 | self.mean_predicted_cost = np.empty_like(self.costs) 285 | self.mean_predicted_cost_std = np.empty_like(self.costs) 286 | 287 | self.min_obs = obs_space.low 288 | self.max_obs = obs_space.high 289 | self.min_action = action_space.low 290 | self.max_action = action_space.high 291 | 292 | self.num_points_show = 0 293 | if use_constraints: 294 | if state_min is not None: 295 | for state_idx in range(obs_space.shape[0]): 296 | self.axes[0].axhline(y=state_min[state_idx], 297 | color=plt.get_cmap('tab20').colors[2 * state_idx], 298 | linestyle='-.') 299 | if state_max is not None: 300 | for state_idx in range(obs_space.shape[0]): 301 | self.axes[0].axhline(y=state_max[state_idx], 302 | color=plt.get_cmap('tab20').colors[2 * state_idx], 303 | linestyle='-.') 304 | self.lines_states = [self.axes[0].plot([], [], label='state' + str(state_idx), 305 | color=plt.get_cmap('tab20').colors[2 * state_idx]) for state_idx in range(obs_space.shape[0])] 306 | self.line_cost = self.axes[2].plot([], [], label='cost', color='k') 307 | 308 | self.lines_actions = [self.axes[1].step([], [], label='action' + str(action_idx), 309 | color=plt.get_cmap('tab20').colors[1 + 2 * action_idx]) for action_idx in range(action_space.shape[0])] 310 | 311 | self.line_mean_predicted_cost = self.axes[2].plot([], [], label='mean predicted cost', color='orange') 312 | 313 | self.lines_predicted_states = [self.axes[0].plot([], [], 314 | label='predicted_states' + str(state_idx), 315 | color=plt.get_cmap('tab20').colors[2 * state_idx], linestyle='dashed') 316 | for state_idx in range(obs_space.shape[0])] 317 | self.lines_actions_pred = [self.axes[1].step([], [], label='predicted_action' + str(action_idx), 318 | color=plt.get_cmap('tab20').colors[1 + 2 * action_idx], linestyle='dashed') 319 | for action_idx in 
range(action_space.shape[0])] 320 | self.line_predicted_costs = self.axes[2].plot([], [], label='predicted cost', color='k', linestyle='dashed') 321 | 322 | self.axes[0].legend(fontsize=FONTSIZE) 323 | self.axes[0].grid() 324 | self.axes[1].legend(fontsize=FONTSIZE) 325 | self.axes[1].grid() 326 | self.axes[2].legend(fontsize=FONTSIZE) 327 | self.axes[2].grid() 328 | plt.show(block=False) 329 | 330 | def update(self, obs, action, cost, iter_info:IterationInformation=None): 331 | obs_norm = (obs - self.min_obs) / (self.max_obs - self.min_obs) 332 | action_norm = (action - self.min_action) / (self.max_action - self.min_action) 333 | self.states[self.num_points_show] = obs_norm 334 | self.costs[self.num_points_show] = cost 335 | 336 | update_limits = False 337 | min_state_actual = np.min(obs_norm) 338 | if min_state_actual < self.min_state: 339 | self.min_state = min_state_actual 340 | update_limits = True 341 | 342 | max_state_actual = np.max(obs_norm) 343 | if max_state_actual > self.max_state: 344 | self.max_state = max_state_actual 345 | update_limits = True 346 | 347 | if update_limits: 348 | self.axes[0].set_ylim(self.min_state, self.max_state) 349 | 350 | idxs = np.arange(0, (self.num_points_show + 1)) 351 | for idx_axes in range(len(self.axes)): 352 | self.axes[idx_axes].collections.clear() 353 | 354 | for idx_state in range(len(obs_norm)): 355 | self.lines_states[idx_state][0].set_data(idxs, self.states[:(self.num_points_show + 1), idx_state]) 356 | 357 | self.actions[self.num_points_show] = action_norm 358 | for idx_action in range(len(action_norm)): 359 | self.lines_actions[idx_action][0].set_data(idxs, self.actions[:(self.num_points_show + 1), idx_action]) 360 | 361 | self.line_cost[0].set_data(idxs, self.costs[:(self.num_points_show + 1)]) 362 | 363 | if iter_info is not None: 364 | np.nan_to_num(iter_info.mean_predicted_cost, copy=False, nan=-1, posinf=-1, neginf=-1) 365 | np.nan_to_num(iter_info.mean_predicted_cost_std, copy=False, nan=99, posinf=99, neginf=99) 366 | np.nan_to_num(iter_info.predicted_states, copy=False, nan=-1, posinf=-1, neginf=-1) 367 | np.nan_to_num(iter_info.predicted_states_std, copy=False, nan=99, posinf=99, neginf=0) 368 | np.nan_to_num(iter_info.predicted_costs, copy=False, nan=-1, posinf=-1, neginf=-1) 369 | np.nan_to_num(iter_info.predicted_costs_std, copy=False, nan=99, posinf=99, neginf=0) 370 | # if num_repeat_action is not 1, the control do not happen at each iteration, 371 | # we must select the last index where the control happened as the start of the prediction horizon 372 | idx_prev_control = (idxs[-1] // self.step_pred) * self.step_pred 373 | idxs_future = iter_info.predicted_idxs #np.arange(idx_prev_control, 374 | # idx_prev_control + self.step_pred + len(iter_info.predicted_states) * self.step_pred, 375 | # self.step_pred) 376 | 377 | self.mean_predicted_cost[self.num_points_show] = iter_info.mean_predicted_cost 378 | self.mean_predicted_cost_std[self.num_points_show] = iter_info.mean_predicted_cost_std 379 | 380 | for idx_state in range(len(obs_norm)): 381 | future_states_show = np.concatenate(([self.states[idx_prev_control, idx_state]], iter_info.predicted_states[:, idx_state])) 382 | self.lines_predicted_states[idx_state][0].set_data(idxs_future, future_states_show) 383 | future_states_std_show = np.concatenate(([0], iter_info.predicted_states_std[:, idx_state])) 384 | self.axes[0].fill_between(idxs_future, 385 | future_states_show - future_states_std_show * MUL_STD_BOUNDS, 386 | future_states_show + future_states_std_show * 
MUL_STD_BOUNDS, 387 | facecolor=plt.get_cmap('tab20').colors[2 * idx_state], alpha=ALPHA_CONFIDENCE_BOUNDS, 388 | label='predicted ' + str(MUL_STD_BOUNDS) + ' std bounds state ' + str(idx_state)) 389 | for idx_action in range(len(action_norm)): 390 | self.lines_actions_pred[idx_action][0].set_data(idxs_future, iter_info.actions_pred[:, idx_action]) 391 | #) np.concatenate(([self.actions[idx_prev_control, idx_action]], ) 392 | 393 | future_costs_show = np.concatenate(([self.costs[idx_prev_control]], iter_info.predicted_costs)) 394 | self.line_predicted_costs[0].set_data(idxs_future, future_costs_show) 395 | 396 | future_cost_std_show = np.concatenate(([0], iter_info.predicted_costs_std)) 397 | self.axes[2].fill_between(idxs_future, 398 | future_costs_show - future_cost_std_show * MUL_STD_BOUNDS, 399 | future_costs_show + future_cost_std_show * MUL_STD_BOUNDS, 400 | facecolor='black', alpha=ALPHA_CONFIDENCE_BOUNDS, 401 | label='predicted ' + str(MUL_STD_BOUNDS) + ' std cost bounds') 402 | else: 403 | if self.num_points_show == 0: 404 | self.mean_predicted_cost[self.num_points_show] = MEAN_PRED_COST_INIT 405 | self.mean_predicted_cost_std[self.num_points_show] = STD_MEAN_PRED_COST_INIT 406 | else: 407 | self.mean_predicted_cost[self.num_points_show] = self.mean_predicted_cost[self.num_points_show - 1] 408 | self.mean_predicted_cost_std[self.num_points_show] = self.mean_predicted_cost_std[self.num_points_show - 1] 409 | 410 | self.line_mean_predicted_cost[0].set_data(idxs, self.mean_predicted_cost[:(self.num_points_show + 1)]) 411 | self.axes[2].set_ylim(0, np.max([np.max(self.mean_predicted_cost[:(self.num_points_show + 1)]), 412 | np.max(self.costs[:(self.num_points_show + 1)])]) * 1.1) 413 | self.axes[2].fill_between(idxs, 414 | self.mean_predicted_cost[:(self.num_points_show + 1)] - 415 | self.mean_predicted_cost_std[:(self.num_points_show + 1)] * MUL_STD_BOUNDS, 416 | self.mean_predicted_cost[:(self.num_points_show + 1)] + 417 | self.mean_predicted_cost_std[:(self.num_points_show + 1)] * MUL_STD_BOUNDS, 418 | facecolor='orange', alpha=ALPHA_CONFIDENCE_BOUNDS, 419 | label='mean predicted ' + str(MUL_STD_BOUNDS) + ' std cost bounds') 420 | 421 | self.fig.canvas.draw() 422 | plt.pause(0.01) 423 | self.num_points_show += 1 424 | -------------------------------------------------------------------------------- /rl_gp_mpc/visu_objects/static_2d_graph.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | 7 | 8 | from rl_gp_mpc.control_objects.controllers.iteration_info_class import IterationInformation 9 | 10 | ALPHA_CONFIDENCE_BOUNDS = 0.4 11 | UPDATE_X_LIMIT = True 12 | MUL_STD_BOUNDS = 3 13 | 14 | 15 | def save_plot_2d(states, actions, costs, model_iter_infos: "list[IterationInformation]", folder_save, 16 | use_constraints=False, state_min_constraint=None, state_max_constraint=None, 17 | iter_ahead_show=3): 18 | fig, axes = plt.subplots(nrows=3, figsize=(6, 5), sharex=True) 19 | axes[0].set_title('Normed states and predictions') 20 | axes[1].set_title('Normed actions') 21 | axes[2].set_title('Cost and horizon cost') 22 | plt.xlabel("time") 23 | max_state_plot = np.max([states.max(), 1.03]) 24 | min_state_plot = np.min([states.min(), -0.03]) 25 | axes[0].set_ylim(min_state_plot, max_state_plot) 26 | axes[1].set_ylim(0, 1.02) 27 | plt.tight_layout() 28 | fig, axes = draw_plots_2d(states=states, actions=actions, costs=costs, 29 | model_iter_infos=model_iter_infos, 30 
| use_constraints=use_constraints, 31 | state_min_constraint=state_min_constraint, 32 | state_max_constraint=state_max_constraint, 33 | iter_ahead_show=iter_ahead_show, fig=fig, axes=axes) 34 | axes[0].legend() 35 | axes[0].grid() 36 | axes[1].legend() 37 | axes[1].grid() 38 | axes[2].legend() 39 | axes[2].grid() 40 | fig.savefig(os.path.join(folder_save, f'history_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")}.png')) 41 | plt.close() 42 | 43 | 44 | def draw_plots_2d(states, actions, costs, model_iter_infos: "list[IterationInformation]"=None, 45 | use_constraints=False, state_min_constraint=None, state_max_constraint=None, 46 | iter_ahead_show=15, fig=None, axes=None, 47 | ): 48 | obs_idxs = np.arange(0, len(states)) 49 | colors = plt.get_cmap('tab20').colors 50 | if model_iter_infos is not None: 51 | pred_idxs = np.array([info.iteration for info in model_iter_infos]) 52 | pred_mean_costs = np.array([info.mean_predicted_cost for info in model_iter_infos]) 53 | pred_mean_cost_std = np.array([info.mean_predicted_cost_std for info in model_iter_infos]) 54 | 55 | states_pred_ahead_idxs = np.array([info.predicted_idxs[iter_ahead_show] for info in model_iter_infos]) 56 | states_pred_ahead = np.array([info.predicted_states[iter_ahead_show] for info in model_iter_infos]) 57 | states_pred_std_ahead = np.array([info.predicted_states_std[iter_ahead_show] for info in model_iter_infos]) 58 | 59 | for state_idx in range(len(states[0])): 60 | axes[0].plot(obs_idxs, np.array(states)[:, state_idx], label='state' + str(state_idx), 61 | color=colors[2 * state_idx]) 62 | if model_iter_infos is not None: 63 | axes[0].plot(states_pred_ahead_idxs, states_pred_ahead[:, state_idx], 64 | label=str(iter_ahead_show) + 'step_' + 'prediction' + str(state_idx), 65 | color=colors[2 * state_idx], linestyle='--') 66 | 67 | axes[0].fill_between(states_pred_ahead_idxs, 68 | states_pred_ahead[:, state_idx] - MUL_STD_BOUNDS * states_pred_std_ahead[:, state_idx], 69 | states_pred_ahead[:, state_idx] + MUL_STD_BOUNDS * states_pred_std_ahead[:, state_idx], 70 | color=colors[2 * state_idx], alpha=ALPHA_CONFIDENCE_BOUNDS) 71 | 72 | if use_constraints and (state_min_constraint is not None) and (state_max_constraint is not None): 73 | axes[0].axhline(y=state_min_constraint[state_idx], 74 | color=colors[2 * state_idx], linestyle='-.') 75 | axes[0].axhline(y=state_max_constraint[state_idx], 76 | color=colors[2 * state_idx], linestyle='-.') 77 | 78 | axes[2].plot(obs_idxs, costs, label='cost', color='k') 79 | 80 | axes[2].plot(pred_idxs, pred_mean_costs, label='mean predicted cost', color='orange') 81 | axes[2].fill_between(pred_idxs, 82 | pred_mean_costs - MUL_STD_BOUNDS * pred_mean_cost_std, 83 | pred_mean_costs + MUL_STD_BOUNDS * pred_mean_cost_std, 84 | color='orange', alpha=ALPHA_CONFIDENCE_BOUNDS) 85 | 86 | for idx_action in range(len(actions[0])): 87 | axes[1].step(obs_idxs, np.array(actions)[:, idx_action], label='a_' + str(idx_action), 88 | color=colors[1 + 2 * idx_action]) 89 | 90 | if UPDATE_X_LIMIT is True: 91 | axes[2].set_xlim(0, np.max(obs_idxs)) 92 | 93 | axes[2].set_ylim(0, np.max([np.max(pred_mean_costs), np.max(costs)]) * 1.2) 94 | plt.xlabel('Env steps') 95 | return fig, axes -------------------------------------------------------------------------------- /rl_gp_mpc/visu_objects/static_3d_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | import datetime 3 | 4 | import torch 5 | import gpytorch 6 | import matplotlib.pyplot as plt 7 | from 
sklearn.pipeline import Pipeline 8 | from sklearn.preprocessing import StandardScaler 9 | from sklearn.neighbors import KNeighborsRegressor 10 | import numpy as np 11 | 12 | from rl_gp_mpc.control_objects.models.gp_model import create_models, SavedState 13 | from rl_gp_mpc.control_objects.memories.gp_memory import Memory 14 | 15 | PROP_EXTEND_DOMAIN = 1 16 | N_TICKS = 75 17 | TOTAL_COL_MAX= 3 18 | FONTSIZE = 6 19 | 20 | def save_plot_model_3d(saved_state: SavedState, folder_save, memory:Memory, plot_points_memory=True): 21 | models = create_models(saved_state.parameters, saved_state.constraints_hyperparams, saved_state.inputs, saved_state.states_change) 22 | inputs_total, targets_total = memory.get_memory_total() 23 | active_memory_mask = memory.get_mask_model_inputs() 24 | inputs_model = inputs_total[active_memory_mask] 25 | targets_model = targets_total[active_memory_mask] 26 | with torch.no_grad(), gpytorch.settings.fast_pred_var(): 27 | torch.set_num_threads(1) 28 | num_input_model = len(inputs_model[0]) 29 | num_models = len(targets_model[0]) 30 | for idx_model in range(len(models)): 31 | models[idx_model].eval() 32 | targets_model = targets_model.numpy() 33 | 34 | num_figures = int(num_input_model / TOTAL_COL_MAX + 0.5) 35 | figs = [] 36 | axes_s = [] 37 | for index_figure in range(num_figures): 38 | fig = plt.figure(figsize=(15, 6)) 39 | axes = [] 40 | columns_active = np.min([num_models - (TOTAL_COL_MAX * index_figure), TOTAL_COL_MAX]) 41 | for idx_ax in range(columns_active * 2): 42 | axes.append(fig.add_subplot(2, columns_active, idx_ax + 1, projection='3d')) 43 | axes_s.append(axes) 44 | figs.append(fig) 45 | for idx_subplot in range(columns_active): 46 | idx_obs_repr = index_figure * columns_active + idx_subplot 47 | if idx_obs_repr >= num_models: 48 | break 49 | feat_importance = (1 / models[idx_obs_repr].covar_module.base_kernel.lengthscale).numpy() 50 | feat_importance = feat_importance / np.sum(feat_importance) 51 | best_features = np.argsort(-feat_importance)[0, :2] 52 | estimator_other_columns = Pipeline( 53 | steps=[('standardscaler', StandardScaler()), 54 | ('features', KNeighborsRegressor(n_neighbors=3, weights='distance'))]) 55 | 56 | estimator_other_columns.fit(inputs_total[:, best_features].numpy(), 57 | np.delete(inputs_total.numpy(), best_features, axis=1)) 58 | 59 | domain_extension_x = (PROP_EXTEND_DOMAIN - 1) * ( 60 | inputs_total[:, best_features[0]].max() - inputs_total[:, best_features[0]].min()) 61 | domain_extension_y = (PROP_EXTEND_DOMAIN - 1) * ( 62 | inputs_total[:, best_features[1]].max() - inputs_total[:, best_features[1]].min()) 63 | x_grid = np.linspace(inputs_total[:, best_features[0]].min() - domain_extension_x / 2, 64 | inputs_total[:, best_features[0]].max() + domain_extension_x / 2, N_TICKS) 65 | y_grid = np.linspace(inputs_total[:, best_features[1]].min() - domain_extension_y / 2, 66 | inputs_total[:, best_features[1]].max() + domain_extension_y / 2, N_TICKS) 67 | 68 | x_grid, y_grid = np.meshgrid(x_grid, y_grid) 69 | 70 | x_grid_1d = np.expand_dims(x_grid.flatten(), axis=-1) 71 | y_grid_1d = np.expand_dims(y_grid.flatten(), axis=-1) 72 | xy_grid = np.concatenate((x_grid_1d, y_grid_1d), axis=1) 73 | pred_other_columns = estimator_other_columns.predict(xy_grid) 74 | all_col = np.zeros((xy_grid.shape[0], num_input_model)) 75 | all_col[:, best_features] = xy_grid 76 | all_col[:, np.delete(np.arange(num_input_model), best_features)] = pred_other_columns 77 | gauss_pred = models[idx_obs_repr].likelihood(models[idx_obs_repr](torch.from_numpy( 78 | 
all_col.astype(np.float64)))) 79 | z_grid_1d_mean = gauss_pred.mean.numpy() 80 | z_grid_1d_std = np.sqrt(gauss_pred.stddev.numpy()) 81 | 82 | z_grid_mean = z_grid_1d_mean.reshape(x_grid.shape) 83 | z_grid_std = z_grid_1d_std.reshape(x_grid.shape) 84 | surf1 = axes[idx_subplot].contour3D(x_grid, y_grid, z_grid_mean, N_TICKS, cmap='cool') 85 | axes[idx_subplot].set_xlabel('Input ' + str(best_features[0]), fontsize=FONTSIZE, rotation=150) 86 | axes[idx_subplot].set_ylabel('Input ' + str(best_features[1]), fontsize=FONTSIZE, rotation=150) 87 | axes[idx_subplot].set_zlabel('Variation state ' + str(idx_obs_repr), fontsize=FONTSIZE, 88 | rotation=60) 89 | 90 | surf2 = axes[idx_subplot + columns_active].contour3D(x_grid, y_grid, z_grid_std, N_TICKS, cmap='cool') 91 | axes[idx_subplot + columns_active].set_xlabel('Input ' + str(best_features[0]), fontsize=FONTSIZE, rotation=150) 92 | axes[idx_subplot + columns_active].set_ylabel('Input ' + str(best_features[1]), fontsize=FONTSIZE, rotation=150) 93 | axes[idx_subplot + columns_active].set_zlabel('Uncertainty: std state ' + str(idx_obs_repr), 94 | fontsize=FONTSIZE, rotation=60) 95 | 96 | if plot_points_memory: 97 | axes[idx_subplot].scatter( 98 | inputs_total[~active_memory_mask, best_features[0]], 99 | inputs_total[~active_memory_mask, best_features[1]], 100 | targets_total[~active_memory_mask, idx_obs_repr], 101 | marker='x', c='k') 102 | 103 | axes[idx_subplot].scatter( 104 | inputs_model[:, best_features[0]], 105 | inputs_model[:, best_features[1]], 106 | targets_model[:, idx_obs_repr], 107 | marker='x', c='g') 108 | 109 | axes[idx_subplot].quiver(inputs_total[:-1, best_features[0]], inputs_total[:-1, best_features[1]], 110 | targets_total[:-1, idx_obs_repr], 111 | inputs_total[1:, best_features[0]] - inputs_total[:-1, best_features[0]], 112 | inputs_total[1:, best_features[1]] - inputs_total[:-1, best_features[1]], 113 | targets_total[1:, idx_obs_repr] - targets_total[:-1, idx_obs_repr], 114 | color='k', linestyle="solid", alpha=0.3, arrow_length_ratio=0.001, length=0.9) 115 | 116 | preds_total = models[idx_obs_repr].likelihood(models[idx_obs_repr](inputs_total)) 117 | errors_total = np.abs(preds_total.mean.numpy() - targets_total[:, idx_obs_repr].numpy()) 118 | axes[idx_subplot + columns_active].scatter( 119 | inputs_total[~active_memory_mask, best_features[0]], 120 | inputs_total[~active_memory_mask, best_features[1]], 121 | errors_total[~active_memory_mask], marker='x', c='k') 122 | axes[idx_subplot + columns_active].scatter( 123 | inputs_total[active_memory_mask, best_features[0]], 124 | inputs_total[active_memory_mask, best_features[1]], 125 | errors_total[active_memory_mask], marker='x', c='g') 126 | axes[idx_subplot + columns_active].quiver( 127 | inputs_total[:-1, best_features[0]], inputs_total[:-1, best_features[1]], 128 | errors_total[:-1], 129 | inputs_total[1:, best_features[0]] - inputs_total[:-1, best_features[0]], 130 | inputs_total[1:, best_features[1]] - inputs_total[:-1, best_features[1]], 131 | errors_total[1:] - errors_total[:-1], 132 | color='k', linestyle="solid", alpha=0.3, arrow_length_ratio=0.001, length=0.9) 133 | 134 | plt.tight_layout() 135 | fig.savefig(os.path.join(folder_save, f'model_3d_{datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")}.png')) 136 | plt.close() -------------------------------------------------------------------------------- /rl_gp_mpc/visu_objects/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import os 3 | 4 | 5 | def 
/rl_gp_mpc/visu_objects/utils.py:
--------------------------------------------------------------------------------
import datetime
import os


def get_env_name(env):
    try:
        env_name = env.env.spec.entry_point.replace('-', '_').replace(':', '_').replace('.', '_')
    except AttributeError:
        # Environments without a gym spec (e.g. a custom env) are expected to expose a name attribute.
        env_name = env.name
    return env_name


def create_folder_save(env_name):
    datetime_now = datetime.datetime.now()

    folder_save = os.path.join('folder_save', env_name, datetime_now.strftime("%Y_%m_%d_%H_%M_%S"))
    if not os.path.exists(folder_save):
        os.makedirs(folder_save)
    return folder_save
--------------------------------------------------------------------------------
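The two helpers above fix where every run artifact is written: a filesystem-safe environment name derived from the gym entry point, and a timestamped folder nested under `folder_save/`. A small sketch of the resulting layout; the entry-point string and printed paths are illustrative only.

```python
import datetime
import os

# Illustrative entry point, standing in for env.env.spec.entry_point on a gym env.
entry_point = "gym.envs.classic_control:PendulumEnv"
env_name = entry_point.replace('-', '_').replace(':', '_').replace('.', '_')
print(env_name)  # gym_envs_classic_control_PendulumEnv

# create_folder_save nests a timestamped run directory under folder_save/<env_name>/.
run_dir = os.path.join('folder_save', env_name,
                       datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
print(run_dir)  # e.g. folder_save/gym_envs_classic_control_PendulumEnv/2023_05_13_13_23_24
```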
/rl_gp_mpc/visu_objects/visu_object.py:
--------------------------------------------------------------------------------
import os
import multiprocessing
import copy
import time

import numpy as np
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from gym.core import Env

from rl_gp_mpc.config_classes.visu_config import VisuConfig
from rl_gp_mpc.config_classes.total_config import Config
from rl_gp_mpc.control_objects.controllers.iteration_info_class import IterationInformation
from rl_gp_mpc.control_objects.controllers.gp_mpc_controller import GpMpcController

from .utils import get_env_name, create_folder_save
from .static_3d_graph import save_plot_model_3d
from .static_2d_graph import save_plot_2d
from .dynamic_2d_graph import LivePlotParallel


class ControlVisualizations:
    def __init__(self, env: Env, num_steps: int, control_config: Config, visu_config: VisuConfig):
        self.control_config = control_config
        self.visu_config = visu_config

        self.states = []
        self.actions = []
        self.rewards = []
        self.model_iter_infos = []

        self.rec = None
        self.use_thread = False

        self.env_str = get_env_name(env)
        self.folder_save = create_folder_save(self.env_str)

        self.obs_min = env.observation_space.low
        self.obs_max = env.observation_space.high
        self.action_min = env.action_space.low
        self.action_max = env.action_space.high

        self.init = True

        if self.visu_config.render_live_plot_2d:
            self.live_plot_obj = LivePlotParallel(num_steps,
                                                  use_constraints=bool(self.control_config.reward.use_constraints),
                                                  dim_states=len(self.obs_min),
                                                  dim_actions=len(self.action_min),
                                                  state_min=self.control_config.reward.state_min,
                                                  state_max=self.control_config.reward.state_max,
                                                  use_thread=self.use_thread,
                                                  path_save=os.path.join(self.folder_save, 'control_animation.mp4'),
                                                  save=self.visu_config.save_live_plot_2d)

        if self.visu_config.save_render_env:
            self.rec = VideoRecorder(env, path=os.path.join(self.folder_save, 'gym_animation.mp4'))

        self.processes_running = True

    def update(self, obs: np.ndarray, action: np.ndarray, reward: float, env: Env, iter_info: IterationInformation = None):
        # Observations and actions are min-max normalized to [0, 1] with the env space bounds
        # before being stored and plotted.
        state = (obs - self.obs_min) / (self.obs_max - self.obs_min)
        action_norm = (action - self.action_min) / (self.action_max - self.action_min)
        self.states.append(state)
        self.actions.append(action_norm)
        self.rewards.append(reward)

        # iter_info defaults to None in the signature but is expected at every call,
        # since to_arrays() is invoked on the copy unconditionally.
        iter_info_arrays = copy.deepcopy(iter_info)
        iter_info_arrays.to_arrays()
        self.model_iter_infos.append(iter_info_arrays)

        if self.visu_config.render_live_plot_2d:
            self.live_plot_obj.update(state=state, action=action_norm, cost=-reward, iter_info=iter_info_arrays)
        self.env_render_step(env)
        if self.init:
            time.sleep(2)
            self.init = False

    def env_render_step(self, env):
        if self.visu_config.render_env:
            env.render()
        if self.visu_config.save_render_env:
            self.rec.capture_frame()

    def save_plot_model_3d(self, controller: GpMpcController):
        save_plot_model_3d(controller.transition_model.save_state(), self.folder_save, controller.memory)

    def save_plot_2d(self):
        save_plot_2d(np.array(self.states), np.array(self.actions), -np.array(self.rewards),
                     self.model_iter_infos, self.folder_save,
                     self.control_config.reward.use_constraints,
                     self.control_config.reward.state_min,
                     self.control_config.reward.state_max)

    def save(self, controller: GpMpcController):
        self.save_plot_2d()
        self.save_plot_model_3d(controller)

    def close(self):
        if self.visu_config.save_render_env:
            self.rec.close()
        self.close_running_processes()

    def close_running_processes(self):
        if 'live_plot_obj' in self.__dict__ and self.live_plot_obj.graph_p.is_alive():
            self.live_plot_obj.close()

        self.processes_running = False

    def __exit__(self, exc_type=None, exc_value=None, traceback=None):
        # Context-manager exit hook; defaults keep direct calls without arguments working.
        if self.processes_running:
            self.close()

    def get_costs(self):
        return -np.array(self.rewards)
--------------------------------------------------------------------------------
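`ControlVisualizations` implements `__exit__` to release the video recorder and the live-plot process, but it does not define `__enter__`, so it cannot yet be used directly in a `with` statement. Below is a minimal sketch of the full context-manager protocol it gestures at, using a hypothetical class that is not part of this repo.

```python
class ManagedRun:
    """Hypothetical example of the context-manager protocol; not part of rl_gp_mpc."""

    def __enter__(self):
        print("start live plot process / video recorder")
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Runs even if the with-body raised; returning False propagates the exception.
        print("close recorder and background processes")
        return False


with ManagedRun() as run:
    print("run the control loop")
```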