├── DQN ├── DQN-CartPoleStab │ ├── DQN.py │ ├── README.md │ ├── config.yml │ ├── storage │ │ ├── README.md │ │ ├── config-1.yml │ │ ├── config-2.yml │ │ ├── config-3.yml │ │ ├── config-4.yml │ │ ├── exp_4.ckpt │ │ ├── loss-1.png │ │ ├── loss-2.png │ │ ├── loss-3.png │ │ ├── loss-4.png │ │ ├── loss-5.png │ │ ├── loss-6.png │ │ ├── reward-1.png │ │ ├── reward-2.png │ │ ├── reward-3.png │ │ ├── reward-4.png │ │ ├── reward-5.png │ │ └── reward-6.png │ ├── test.py │ ├── test_rr.py │ ├── train.py │ └── utils.py ├── DQN-Double │ ├── DQN.py │ ├── README.md │ ├── config.yml │ ├── storage │ │ ├── README.md │ │ ├── config-0.yml │ │ ├── config-1.yml │ │ ├── config-10.yml │ │ ├── config-11.yml │ │ ├── config-12.yml │ │ ├── config-13.yml │ │ ├── config-2.yml │ │ ├── config-3.yml │ │ ├── config-4.yml │ │ ├── config-5.yml │ │ ├── config-6.yml │ │ ├── config-7.yml │ │ ├── config-8.yml │ │ ├── config-9.yml │ │ ├── exp_4.ckpt │ │ ├── loss-0.png │ │ ├── loss-1.png │ │ ├── loss-10.png │ │ ├── loss-11.png │ │ ├── loss-12.png │ │ ├── loss-13.png │ │ ├── loss-2.png │ │ ├── loss-3.png │ │ ├── loss-4.png │ │ ├── loss-5.png │ │ ├── loss-6.png │ │ ├── loss-7.png │ │ ├── loss-8.png │ │ ├── loss-9.png │ │ ├── reward-0.png │ │ ├── reward-1.png │ │ ├── reward-10.png │ │ ├── reward-11.png │ │ ├── reward-12.png │ │ ├── reward-13.png │ │ ├── reward-2.png │ │ ├── reward-3.png │ │ ├── reward-4.png │ │ ├── reward-5.png │ │ ├── reward-6.png │ │ ├── reward-7.png │ │ ├── reward-8.png │ │ └── reward-9.png │ ├── test.py │ ├── train.py │ └── utils.py ├── DQN-Qube │ ├── DQN.py │ ├── README.md │ ├── config.yml │ ├── storage │ │ ├── .~lock.Parameters.ods# │ │ ├── Parameters.ods │ │ ├── README.md │ │ ├── config-1.yml │ │ ├── config-2.yml │ │ ├── config-3.yml │ │ ├── config-5.yml │ │ ├── config-6.yml │ │ ├── config-7.yml │ │ ├── config-8.yml │ │ ├── config-9.yml │ │ ├── data_real_world.pkl │ │ ├── exp_6.ckpt │ │ ├── loss-1.png │ │ ├── loss-2.png │ │ ├── loss-3.png │ │ ├── loss-4.png │ │ ├── loss-5.png │ │ ├── loss-6.png │ │ ├── loss-7.png │ │ ├── loss-8.png │ │ ├── loss-9.png │ │ ├── reward-1.png │ │ ├── reward-2.png │ │ ├── reward-3.png │ │ ├── reward-4.png │ │ ├── reward-5.png │ │ ├── reward-6-real-world.png │ │ ├── reward-6.png │ │ ├── reward-7.png │ │ ├── reward-8.png │ │ ├── reward-9.png │ │ ├── simulatedModelOnRealPlatform-2.png │ │ ├── simulatedModelOnRealPlatform-3.png │ │ ├── simulatedModelOnRealPlatform-4.png │ │ └── simulatedModelOnRealPlatform.png │ ├── test.py │ ├── test_on_real_platform.py │ ├── train.py │ └── utils.py ├── DQN-Swing │ ├── DQN.py │ ├── README.md │ ├── config.yml │ ├── storage │ │ ├── README.md │ │ ├── config-0.yml │ │ ├── config-1.yml │ │ ├── config-2.yml │ │ ├── config-3.yml │ │ ├── exp_0.ckpt │ │ ├── exp_1_best.ckpt │ │ ├── loss-0.png │ │ ├── loss-1-find-best.png │ │ ├── loss-1.png │ │ ├── loss-2.png │ │ ├── loss-3.png │ │ ├── reward-0.png │ │ ├── reward-1-find-best.png │ │ ├── reward-1.png │ │ ├── reward-2.png │ │ └── reward-3.png │ ├── test.py │ ├── test_rr.py │ ├── train.py │ └── utils.py └── README.md ├── LICENSE ├── MPC ├── MPC-CartPoleStab │ ├── .idea │ │ ├── MPC qube.iml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── workspace.xml │ ├── .ipynb_checkpoints │ │ └── example-checkpoint.ipynb │ ├── Hive │ │ ├── Hive.py │ │ ├── README.md │ │ ├── SelectionMethods.py │ │ ├── Utilities.py │ │ └── __init__.py │ ├── README.md │ ├── config.yml │ ├── controller.py │ ├── dynamics.py │ ├── example.ipynb │ ├── run.py │ ├── storage │ │ ├── config-1.yml │ │ ├── config-2.yml │ │ ├── exp_1.ckpt │ │ ├── loss-1.png │ │ ├── 
loss-2.png │ │ ├── model_error_exp_1.png │ │ ├── reward-1.png │ │ └── reward-2.png │ └── utils.py ├── MPC-CartPoleSwing │ ├── Hive │ │ ├── Hive.py │ │ ├── README.md │ │ ├── SelectionMethods.py │ │ ├── Utilities.py │ │ └── __init__.py │ ├── README.md │ ├── config.yml │ ├── controller.py │ ├── dynamics.py │ ├── run.py │ ├── storage │ │ ├── config-1.yml │ │ ├── config-2.yml │ │ ├── config-3.yml │ │ ├── config-4.yml │ │ ├── config-5.yml │ │ ├── loss-1.png │ │ ├── loss-2.png │ │ ├── model_error_exp_1.png │ │ ├── model_error_exp_2.png │ │ ├── reward-1.png │ │ └── reward-2.png │ └── utils.py ├── MPC-Double │ ├── Hive │ │ ├── Hive.py │ │ ├── README.md │ │ ├── SelectionMethods.py │ │ ├── Utilities.py │ │ └── __init__.py │ ├── README.md │ ├── config.yml │ ├── controller.py │ ├── dynamics.py │ ├── run.py │ ├── storage │ │ ├── config-1.yml │ │ ├── config-2.yml │ │ ├── config-3.yml │ │ ├── config-4.yml │ │ ├── loss-1.png │ │ ├── loss-2.png │ │ ├── loss-3.png │ │ ├── loss-4.png │ │ ├── model_error_exp_1.png │ │ ├── reward-1.png │ │ ├── reward-2.png │ │ ├── reward-3.png │ │ └── reward-4.png │ └── utils.py ├── MPC-Qube │ ├── Hive │ │ ├── Hive.py │ │ ├── README.md │ │ ├── SelectionMethods.py │ │ ├── Utilities.py │ │ └── __init__.py │ ├── README.md │ ├── config.yml │ ├── controller.py │ ├── dynamics.py │ ├── run.py │ ├── storage │ │ ├── Angle Error h_0 100.png │ │ ├── State Error h_0 100.png │ │ ├── config-1.yml │ │ ├── config-2.yml │ │ ├── config-4.yml │ │ ├── config-5.yml │ │ ├── config-6.yml │ │ ├── config-7.yml │ │ ├── config_3.yml │ │ ├── loss-1.png │ │ ├── loss-2.png │ │ ├── loss-3.png │ │ ├── loss-4.png │ │ ├── loss-5.png │ │ ├── loss-6.png │ │ ├── loss-7.png │ │ ├── mpc.png │ │ ├── reward-1.png │ │ ├── reward-2.png │ │ ├── reward-3.png │ │ ├── reward-4.png │ │ ├── reward-5.png │ │ ├── reward-6.png │ │ └── reward-7.png │ ├── test.py │ └── utils.py └── README.md ├── README.md ├── Resources ├── DQN │ ├── Playing Atari with Deep Reinforcement Learning.pdf │ ├── Q-Learning in Continuous State Action Spaces.pdf │ └── README.md ├── MPC │ ├── Approximate Dynamic Programming with Gaussian Processes.pdf │ ├── Constrained model predictive control: Stability and optimality.pdf │ ├── Neural Network Dynamics for Model based Deep Rl with Model free fine tuning.pdf │ └── README.md ├── README.md └── figures │ ├── README.md │ ├── qube-after-fine-tuning.gif │ ├── qube-before-fine-tuning.gif │ ├── qube.gif │ ├── stabe.gif │ ├── swing.gif │ └── swing_interesting.gif └── environment.yaml /DQN/DQN-CartPoleStab/README.md: -------------------------------------------------------------------------------- 1 | # DQN - CartPoleStab 2 | 3 | This folder contains the implementation of DQN algorithm and the evaluation on the CartPoleStab environment 4 | 5 | All the hyper-parameters and experiment setting are stored in the ```config.yml``` file 6 | 7 | All the results (figure and model) will be stored in the ```./storage``` folder by default 8 | 9 | If you are not familiar with this environment, you can use the `analyze_env()` function in the `utils.py` to help you quickly understand the environment's state space, action space, reward range, etc. 
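If you want to see what such an inspection typically reports before opening `utils.py`, here is a minimal sketch written against the same Gym-style interface the test scripts in this folder use; `analyze_env_sketch` is a hypothetical name, and the real `analyze_env()` in `utils.py` may print different details.

```python
# Minimal sketch of an environment inspection, assuming gym and quanser_robots are installed.
# The real analyze_env() in utils.py may differ; this only probes the spaces and rewards.
import gym
from quanser_robots.common import GentlyTerminating

def analyze_env_sketch(env_id="CartpoleStabShort-v0", n_steps=100):
    env = GentlyTerminating(gym.make(env_id))
    print("observation space:", env.observation_space)
    print("action space:     ", env.action_space)
    print("reward range:     ", env.reward_range)
    state = env.reset()
    rewards = []
    for _ in range(n_steps):
        action = env.action_space.sample()      # random action, just to probe the dynamics
        state, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
            state = env.reset()
    print("reward over %d random steps: min=%.3f, max=%.3f" % (n_steps, min(rewards), max(rewards)))
    env.close()

if __name__ == "__main__":
    analyze_env_sketch()
```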
10 | 11 | ## How to run 12 | 13 | ### Test the pre-trained model 14 | 15 | To try our pre-trained model, simply run 16 | 17 | ```bash 18 | python test.py 19 | ``` 20 | 21 | The script will load the model from the path specified in the ```config.yml``` file. 22 | 23 | ### Train your own model 24 | 25 | To train your own model, you can change the hyper-parameters in the ```config.yml``` to whatever you want, 26 | and then run 27 | 28 | ```bash 29 | python train.py 30 | ``` 31 | 32 | The script will load the configuration from the ```config.yml``` file and begin training. 33 | 34 | ### Configuration parameter explanation 35 | 36 | In the ```config.yml``` file, there are two sets of configuration. 37 | The first, `model_config`, is the configuration of the neural network architecture; 38 | the second, `training_config`, is the configuration of the training process. 39 | 40 | The `exp_number` parameter in `training_config` is the number of your experiment. The names of the figures saved in the `./storage` folder are determined by this parameter. 41 | 42 | If you want to train your model from scratch, set the `load_model` parameter to `False`. If set to `True`, the trainer will load the model from `model_path`. 43 | 44 | If your training process is not stable and you want to save the model whenever it reaches its best performance, set the `save_best` parameter to `True`. 45 | 46 | -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/config.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set to true, you must specify the model path; otherwise a new model is trained 3 | model_path: "storage/exp_4.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # number of hidden layers 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: False 8 | 9 | training_config: 10 | render: True # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.0001 13 | batch_size: 32 14 | gamma: 0.98 15 | n_update_target: 4 # how many episodes between target network updates 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 2000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/test.ckpt" # the path to save the model 20 | use_fix_epsilon: True # set true to use a fixed epsilon, otherwise the epsilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.5 # epsilon decay start 23 | epsilon_final: 0.2 # epsilon decay end 24 | epsilon_decay: 200 # bandwidth 25 | exp_number: 7 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/README.md: -------------------------------------------------------------------------------- 1 | # DQN - Experiment Results 2 | 3 | This folder contains the experiment results on the CartPoleStab environment. 4 | 5 | The number in each file name represents the experiment number. 6 | 7 | For example, `config-1.yml` contains the configuration parameters of the first experiment.
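The epsilon-related keys in the configuration above (`use_fix_epsilon`, `fix_epsilon`, `epsilon_start`, `epsilon_final`, `epsilon_decay`) describe either a constant exploration rate or a decaying one. As a rough illustration only, the sketch below shows one common exponential schedule consistent with the meaning of those keys; the exact formula implemented in `DQN.py` may differ.

```python
# Illustrative epsilon schedule for the config keys use_fix_epsilon, fix_epsilon,
# epsilon_start, epsilon_final and epsilon_decay ("bandwidth").
# This is an assumption about the schedule shape, not a copy of the code in DQN.py.
import math

def epsilon_by_episode(episode, cfg):
    if cfg["use_fix_epsilon"]:
        return cfg["fix_epsilon"]
    start, final, decay = cfg["epsilon_start"], cfg["epsilon_final"], cfg["epsilon_decay"]
    # exponential decay from `start` towards `final`; a larger `decay` means slower decay
    return final + (start - final) * math.exp(-episode / decay)

cfg = {"use_fix_epsilon": False, "fix_epsilon": 0.3,
       "epsilon_start": 0.9, "epsilon_final": 0.2, "epsilon_decay": 300}
print([round(epsilon_by_episode(e, cfg), 3) for e in (0, 100, 300, 1000)])
```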
8 | 9 | We only store the best model, the configuration files and the figures here to save space, 10 | for complete results, please see [https://github.com/liuzuxin/RL_Project_Results](https://github.com/liuzuxin/RL_Project_Results) -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/config-1.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1.ckpt" # the path to load the model 4 | n_actions: 7 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.98 15 | n_update_target: 2 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 4000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_1.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.2 # episilon decay end 24 | epsilon_decay: 300 # bandwidth 25 | exp_number: 1 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/config-2.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_2.ckpt" # the path to load the model 4 | n_actions: 5 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 64 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.98 15 | n_update_target: 2 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 4000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_2.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.3 # episilon decay end 24 | epsilon_decay: 500 # bandwidth 25 | exp_number: 2 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/config-3.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_3.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.995 15 | n_update_target: 2 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 4000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_3.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.3 # episilon decay end 24 | epsilon_decay: 500 # bandwidth 25 | exp_number: 3 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/config-4.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_4.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.0001 13 | batch_size: 32 14 | gamma: 0.98 15 | n_update_target: 2 # how many episode to update the target network 16 | memory_size: 50000 # replay memory buffer size 17 | max_episode_step: 4000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_4.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.5 # episilon decay start 23 | epsilon_final: 0.2 # episilon decay end 24 | epsilon_decay: 200 # bandwidth 25 | exp_number: 4 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/exp_4.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/exp_4.ckpt -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/loss-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/loss-1.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/loss-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/loss-2.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/loss-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/loss-3.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/loss-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/loss-4.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/loss-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/loss-5.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/loss-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/loss-6.png -------------------------------------------------------------------------------- 
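Several of the configuration files above set `memory_size`, the capacity of the DQN replay buffer that the test scripts push transitions into via `policy.replay_buffer.push(...)`. A minimal sketch of such a buffer is shown below for orientation; the actual class in `DQN.py` may be implemented differently.

```python
# Minimal replay buffer sketch: fixed capacity, uniform random sampling.
# This mirrors the role of memory_size in the configs; the real class in DQN.py may differ.
import random
from collections import deque

class ReplayBufferSketch:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)   # oldest transitions are dropped automatically

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)
```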
/DQN/DQN-CartPoleStab/storage/reward-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/reward-1.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/reward-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/reward-2.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/reward-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/reward-3.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/reward-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/reward-4.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/reward-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/reward-5.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/storage/reward-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-CartPoleStab/storage/reward-6.png -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from DQN import * 4 | import gym 5 | from quanser_robots.common import GentlyTerminating 6 | import time 7 | 8 | 9 | def test(): 10 | config_path = "config.yml" 11 | print_config(config_path) 12 | config = load_config(config_path) 13 | training_config = config["training_config"] 14 | config["model_config"]["load_model"] = True 15 | 16 | env_id = "CartpoleStabShort-v0" 17 | env = GentlyTerminating(gym.make(env_id)) 18 | 19 | n_episodes = 10 20 | max_episode_step = 10000 21 | print("*********************************************") 22 | print("Testing the model for 10 episodes with 10000 maximum steps per episode") 23 | print("*********************************************") 24 | 25 | policy = Policy(env,config) 26 | 27 | losses = [] 28 | all_rewards = [] 29 | avg_rewards = [] 30 | epsilons = [] 31 | for i_episode in range(n_episodes): 32 | episode_reward = 0 33 | state = env.reset() 34 | epsilon = 0 35 | epsilons.append(epsilon) 36 | for step in range(max_episode_step): 37 | env.render() 38 | time.sleep(0.003) 39 | action = policy.act(state, epsilon) 40 | 41 | f_action = 
12*(action-(policy.n_actions-1)/2)/((policy.n_actions-1)/2) 42 | next_state, reward, done, _ = env.step(f_action) 43 | 44 | policy.replay_buffer.push(state, action[0], reward, next_state, done) 45 | 46 | state = next_state 47 | episode_reward += reward 48 | 49 | if done: 50 | break 51 | print(" episode: %s, episode reward: %s" % (i_episode, episode_reward)) 52 | all_rewards.append(episode_reward) 53 | avg_rewards.append(np.mean(all_rewards[-3:])) 54 | 55 | env.close() 56 | plot_fig(n_episodes, all_rewards,avg_rewards, losses) 57 | 58 | if __name__ =="__main__": 59 | test() 60 | -------------------------------------------------------------------------------- /DQN/DQN-CartPoleStab/test_rr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from DQN import * 4 | import argparse 5 | 6 | 7 | use_plot = True 8 | render = True 9 | 10 | window = 500 11 | collect_fr = 10 12 | plot_fr = 10 13 | render_fr = 10 14 | 15 | if use_plot: 16 | plt.ion() 17 | plot = PlotSignal(window=window) 18 | 19 | # Initialize Controller & Environment: 20 | env, ctrl = get_env_and_controller(long_pendulum=False, simulation=True, swinging=False, mouse_control=False) 21 | 22 | 23 | config_path = "config.yml" 24 | print_config(config_path) 25 | config = load_config(config_path) 26 | training_config = config["training_config"] 27 | config["model_config"]["load_model"] = True 28 | 29 | n_episodes = 10 30 | max_episode_step = 100000 31 | print("*********************************************") 32 | print("Testing the model on real platform for 10 episodes with 100000 maximum steps per episode") 33 | print("*********************************************") 34 | 35 | policy = Policy(env,config) 36 | losses = [] 37 | all_rewards = [] 38 | avg_rewards = [] 39 | epsilons = [] 40 | 41 | 42 | for i in range(n_episodes): 43 | print("\n\n###############################") 44 | print("Episode {0}".format(0)) 45 | 46 | # Reset the environment: 47 | env.reset() 48 | obs, reward, done, _ = env.step(np.zeros(1)) 49 | # Start the Control Loop: 50 | print("\nStart Controller:\t\t\t", end="") 51 | for n in range(max_episode_step): 52 | action = policy.act(obs, 0) 53 | f_action = 12 * (action - (policy.n_actions - 1) / 2) / ((policy.n_actions - 1) / 2) 54 | obs, reward, done, _ = env.step(f_action) 55 | all_rewards.append(reward) 56 | if done: 57 | print("Physical Limits or End of Time reached") 58 | break 59 | 60 | if render and np.mod(n, render_fr) == 0: 61 | env.render() 62 | 63 | if use_plot and np.mod(n, collect_fr) == 0: 64 | alpha, theta = get_angles(obs[1], obs[2]) 65 | plot.update(theta=theta, alpha=alpha, theta_dt=obs[4], volt=f_action, u=0, x=obs[0]) 66 | env.render() 67 | 68 | if use_plot and np.mod(n, plot_fr) == 0: 69 | plot.plot_signal() 70 | 71 | # Stop the cart: 72 | env.step(np.zeros(1)) 73 | 74 | print("avg reward: ",np.mean(all_rewards)) 75 | print("rewards: ", all_rewards) 76 | env.close() 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /DQN/DQN-Double/README.md: -------------------------------------------------------------------------------- 1 | # DQN - DoublePendulum 2 | 3 | This folder contains the implementation of DQN algorithm and the evaluation on the DoublePendulum environment 4 | 5 | All the hyper-parameters and experiment setting are stored in the ```config.yml``` file 6 | 7 | All the results (figure and model) will be stored in the ```./storage``` folder by default 8 | 9 | If you are not familiar with 
this environment, you can use the `analyze_env()` function in the `utils.py` to help you quickly understand the environment's state space, action space, reward range, etc. 10 | 11 | ## How to run 12 | 13 | ### Test the pre-trained model 14 | 15 | To try our pre-trained model, simply run 16 | 17 | ```bash 18 | python test.py 19 | ``` 20 | 21 | The script will load the model from the path specified in the ```config.yml``` file. 22 | 23 | ### Train your own model 24 | 25 | To train your own model, you can change the hyper-parameters in the ```config.yml``` to whatever you want, 26 | and then run 27 | 28 | ```bash 29 | python train.py 30 | ``` 31 | 32 | The script will load the configuration from the ```config.yml``` file and begin training. 33 | 34 | ### Configuration parameter explanation 35 | 36 | In the ```config.yml``` file, there are two sets of configuration. 37 | The first, `model_config`, is the configuration of the neural network architecture; 38 | the second, `training_config`, is the configuration of the training process. 39 | 40 | The `exp_number` parameter in `training_config` is the number of your experiment. The names of the figures saved in the `./storage` folder are determined by this parameter. 41 | 42 | If you want to train your model from scratch, set the `load_model` parameter to `False`. If set to `True`, the trainer will load the model from `model_path`. 43 | 44 | If your training process is not stable and you want to save the model whenever it reaches its best performance, set the `save_best` parameter to `True`. 45 | 46 | -------------------------------------------------------------------------------- /DQN/DQN-Double/config.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set to true, you must specify the model path; otherwise a new model is trained 3 | model_path: "storage/exp_4.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # number of hidden layers 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.99 # discount factor 15 | n_update_target: 8 # how many episodes between target network updates 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/test.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the epsilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # epsilon decay start 23 | epsilon_final: 0.1 # epsilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 14 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/README.md: -------------------------------------------------------------------------------- 1 | # DQN - Experiment Results 2 | 3 | This folder contains the experiment results on the DoublePendulum environment. 4 | 5 | The number in each file name represents the experiment number. 6 | 7 | For example, `config-1.yml` contains the configuration parameters of the first experiment.
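Since `n_actions` in `model_config` above discretizes a continuous command, the test scripts map the chosen action index back to a symmetric continuous value. The sketch below restates that mapping in isolation; the magnitude `max_cmd` is an assumption here (the test scripts in this repository use, for example, 6 for DoublePendulum and 12 for CartPoleStab).

```python
# Map a discrete action index (0 .. n_actions-1) to a symmetric continuous command,
# following the formula used in the test scripts:
#   f_action = max_cmd * (action - (n_actions - 1) / 2) / ((n_actions - 1) / 2)
def index_to_command(action_index, n_actions, max_cmd):
    half = (n_actions - 1) / 2
    return max_cmd * (action_index - half) / half

if __name__ == "__main__":
    n_actions = 9
    # evenly spaced commands from -6.0 to +6.0, with 0.0 at the middle index
    print([index_to_command(a, n_actions, 6.0) for a in range(n_actions)])
```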
8 | 9 | We only store the best model, the configuration files and the figures here to save space, 10 | for complete results, please see [https://github.com/liuzuxin/RL_Project_Results](https://github.com/liuzuxin/RL_Project_Results) -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-0.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_0.ckpt" # the path to load the model 4 | n_actions: 7 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 64 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 15000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.995 15 | n_update_target: 10 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_0.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 2000 # bandwidth 25 | exp_number: 0 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-1.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.99 15 | n_update_target: 8 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_10.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 1 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-10.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_10.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.9 15 | n_update_target: 8 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_10.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.2 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 10 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-11.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_11.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.9 15 | n_update_target: 8 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_11.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.2 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 11 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-12.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_12.ckpt" # the path to load the model 4 | n_actions: 15 5 | n_hidden: 3 # hidden layer number 6 | size_hidden: 64 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 32 14 | gamma: 0.95 15 | n_update_target: 8 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_12.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 12 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-13.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_13.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 3 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.0001 13 | batch_size: 32 14 | gamma: 0.95 15 | n_update_target: 8 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_13.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 2000 # bandwidth 25 | exp_number: 13 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-2.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_2.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.99 15 | n_update_target: 8 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_2.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 2000 # bandwidth 25 | exp_number: 2 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-3.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_3.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.99 15 | n_update_target: 8 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_3.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 3000 # bandwidth 25 | exp_number: 3 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-4.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_4.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.0001 13 | batch_size: 64 14 | gamma: 0.95 15 | n_update_target: 5 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_4.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 5 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-5.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_5.ckpt" # the path to load the model 4 | n_actions: 17 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.0001 13 | batch_size: 64 14 | gamma: 0.95 15 | n_update_target: 5 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_5.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 2000 # bandwidth 25 | exp_number: 5 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-6.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_6.ckpt" # the path to load the model 4 | n_actions: 13 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.0001 13 | batch_size: 64 14 | gamma: 0.95 15 | n_update_target: 5 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_5.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 2000 # bandwidth 25 | exp_number: 5 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-7.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_7.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.0001 13 | batch_size: 64 14 | gamma: 0.9 15 | n_update_target: 5 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_7.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.5 # episilon decay start 23 | epsilon_final: 0.2 # episilon decay end 24 | epsilon_decay: 5000 # bandwidth 25 | exp_number: 7 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-8.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_8.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.01 13 | batch_size: 64 14 | gamma: 0.9 15 | n_update_target: 5 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_8.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 5000 # bandwidth 25 | exp_number: 8 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/config-9.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_9.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.0001 13 | batch_size: 64 14 | gamma: 0.9 15 | n_update_target: 5 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 3000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_9.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.5 # episilon decay start 23 | epsilon_final: 0.2 # episilon decay end 24 | epsilon_decay: 3000 # bandwidth 25 | exp_number: 9 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/exp_4.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/exp_4.ckpt -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-0.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-1.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-10.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-11.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-12.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-13.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-2.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-3.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-4.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-5.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-6.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-7.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-8.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/loss-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/loss-9.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-0.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-1.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-10.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-11.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-12.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-13.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-2.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-3.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-4.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-5.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-6.png 
-------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-7.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-8.png -------------------------------------------------------------------------------- /DQN/DQN-Double/storage/reward-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Double/storage/reward-9.png -------------------------------------------------------------------------------- /DQN/DQN-Double/test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from DQN import * 4 | import gym 5 | from quanser_robots.common import GentlyTerminating 6 | import time 7 | 8 | def test(): 9 | config_path = "config.yml" 10 | print_config(config_path) 11 | config = load_config(config_path) 12 | training_config = config["training_config"] 13 | config["model_config"]["load_model"] = True 14 | 15 | env_id = "DoublePendulum-v0" 16 | env = GentlyTerminating(gym.make(env_id)) 17 | 18 | n_episodes = 10 19 | max_episode_step = 10000 20 | print("*********************************************") 21 | print("Testing the model for 10 episodes with 10000 maximum steps per episode") 22 | print("*********************************************") 23 | 24 | policy = Policy(env,config) 25 | 26 | losses = [] 27 | all_rewards = [] 28 | avg_rewards = [] 29 | epsilons = [] 30 | for i_episode in range(n_episodes): 31 | episode_reward = 0 32 | state = env.reset() 33 | state[4]/=10 34 | epsilon = 0 35 | epsilons.append(epsilon) 36 | for step in range(max_episode_step): 37 | env.render() 38 | time.sleep(0.01) 39 | action = policy.act(state, epsilon) 40 | 41 | f_action = 6*(action-(policy.n_actions-1)/2)/((policy.n_actions-1)/2) 42 | next_state, reward, done, _ = env.step(f_action) 43 | reward = 10*reward 44 | next_state[4]/=10 45 | 46 | policy.replay_buffer.push(state, action[0], reward, next_state, done) 47 | 48 | state = next_state 49 | episode_reward += reward 50 | 51 | if done: 52 | break 53 | print(" episode: %s, episode reward: %s" % (i_episode, episode_reward)) 54 | all_rewards.append(episode_reward) 55 | avg_rewards.append(np.mean(all_rewards[-3:])) 56 | 57 | env.close() 58 | plot_fig(n_episodes, all_rewards,avg_rewards, losses) 59 | 60 | if __name__ =="__main__": 61 | test() 62 | 63 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/README.md: -------------------------------------------------------------------------------- 1 | # DQN - Qube 2 | 3 | This folder contains the implementation of DQN algorithm and the evaluation on the Qube environment 4 | 5 | All the hyper-parameters and experiment setting are stored in the ```config.yml``` file 6 | 7 | All the results (figure and model) will be stored in the ```./storage``` 
folder by default 8 | 9 | If you are not familiar with this environment, you can use the `analyze_env()` function in the `utils.py` to help you quickly understand the environment's state space, action space, reward range, etc. 10 | 11 | ## How to run 12 | 13 | ### Test the pre-trained 14 | 15 | To try our pre-trained model, simply run 16 | 17 | ```angularjs 18 | python test.py 19 | ``` 20 | 21 | The script will load the model from the path specified in the ```config.yml``` file 22 | 23 | ### Train your own model 24 | 25 | To train your own model, you can change the hyper-parameters in the ```config.yml``` to whatever you want, 26 | and then run 27 | 28 | ```angularjs 29 | python train.py 30 | ``` 31 | 32 | The script will load the configurations in the ```config.yml``` file and begin to train 33 | 34 | ### Configuration parameter explanation 35 | 36 | In the ```config.yml``` file, there are two set of configuration. 37 | The first `model_config` is the configuration of the neural network architecture; 38 | The second `training_config` is the configuration for the training process. 39 | 40 | The `exp_number` parameter in the `training_config` is the number of your experiment. The name of saved figure results in the `./storage` folder will be determined by this parameter. 41 | 42 | If you want to train your model from scratch, then set the `load_model` parameter to `False`. If set to `True`, the trainer will load the model from `model_path`. 43 | 44 | If you think your training process is not stable and you want to save the model when the model has the best performance, set the `save_best` parameter to `True`. 45 | 46 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/config.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: True # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_6.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 25000 # how many episodes to train 12 | learning_rate: 0.0001 13 | batch_size: 64 14 | gamma: 0.99 15 | n_update_target: 6 # how many episodes to update the target network 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 500 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/test.ckpt" # the path to save the model 20 | use_fix_epsilon: True # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 11 # experiment number 26 | save_best: False 27 | save_thres: 510 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/.~lock.Parameters.ods#: -------------------------------------------------------------------------------- 1 | ,lambert,lambert-Alienware-15-R3,09.03.2019 21:02,file:///home/lambert/.config/libreoffice/4; -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/Parameters.ods: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/Parameters.ods -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/README.md: -------------------------------------------------------------------------------- 1 | # DQN - Experiment Results 2 | 3 | This folder contains the experiment results on the Qube environment 4 | 5 | The number in each file name represents the experiment number. 6 | 7 | For example, `config-1.yml` represents the configuration parameters in the first experiment. 8 | 9 | We only store the best model, the configuration files and the figures here to save space, 10 | for complete results, please see [https://github.com/liuzuxin/RL_Project_Results](https://github.com/liuzuxin/RL_Project_Results) -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/config-1.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/1000_5000_0.001.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 5000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 50 14 | gamma: 0.99 15 | n_update_target: 6 # how many episodes to update the target network 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 500 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/1000_5000_0.001.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 1 # experiment number 26 | save_best: False 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/config-2.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/1000_5000_0.0001_2.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 5000 # how many episodes to train 12 | learning_rate: 0.00001 13 | batch_size: 50 14 | gamma: 0.99 15 | n_update_target: 6 # how many episodes to update the target network 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 500 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/1000_5000_0.0001_2.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 2 # experiment number 26 | save_best: False 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/config-3.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/1000_5000_0.00001_3.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 5000 # how many episodes to train 12 | learning_rate: 0.000001 13 | batch_size: 50 14 | gamma: 0.99 15 | n_update_target: 6 # how many episodes to update the target network 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 500 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/1000_5000_0.00001_3.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 3 # experiment number 26 | save_best: False 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/config-5.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/5000_5000_0.001_5.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 5000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 50 14 | gamma: 0.99 15 | n_update_target: 6 # how many episodes to update the target network 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 500 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/5000_5000_0.001_5.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 5000 # bandwidth 25 | exp_number: 5 # experiment number 26 | save_best: False 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/config-6.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_6.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 5000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 30 14 | gamma: 0.99 15 | n_update_target: 6 # how many episodes to update the target network 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 500 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_6.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 6 # experiment number 26 | save_best: False 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/config-7.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/1000_5000_0.001_7.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 5000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 70 14 | gamma: 0.99 15 | n_update_target: 6 # how many episodes to update the target network 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 500 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/1000_5000_0.001_7.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 7 # experiment number 26 | save_best: False 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/config-8.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/1000_5000_0.001_8.ckpt" # the path to load the model 4 | n_actions: 27 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 5000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 30 14 | gamma: 0.99 15 | n_update_target: 6 # how many episodes to update the target network 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 500 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/1000_20000_0.001_8.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 8 # experiment number 26 | save_best: False 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/config-9.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/1000_20000_0.001_9.ckpt" # the path to load the model 4 | n_actions: 45 5 | n_hidden: 1 # hidden layer number 6 | size_hidden: 256 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 20000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 30 14 | gamma: 0.99 15 | n_update_target: 6 # how many episodes to update the target network 16 | memory_size: 100000 # replay memory buffer size 17 | max_episode_step: 500 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/1000_20000_0.001_9.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.1 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 9 # experiment number 26 | save_best: False 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/data_real_world.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/data_real_world.pkl -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/exp_6.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/exp_6.ckpt -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/loss-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/loss-1.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/loss-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/loss-2.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/loss-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/loss-3.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/loss-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/loss-4.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/loss-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/loss-5.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/loss-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/loss-6.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/loss-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/loss-7.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/loss-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/loss-8.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/loss-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/loss-9.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/reward-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-1.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/reward-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-2.png -------------------------------------------------------------------------------- 
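The experiment configurations listed above (`config-1.yml` … `config-9.yml`) mainly sweep the learning rate, batch size, number of discrete actions, and the exploration schedule defined by `epsilon_start`, `epsilon_final`, and `epsilon_decay` (the "bandwidth"). The actual schedule is implemented in `DQN.py`, which is not reproduced in this listing; the sketch below is only an assumed reconstruction of the common exponential form that these three parameters suggest, so treat the helper name `epsilon_by_step` and the exact formula as illustrative rather than the repository's definitive code.

```python
import math

def epsilon_by_step(step, epsilon_start=0.9, epsilon_final=0.05, epsilon_decay=1000):
    """Assumed exponential epsilon schedule: starts at epsilon_start and decays
    towards epsilon_final with time constant epsilon_decay (measured in steps)."""
    return epsilon_final + (epsilon_start - epsilon_final) * math.exp(-step / epsilon_decay)

# With use_fix_epsilon: True the schedule would be bypassed and fix_epsilon used instead.
print(epsilon_by_step(0))     # ~0.90 at the first step
print(epsilon_by_step(1000))  # ~0.36 after one decay constant
print(epsilon_by_step(5000))  # ~0.056, close to epsilon_final
```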
/DQN/DQN-Qube/storage/reward-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-3.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/reward-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-4.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/reward-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-5.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/reward-6-real-world.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-6-real-world.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/reward-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-6.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/reward-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-7.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/reward-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-8.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/reward-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/reward-9.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/simulatedModelOnRealPlatform-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/simulatedModelOnRealPlatform-2.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/simulatedModelOnRealPlatform-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/simulatedModelOnRealPlatform-3.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/simulatedModelOnRealPlatform-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/simulatedModelOnRealPlatform-4.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/storage/simulatedModelOnRealPlatform.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Qube/storage/simulatedModelOnRealPlatform.png -------------------------------------------------------------------------------- /DQN/DQN-Qube/test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from DQN import * 4 | import gym 5 | from quanser_robots.common import GentlyTerminating 6 | 7 | def test(): 8 | config_path = "config.yml" 9 | print_config(config_path) 10 | config = load_config(config_path) 11 | training_config = config["training_config"] 12 | config["model_config"]["load_model"] = True 13 | 14 | env_id ="Qube-v0" 15 | env = GentlyTerminating(gym.make(env_id)) 16 | 17 | n_episodes = 10 18 | max_episode_step = 10000 19 | print("*********************************************") 20 | print("Testing the model for 10 episodes with 10000 maximum steps per episode") 21 | print("*********************************************") 22 | 23 | policy = Policy(env,config) 24 | 25 | losses = [] 26 | all_rewards = [] 27 | avg_rewards = [] 28 | epsilons = [] 29 | for i_episode in range(n_episodes): 30 | episode_reward = 0 31 | state = env.reset() 32 | state[4:6]/=20 33 | epsilon = 0 34 | epsilons.append(epsilon) 35 | for step in range(max_episode_step): 36 | env.render() 37 | action = policy.act(state, epsilon) 38 | f_action = 5*(action-(policy.n_actions-1)/2)/((policy.n_actions-1)/2) 39 | next_state, reward, done, _ = env.step(f_action) 40 | reward = 100*reward 41 | next_state[4:6]/=20 42 | policy.replay_buffer.push(state, action[0], reward, next_state, done) 43 | state = next_state 44 | episode_reward += reward 45 | if done: 46 | break 47 | all_rewards.append(episode_reward) 48 | avg_rewards.append(np.mean(all_rewards[-3:])) 49 | plot_fig(n_episodes, all_rewards,avg_rewards, losses) 50 | env.close() 51 | 52 | if __name__ =="__main__": 53 | test() 54 | -------------------------------------------------------------------------------- /DQN/DQN-Qube/test_on_real_platform.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from DQN import * 4 | import argparse 5 | from quanser_robots import GentlyTerminating 6 | 7 | plt.style.use('seaborn') 8 | env = GentlyTerminating(gym.make('QubeRR-v0')) 9 | 10 | config_path = "config.yml" 11 | print_config(config_path) 12 | config = load_config(config_path) 13 | training_config = config["training_config"] 14 | config["model_config"]["load_model"] = True 15 | 16 | n_episodes = 10 17 | max_episode_step = 10000 18 | print("*********************************************") 19 | print("Testing the model 
for 10 episodes with 10000 maximum steps per episode") 20 | print("*********************************************") 21 | 22 | policy = Policy(env,config) 23 | 24 | losses = [] 25 | all_rewards = [] 26 | avg_rewards = [] 27 | epsilons = [] 28 | 29 | s_all = [] 30 | a_all = [] 31 | 32 | for i in range(n_episodes): 33 | print("Testing episodes %s" %i) 34 | obs_old = env.reset() 35 | obs_old[4:6] /= 20 36 | done = False 37 | while not done: 38 | env.render() 39 | action = policy.act(obs_old, 0.0) 40 | f_action = 5 * (action - (policy.n_actions - 1) / 2) / ((policy.n_actions - 1) / 2) 41 | obs_new, reward, done, info = env.step(f_action) 42 | reward = 100*reward 43 | all_rewards.append(reward) 44 | obs_new[4:6] /= 20 45 | obs_old = obs_new 46 | s_all.append(info['s']) 47 | a_all.append(info['a']) 48 | 49 | print("avg reward: ",np.mean(all_rewards)) 50 | print("rewards: ", all_rewards) 51 | env.close() 52 | 53 | fig, axes = plt.subplots(5, 1, figsize=(5, 8), tight_layout=True) 54 | 55 | s_all = np.stack(s_all) 56 | a_all = np.stack(a_all) 57 | 58 | n_points = s_all.shape[0] 59 | t = np.linspace(0, n_points * env.unwrapped.timing.dt_ctrl, n_points) 60 | for i in range(4): 61 | state_labels = env.unwrapped.state_space.labels[i] 62 | axes[i].plot(t, s_all.T[i], label=state_labels, c='C{}'.format(i)) 63 | axes[i].legend(loc='lower right') 64 | action_labels = env.unwrapped.action_space.labels[0] 65 | axes[4].plot(t, a_all.T[0], label=action_labels, c='C{}'.format(4)) 66 | axes[4].legend(loc='lower right') 67 | 68 | axes[0].set_ylabel('ang pos [rad]') 69 | axes[1].set_ylabel('ang pos [rad]') 70 | axes[2].set_ylabel('ang vel [rad/s]') 71 | axes[3].set_ylabel('ang vel [rad/s]') 72 | axes[4].set_ylabel('voltage [V]') 73 | axes[4].set_xlabel('time [seconds]') 74 | plt.show() 75 | 76 | 77 | -------------------------------------------------------------------------------- /DQN/DQN-Swing/README.md: -------------------------------------------------------------------------------- 1 | # DQN - CartPoleSwing 2 | 3 | This folder contains the implementation of DQN algorithm and the evaluation on the CartPoleSwing environment 4 | 5 | All the hyper-parameters and experiment setting are stored in the ```config.yml``` file 6 | 7 | All the results (figure and model) will be stored in the ```./storage``` folder by default 8 | 9 | If you are not familiar with this environment, you can use the `analyze_env()` function in the `utils.py` to help you quickly understand the environment's state space, action space, reward range, etc. 10 | 11 | ## How to run 12 | 13 | ### Test the pre-trained 14 | 15 | To try our pre-trained model, simply run 16 | 17 | ```angularjs 18 | python test.py 19 | ``` 20 | 21 | The script will load the model from the path specified in the ```config.yml``` file 22 | 23 | ### Train your own model 24 | 25 | To train your own model, you can change the hyper-parameters in the ```config.yml``` to whatever you want, 26 | and then run 27 | 28 | ```angularjs 29 | python train.py 30 | ``` 31 | 32 | The script will load the configurations in the ```config.yml``` file and begin to train 33 | 34 | ### Configuration parameter explanation 35 | 36 | In the ```config.yml``` file, there are two set of configuration. 37 | The first `model_config` is the configuration of the neural network architecture; 38 | The second `training_config` is the configuration for the training process. 39 | 40 | The `exp_number` parameter in the `training_config` is the number of your experiment. 
The name of saved figure results in the `./storage` folder will be determined by this parameter. 41 | 42 | If you want to train your model from scratch, then set the `load_model` parameter to `False`. If set to `True`, the trainer will load the model from `model_path`. 43 | 44 | If you think your training process is not stable and you want to save the model when the model has the best performance, set the `save_best` parameter to `True`. 45 | 46 | -------------------------------------------------------------------------------- /DQN/DQN-Swing/config.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1_best.ckpt" # the path to load the model 4 | n_actions: 7 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 64 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 15000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.99 15 | n_update_target: 3 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 4000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/test.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 4 # experiment number 26 | save_best: False 27 | save_thres: 5000 28 | 29 | -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/README.md: -------------------------------------------------------------------------------- 1 | # DQN - Experiment Results 2 | 3 | This folder contains the experiment results on the CartPoleSwing environment 4 | 5 | The number in each file name represents the experiment number. 6 | 7 | For example, `config-1.yml` represents the configuration parameters in the first experiment. 8 | 9 | We only store the best model, the configuration files and the figures here to save space, 10 | for complete results, please see [https://github.com/liuzuxin/RL_Project_Results](https://github.com/liuzuxin/RL_Project_Results) -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/config-0.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/epsilon_decay_4000.ckpt" # the path to load the model 4 | n_actions: 9 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 128 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 15000 # how many episodes to train 12 | learning_rate: 0.0003 13 | batch_size: 64 14 | gamma: 0.995 15 | n_update_target: 4 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 4000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/epsilon_decay_4000.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.05 # episilon decay end 24 | epsilon_decay: 4000 # bandwidth 25 | exp_number: 0 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/config-1.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1.ckpt" # the path to load the model 4 | n_actions: 7 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 64 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 15000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.995 15 | n_update_target: 3 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 4000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_1.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 1 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/config-2.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_2.ckpt" # the path to load the model 4 | n_actions: 5 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 64 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. 
test.py does not need this parameter 11 | n_episodes: 15000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.999 15 | n_update_target: 3 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 4000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_2.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 2000 # bandwidth 25 | exp_number: 2 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/config-3.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_2.ckpt" # the path to load the model 4 | n_actions: 11 5 | n_hidden: 2 # hidden layer number 6 | size_hidden: 64 # hidden layer size 7 | use_cuda: True 8 | 9 | training_config: 10 | render: False # render the environment, set false to accelerate training. test.py does not need this parameter 11 | n_episodes: 15000 # how many episodes to train 12 | learning_rate: 0.001 13 | batch_size: 64 14 | gamma: 0.999 15 | n_update_target: 3 # how many episode to update the target network 16 | memory_size: 1000000 # replay memory buffer size 17 | max_episode_step: 4000 # maximum steps per episode 18 | random_seed: 1234 # do not have to change this parameter 19 | save_model_path: "storage/exp_3.ckpt" # the path to save the model 20 | use_fix_epsilon: False # set true to use a fixed epsilon, otherwise the episilon will decay 21 | fix_epsilon: 0.3 22 | epsilon_start: 0.9 # episilon decay start 23 | epsilon_final: 0.1 # episilon decay end 24 | epsilon_decay: 1000 # bandwidth 25 | exp_number: 3 # experiment number 26 | save_best: False 27 | 28 | -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/exp_0.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/exp_0.ckpt -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/exp_1_best.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/exp_1_best.ckpt -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/loss-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/loss-0.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/loss-1-find-best.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/loss-1-find-best.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/loss-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/loss-1.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/loss-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/loss-2.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/loss-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/loss-3.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/reward-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/reward-0.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/reward-1-find-best.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/reward-1-find-best.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/reward-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/reward-1.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/reward-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/reward-2.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/storage/reward-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/DQN/DQN-Swing/storage/reward-3.png -------------------------------------------------------------------------------- /DQN/DQN-Swing/test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from DQN import * 4 | import gym 5 | from quanser_robots.common import GentlyTerminating 6 | import time 7 | 8 | def test(): 9 | config_path = "config.yml" 10 | print_config(config_path) 11 | 
config = load_config(config_path) 12 | training_config = config["training_config"] 13 | config["model_config"]["load_model"] = True 14 | 15 | env_id = "CartpoleSwingShort-v0" 16 | env = GentlyTerminating(gym.make(env_id)) 17 | 18 | n_episodes = 10 19 | max_episode_step = 10000 20 | print("*********************************************") 21 | print("Testing the model for 10 episodes with 10000 maximum steps per episode") 22 | print("*********************************************") 23 | 24 | policy = Policy(env,config) 25 | losses = [] 26 | all_rewards = [] 27 | avg_rewards = [] 28 | epsilons = [] 29 | for i_episode in range(n_episodes): 30 | episode_reward = 0 31 | state = env.reset() 32 | state[4]/=10 33 | epsilon = 0 34 | epsilons.append(epsilon) 35 | for step in range(max_episode_step): 36 | env.render() 37 | time.sleep(0.001) 38 | action = policy.act(state, epsilon) 39 | f_action = 5*(action-(policy.n_actions-1)/2)/((policy.n_actions-1)/2) 40 | next_state, reward, done, _ = env.step(f_action) 41 | next_state[4]/=10 42 | policy.replay_buffer.push(state, action[0], reward, next_state, done) 43 | state = next_state 44 | episode_reward += reward 45 | if done: 46 | break 47 | print(" episode: %s, episode reward: %s" % (i_episode, episode_reward)) 48 | all_rewards.append(episode_reward) 49 | avg_rewards.append(np.mean(all_rewards[-3:])) 50 | 51 | env.close() 52 | plot_fig(n_episodes, all_rewards,avg_rewards, losses) 53 | 54 | if __name__ =="__main__": 55 | test() 56 | -------------------------------------------------------------------------------- /DQN/DQN-Swing/test_rr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from DQN import * 4 | 5 | 6 | use_plot = True 7 | render = True 8 | 9 | window = 500 10 | collect_fr = 10 11 | plot_fr = 10 12 | render_fr = 10 13 | 14 | if use_plot: 15 | plt.ion() 16 | plot = PlotSignal(window=window) 17 | 18 | # Initialize Controller & Environment: 19 | env, ctrl = get_env_and_controller(long_pendulum=False, simulation=True, swinging=True, mouse_control=False) 20 | 21 | 22 | config_path = "config.yml" 23 | print_config(config_path) 24 | config = load_config(config_path) 25 | training_config = config["training_config"] 26 | config["model_config"]["load_model"] = True 27 | 28 | n_episodes = 10 29 | max_episode_step = 100000 30 | print("*********************************************") 31 | print("Testing the model on real platform for 10 episodes with 100000 maximum steps per episode") 32 | print("*********************************************") 33 | 34 | policy = Policy(env,config) 35 | losses = [] 36 | all_rewards = [] 37 | avg_rewards = [] 38 | epsilons = [] 39 | 40 | 41 | for i in range(n_episodes): 42 | print("\n\n###############################") 43 | print("Episode {0}".format(0)) 44 | 45 | # Reset the environment: 46 | env.reset() 47 | obs, reward, done, _ = env.step(np.zeros(1)) 48 | # Start the Control Loop: 49 | print("\nStart Controller:\t\t\t", end="") 50 | for n in range(max_episode_step): 51 | obs[4] /= 10 52 | action = policy.act(obs, 0) 53 | f_action = 5 * (action - (policy.n_actions - 1) / 2) / ((policy.n_actions - 1) / 2) 54 | obs, reward, done, _ = env.step(f_action) 55 | all_rewards.append(reward) 56 | if done: 57 | print("Physical Limits or End of Time reached") 58 | break 59 | 60 | if render and np.mod(n, render_fr) == 0: 61 | env.render() 62 | 63 | if use_plot and np.mod(n, collect_fr) == 0: 64 | alpha, theta = get_angles(obs[1], obs[2]) 65 | plot.update(theta=theta, alpha=alpha, 
theta_dt=obs[4], volt=f_action, u=0, x=obs[0]) 66 | env.render() 67 | 68 | if use_plot and np.mod(n, plot_fr) == 0: 69 | plot.plot_signal() 70 | 71 | # Stop the cart: 72 | env.step(np.zeros(1)) 73 | 74 | print("avg reward: ",np.mean(all_rewards)) 75 | print("rewards: ", all_rewards) 76 | env.close() 77 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /DQN/README.md: -------------------------------------------------------------------------------- 1 | # DQN - Deep Q-Network 2 | 3 | This folder contains the implementation of DQN algorithm and the evaluation of it. 4 | 5 | For more details of DQN, see the paper [here](https://arxiv.org/abs/1312.5602) 6 | 7 | Choose the environment folder and follow the instructions to run everything. 8 | 9 | ## Overview of the experiment results: 10 | 11 | The best experients parameters in different environments: 12 | 13 | | Environment | Learning Rate | Epsilon Decay | Batch Size | Action Number | Gamma | average episode reward | 14 | | -------- | -----: | :----: | :----: | :----: | :----: | :----: | 15 | | Qube | 0.001 | 1000 | 50 | 9 | 0.99 | 410 | 16 | | CartPole Swingup | 0.001 | 1000 | 64 | 7 | 0.995 | 4126 | 17 | | CartPole Stab | 0.001 | 500 | 64 | 9 | 0.995 | 1535 | 18 | | Double CartPole | 0.001 | 2000 | 64 | 7 | 0.995 | 383 | 19 | 20 | 21 | 22 | 23 | ### CartpoleStabShort-v0 24 | episode_rewards: 25 | 26 | learning_rate: 3e-5 27 | 28 | networks architecture: 29 | 30 | gamma: 0.98 31 | 32 | batch size: 20 33 | 34 | weight_decay: 1e-4 35 | 36 | num_epochs: 2000 37 | 38 | ### Qube-v0: 39 | episode_rewards: 40 | 41 | learning_rate: 3e-5 42 | 43 | networks architecture: 44 | 45 | gamma: 0.98 46 | 47 | batch size: 20 48 | 49 | weight_decay: 1e-4 50 | 51 | num_epochs: 2000 52 | 53 | 54 | ### DoublePendulum-v0 55 | episode_rewards: 56 | 57 | learning_rate: 3e-5 58 | 59 | networks architecture: 60 | 61 | gamma: 0.99 62 | 63 | batch size: 20 64 | 65 | weight_decay: 1e-4 66 | 67 | num_epochs: 2000 68 | 69 | ### CartpoleSwingShort-v0 70 | learning_rate: 3e-5 71 | 72 | networks architecture: 73 | 74 | gamma: 0.98 75 | 76 | batch size: 20 77 | 78 | weight_decay: 1e-4 79 | 80 | num_epochs: 2000 81 | 82 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 liuzuxin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/.idea/MPC qube.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/Hive/SelectionMethods.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- SELECTION METHODS 4 | 5 | __all__ = ["tournament", "disruptive"] 6 | 7 | # ---- MODULE DOCSTRING 8 | 9 | __doc__ = """ 10 | 11 | (C) Hive, Romain Wuilbercq, 2017 12 | _ 13 | /_/_ .'''. 14 | =O(_)))) ...' `. 15 | \_\ `. .'''X 16 | `..' 17 | .---. .---..-./`) ,---. ,---. .-''-. 18 | | | |_ _|\ .-.')| / | | .'_ _ \ 19 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 20 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 21 | | (_,_) .---. | _( )_ || (_,_)___| 22 | | _ _--. | | | \ (_ o._) /' \ .---. 23 | |( ' ) | | | | \ (_,_) / \ `-' / 24 | (_{;}_)| | | | \ / \ / 25 | '(_,_) '---' '---' `---` `'-..-' 26 | 27 | Description: 28 | ----------- 29 | 30 | SelectionMethods.py 31 | 32 | Defines a collection of selection methods to be used with Hive. 33 | 34 | """ 35 | 36 | # ---- IMPORT MODULES 37 | 38 | import random 39 | 40 | import numpy as np 41 | 42 | # ---- SELECTION METHOD(S) 43 | 44 | def tournament(values, crowd_size=None): 45 | """ 46 | 47 | Defines a selection process whereby a number of individuals 48 | from a colony/generation are selected to compete. 49 | 50 | Individuals with greater fitness values compared to the rest 51 | have higher chance to be kept for the next cycle/generation 52 | - i.e. survival of the fittest. This method prones elitism. 53 | 54 | A solution compete with a fixed number of randomly chosen individuals 55 | (i.e. "crowd_size") from the population. 56 | 57 | This function uses the "random.sample" function from the python base 58 | "random" module and the "np.where" function from the "numpy" module. 59 | 60 | Parameters: 61 | ---------- 62 | 63 | :param int crowd_size: number of individuals competing 64 | 65 | """ 66 | 67 | # computes battle score metrics 68 | scores = [] 69 | for i in range(len(values)): 70 | 71 | # selects a pool of opponents randomly 72 | if (crowd_size != None) and (type(crowd_size) is int): 73 | opponents = random.sample(values, crowd_size) 74 | else: 75 | opponents = values 76 | 77 | # battles against opponents 78 | scores.append( sum(np.where(values[i]>opponents, 1, 0)) ) 79 | 80 | # returns an array of normalized scores 81 | return scores / sum(scores) 82 | 83 | def disruptive(values): 84 | """ 85 | 86 | Defines a selection process whereby a better chance is given to 87 | individuals with the highest and lowest fitness values - i.e. those 88 | further away from a "norm". 89 | 90 | This method represents a good mechanism by which diversity can 91 | be passed onto the next generation/cycle and avoid too-early 92 | convergence - i.e. improves the exploration of the search domain. 
93 | 94 | This function uses the "np.mean" function from the "numpy" module. 95 | 96 | """ 97 | 98 | # computes mean fitness of population 99 | mean_ = np.mean(values) 100 | 101 | # computes score metrics 102 | scores = [] 103 | for i in range(len(values)): 104 | scores.append(abs(values[i] - mean_)) 105 | 106 | # returns an array of normalized scores 107 | return scores / sum(scores) 108 | 109 | # ---- END 110 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/Hive/Utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- MODULE DOCSTRING 4 | 5 | __doc__ = """ 6 | 7 | (C) Hive, Romain Wuilbercq, 2017 8 | _ 9 | /_/_ .'''. 10 | =O(_)))) ...' `. 11 | \_\ `. .'''X 12 | `..' 13 | .---. .---..-./`) ,---. ,---. .-''-. 14 | | | |_ _|\ .-.')| / | | .'_ _ \ 15 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 16 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 17 | | (_,_) .---. | _( )_ || (_,_)___| 18 | | _ _--. | | | \ (_ o._) /' \ .---. 19 | |( ' ) | | | | \ (_,_) / \ `-' / 20 | (_{;}_)| | | | \ / \ / 21 | '(_,_) '---' '---' `---` `'-..-' 22 | 23 | Description: 24 | ----------- 25 | 26 | A series of utility functions (such as plotting function etc...). 27 | 28 | """ 29 | 30 | # ---- IMPORT MODULES 31 | 32 | try: 33 | import matplotlib.pyplot as plt 34 | from matplotlib.font_manager import FontProperties 35 | except: 36 | raise ImportError("Install 'matplotlib' to plot convergence results.") 37 | 38 | # ---- CONVERGENCE PLOT 39 | 40 | def ConvergencePlot(cost): 41 | """ 42 | 43 | Monitors convergence. 44 | 45 | Parameters: 46 | ---------- 47 | 48 | :param dict cost: mean and best cost over cycles/generations as returned 49 | by an optimiser. 50 | 51 | """ 52 | plt.rc('font',family= 'Tibetan Machine Uni') 53 | fs=15 54 | font = FontProperties(); 55 | font.set_size( fs) #'larger'); 56 | labels = ["Best Bee's Cost", "Mean Bees' Cost"] 57 | plt.figure(figsize=(7, 4.5)); 58 | plt.plot(range(len(cost["best"])), cost["best"], label=labels[0]); 59 | plt.scatter(range(len(cost["mean"])), cost["mean"], color='red', label=labels[1]); 60 | plt.xlabel("Iteration",fontsize=fs); 61 | plt.ylabel("Cost",fontsize=fs); 62 | plt.xticks(fontsize=10) 63 | plt.yticks(fontsize=10) 64 | plt.legend(loc="best", prop = font); 65 | plt.xlim([0,len(cost["mean"])]); 66 | plt.grid(); 67 | plt.show(); 68 | 69 | # ---- END 70 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/Hive/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- MODULE DOCSTRING 4 | 5 | __doc__ = """ 6 | 7 | (C) Hive, Romain Wuilbercq, 2017 8 | _ 9 | /_/_ .'''. 10 | =O(_)))) ...' `. 11 | \_\ `. .'''X 12 | `..' 13 | .---. .---..-./`) ,---. ,---. .-''-. 14 | | | |_ _|\ .-.')| / | | .'_ _ \ 15 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 16 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 17 | | (_,_) .---. | _( )_ || (_,_)___| 18 | | _ _--. | | | \ (_ o._) /' \ .---. 19 | |( ' ) | | | | \ (_,_) / \ `-' / 20 | (_{;}_)| | | | \ / \ / 21 | '(_,_) '---' '---' `---` `'-..-' 22 | 23 | **Hive** is a simple implementation of a swarm-based optimisation 24 | algorithm called the Artificial Bee Colony (ABC) algorithm. 25 | 26 | The Artificial Bee Colony (ABC) algorithm is based on the intelligent foraging 27 | behaviour of honey bee swarm, proposed by Karaboga in 2005. 
28 | 29 | """ 30 | 31 | __author__ = "Romain Wuilbercq" 32 | 33 | # ---- END 34 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/README.md: -------------------------------------------------------------------------------- 1 | # MPC - CartPoleStab 2 | This folder contains the implementation of the MPC algorithm and its evaluation on the CartPoleStab environment. 3 | 4 | The implementation mainly follows the paper [here](https://ieeexplore.ieee.org/abstract/document/8463189). 5 | 6 | To optimize the MPC controller, we use the [Artificial Bee Colony](https://en.wikipedia.org/wiki/Artificial_bee_colony_algorithm) (ABC) optimization algorithm 7 | instead of the random shooting method used in the paper. The implementation of the ABC algorithm is based on this repo: [https://github.com/rwuilbercq/Hive](https://github.com/rwuilbercq/Hive) 8 | 9 | All the hyper-parameters and experiment settings are stored in the ```config.yml``` file. 10 | 11 | All the results (figures and models) will be stored in the ```./storage``` folder by default. 12 | 13 | If you are not familiar with this environment, you can use the `analyze_env()` function in `utils.py` to quickly inspect the environment's state space, action space, reward range, etc. 14 | 15 | ### How to run 16 | 17 | To train and evaluate the controller, simply run 18 | 19 | ```bash 20 | python run.py --path config.yml 21 | ``` 22 | The script will load the configuration in the ```config.yml``` file and begin to train. 23 | 24 | Note that because the ABC optimization is time-consuming, boosting the dataset with the MPC controller can take a long time. 25 | 26 | If you want to load a saved dataset and a pre-trained dynamic model, note that you should normalize the dataset first, because the dynamic model needs the data distribution statistics. 27 | You can use the `norm_train_data()` method of the `DynamicModel` class; a short sketch is given at the end of this README. 28 | 29 | You can also see some results in the Jupyter Notebook ```example.ipynb```. 30 | 31 | ### Configuration explanation 32 | 33 | The ```config.yml``` file contains four sets of configuration. 34 | 35 | The `model_config` part contains the parameters that determine the neural network architecture and the basic environment properties. 36 | 37 | The `training_config` part contains the parameters of the training process. 38 | 39 | The `dataset_config` part contains the dataset parameters. 40 | 41 | The `mpc_config` part contains the MPC algorithm parameters. 42 | 43 | The `exp_number` parameter in `training_config` is the index of your experiment; the file names of the figures saved in the `./storage` folder are derived from it. 44 | 45 | If you want to train your model from scratch, set the `load_model` parameter to `False`. If it is set to `True`, the trainer will load the model from `model_path`.
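As a rough illustration of the normalization note above, here is a minimal, hypothetical sketch of loading a saved dataset and a pre-trained dynamic model. The constructor `DynamicModel(config)` and `load_config()` appear in `run.py`; the exact argument and return conventions of `norm_train_data()` and the layout of the pickle file are assumptions and should be checked against `dynamics.py` before use.

```python
# Hypothetical sketch -- verify signatures against dynamics.py / utils.py.
import pickle
from dynamics import DynamicModel
from utils import load_config

config = load_config("config.yml")        # set model_config.load_model: True beforehand
model = DynamicModel(config)              # restores weights from model_config.model_path

with open("storage/data_exp_1.pkl", "rb") as f:
    dataset = pickle.load(f)              # assumed layout; see dataset_config.save_path

# Normalize before training or prediction, as required by the note above.
model.norm_train_data(dataset)
```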
46 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/config.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1.ckpt" # the path to load the model 4 | n_states: 5 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 1 # hidden layer number 7 | size_hidden: 512 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 60 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 256 14 | save_model_flag: True 15 | save_model_path: "storage/exp_1.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 18 | exp_number: 1 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_1.pkl" 23 | n_max_steps: 1000 # maximum steps per episode 24 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_1.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 12 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 20 # max iterations for the ABC optimization 38 | gamma: 0.99 # reward discount coefficient 39 | action_low: -12 # lower bound of the solution space 40 | action_high: 12 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/controller.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from Hive import Hive 3 | from Hive import Utilities 4 | 5 | 6 | class MPC(object): 7 | def __init__(self, env, config): 8 | self.env = env 9 | mpc_config = config["mpc_config"] 10 | self.horizon = mpc_config["horizon"] 11 | self.numb_bees = mpc_config["numb_bees"] 12 | self.max_itrs = mpc_config["max_itrs"] 13 | self.gamma = mpc_config["gamma"] 14 | self.action_low = mpc_config["action_low"] 15 | self.action_high = mpc_config["action_high"] 16 | self.evaluator = Evaluator(self.gamma) 17 | 18 | def act(self, state, dynamic_model): 19 | ''' 20 | Optimize the action by Artificial Bee Colony algorithm 21 | :param state: (numpy array) current state 22 | :param dynamic_model: system dynamic model 23 | :return: (float) optimal action 24 | ''' 25 | self.evaluator.update(state, dynamic_model) 26 | optimizer = Hive.BeeHive( lower = [float(self.action_low)] * self.horizon, 27 | upper = [float(self.action_high)] * self.horizon, 28 | fun = self.evaluator.evaluate, 29 | numb_bees = self.numb_bees, 30 | max_itrs = self.max_itrs, 31 | verbose=False) 32 | cost = optimizer.run() 33 | #print("Solution: ",optimizer.solution[0]) 34 | #print("Fitness Value ABC: {0}".format(optimizer.best)) 35 | # Uncomment this if you want to see the performance of the optimizer 36 | 
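# (ConvergencePlot, defined in Hive/Utilities.py, plots the best and the mean bee cost
# per ABC iteration and calls plt.show(), which blocks until the figure window is closed,
# so keep it commented out during long training runs.)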
#Utilities.ConvergencePlot(cost) 37 | return optimizer.solution[0] 38 | 39 | class Evaluator(object): 40 | def __init__(self, gamma=0.8): 41 | self.gamma = gamma 42 | 43 | def update(self, state, dynamic_model): 44 | self.state = state 45 | self.dynamic_model = dynamic_model 46 | 47 | def evaluate(self, actions): 48 | actions = np.array(actions) 49 | horizon = actions.shape[0] 50 | rewards = 0 51 | state_tmp = self.state.copy() 52 | for j in range(horizon): 53 | input_data = np.concatenate( (state_tmp,[actions[j]]) ) 54 | state_dt = self.dynamic_model.predict(input_data) 55 | state_tmp = state_tmp + state_dt[0] 56 | rewards -= (self.gamma ** j) * self.get_reward(state_tmp, actions[j]) 57 | return rewards 58 | 59 | def get_reward(self,obs, action_n): 60 | ''' 61 | Overwrite this function according to different environment 62 | ''' 63 | x, sin_th, cos_th, x_dot, theta_dot = obs 64 | cos_th = min(max(cos_th, -1), 1) 65 | reward = -cos_th + 1 66 | return reward 67 | 68 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/run.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import gym 3 | import argparse 4 | from dynamics import * 5 | from controller import * 6 | from utils import * 7 | from quanser_robots.common import GentlyTerminating 8 | import time 9 | 10 | parser = argparse.ArgumentParser(description='Specify the configuraton file path') 11 | parser.add_argument('--path', required=False, type=str, default='config.yml', 12 | help='Specify the configuraton file path') 13 | 14 | 15 | args = parser.parse_args() 16 | 17 | config_path = args.path # "config.yml" 18 | config = load_config(config_path) 19 | print_config(config_path) 20 | 21 | env_id = "CartpoleStabShort-v0" 22 | env = GentlyTerminating(gym.make(env_id)) 23 | 24 | model = DynamicModel(config) 25 | 26 | data_fac = DatasetFactory(env,config) 27 | 28 | data_fac.collect_random_dataset() 29 | 30 | '''Train on the random collected dataset''' 31 | loss = model.train(data_fac.random_trainset,data_fac.random_testset) 32 | 33 | mpc = MPC(env,config) 34 | 35 | rewards_list = [] 36 | for itr in range(config["dataset_config"]["n_mpc_itrs"]): 37 | t = time.time() 38 | print("**********************************************") 39 | print("The reinforce process [%s], collecting data ..." 
% itr) 40 | rewards = data_fac.collect_mpc_dataset(mpc, model) 41 | trainset, testset = data_fac.make_dataset() 42 | rewards_list += rewards 43 | 44 | plt.close("all") 45 | plt.figure(figsize=(12, 5)) 46 | plt.title('Reward Trend with %s iteration' % itr) 47 | plt.plot(rewards_list) 48 | plt.savefig("storage/reward-" + str(model.exp_number) + ".png") 49 | print("Consume %s s in this iteration" % (time.time() - t)) 50 | loss = model.train(trainset, testset) -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/storage/config-1.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1.ckpt" # the path to load the model 4 | n_states: 5 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 1 # hidden layer number 7 | size_hidden: 512 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 60 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 256 14 | save_model_flag: True 15 | save_model_path: "storage/exp_1.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 18 | exp_number: 1 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_1.pkl" 23 | n_max_steps: 1000 # maximum steps per episode 24 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_1.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 12 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 20 # max iterations for the ABC optimization 38 | gamma: 0.99 # reward discount coefficient 39 | action_low: -12 # lower bound of the solution space 40 | action_high: 12 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/storage/config-2.yml: -------------------------------------------------------------------------------- 1 | # change the mpc horizon w.r.t. 
config 1 to compare 2 | 3 | model_config: 4 | load_model: False # If set true, you must specify the model path, otherwise train a new model 5 | model_path: "storage/exp_2.ckpt" # the path to load the model 6 | n_states: 5 # environment states 7 | n_actions: 1 # how many controls we need 8 | n_hidden: 1 # hidden layer number 9 | size_hidden: 512 # hidden layer size 10 | use_cuda: True 11 | 12 | training_config: 13 | n_epochs: 60 # how many epoches to train the dynamic model 14 | learning_rate: 0.001 15 | batch_size: 256 16 | save_model_flag: True 17 | save_model_path: "storage/exp_2.ckpt" # the path to save the model 18 | save_loss_fig: True 19 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 20 | exp_number: 2 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_2.pkl" 25 | n_max_steps: 1000 # maximum steps per episode 26 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 30 | min_train_samples: 6000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_2.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 5 # how long of the horizon to predict 38 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 20 # max iterations for the ABC optimization 40 | gamma: 0.99 # reward discount coefficient 41 | action_low: -12 # lower bound of the solution space 42 | action_high: 12 # upper bound of the solution space 43 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/storage/exp_1.ckpt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleStab/storage/exp_1.ckpt -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/storage/loss-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleStab/storage/loss-1.png -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/storage/loss-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleStab/storage/loss-2.png -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/storage/model_error_exp_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleStab/storage/model_error_exp_1.png -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/storage/reward-1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleStab/storage/reward-1.png -------------------------------------------------------------------------------- /MPC/MPC-CartPoleStab/storage/reward-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleStab/storage/reward-2.png -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/Hive/Utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- MODULE DOCSTRING 4 | 5 | __doc__ = """ 6 | 7 | (C) Hive, Romain Wuilbercq, 2017 8 | _ 9 | /_/_ .'''. 10 | =O(_)))) ...' `. 11 | \_\ `. .'''X 12 | `..' 13 | .---. .---..-./`) ,---. ,---. .-''-. 14 | | | |_ _|\ .-.')| / | | .'_ _ \ 15 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 16 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 17 | | (_,_) .---. | _( )_ || (_,_)___| 18 | | _ _--. | | | \ (_ o._) /' \ .---. 19 | |( ' ) | | | | \ (_,_) / \ `-' / 20 | (_{;}_)| | | | \ / \ / 21 | '(_,_) '---' '---' `---` `'-..-' 22 | 23 | Description: 24 | ----------- 25 | 26 | A series of utility functions (such as plotting function etc...). 27 | 28 | """ 29 | 30 | # ---- IMPORT MODULES 31 | 32 | try: 33 | import matplotlib.pyplot as plt 34 | from matplotlib.font_manager import FontProperties 35 | except: 36 | raise ImportError("Install 'matplotlib' to plot convergence results.") 37 | 38 | # ---- CONVERGENCE PLOT 39 | 40 | def ConvergencePlot(cost): 41 | """ 42 | 43 | Monitors convergence. 44 | 45 | Parameters: 46 | ---------- 47 | 48 | :param dict cost: mean and best cost over cycles/generations as returned 49 | by an optimiser. 50 | 51 | """ 52 | plt.rc('font',family= 'Tibetan Machine Uni') 53 | fs=15 54 | font = FontProperties(); 55 | font.set_size( fs) #'larger'); 56 | labels = ["Best Bee's Cost", "Mean Bees' Cost"] 57 | plt.figure(figsize=(7, 4.5)); 58 | plt.plot(range(len(cost["best"])), cost["best"], label=labels[0]); 59 | plt.scatter(range(len(cost["mean"])), cost["mean"], color='red', label=labels[1]); 60 | plt.xlabel("Iteration",fontsize=fs); 61 | plt.ylabel("Cost",fontsize=fs); 62 | plt.xticks(fontsize=10) 63 | plt.yticks(fontsize=10) 64 | plt.legend(loc="best", prop = font); 65 | plt.xlim([0,len(cost["mean"])]); 66 | plt.grid(); 67 | plt.show(); 68 | 69 | # ---- END 70 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/Hive/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- MODULE DOCSTRING 4 | 5 | __doc__ = """ 6 | 7 | (C) Hive, Romain Wuilbercq, 2017 8 | _ 9 | /_/_ .'''. 10 | =O(_)))) ...' `. 11 | \_\ `. .'''X 12 | `..' 13 | .---. .---..-./`) ,---. ,---. .-''-. 14 | | | |_ _|\ .-.')| / | | .'_ _ \ 15 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 16 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 17 | | (_,_) .---. | _( )_ || (_,_)___| 18 | | _ _--. | | | \ (_ o._) /' \ .---. 19 | |( ' ) | | | | \ (_,_) / \ `-' / 20 | (_{;}_)| | | | \ / \ / 21 | '(_,_) '---' '---' `---` `'-..-' 22 | 23 | **Hive** is a simple implementation of a swarm-based optimisation 24 | algorithm called the Artificial Bee Colony (ABC) algorithm. 
25 | 26 | The Artificial Bee Colony (ABC) algorithm is based on the intelligent foraging 27 | behaviour of honey bee swarm, proposed by Karaboga in 2005. 28 | 29 | """ 30 | 31 | __author__ = "Romain Wuilbercq" 32 | 33 | # ---- END 34 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/README.md: -------------------------------------------------------------------------------- 1 | # MPC - CartPoleSwing 2 | This folder contains the implementation of the MPC algorithm and its evaluation on the CartPoleSwing environment. 3 | 4 | The implementation mainly follows the paper [here](https://ieeexplore.ieee.org/abstract/document/8463189). 5 | 6 | To optimize the MPC controller, we use the [Artificial Bee Colony](https://en.wikipedia.org/wiki/Artificial_bee_colony_algorithm) (ABC) optimization algorithm 7 | instead of the random shooting method used in the paper. The implementation of the ABC algorithm is based on this repo: [https://github.com/rwuilbercq/Hive](https://github.com/rwuilbercq/Hive) 8 | 9 | All the hyper-parameters and experiment settings are stored in the ```config.yml``` file. 10 | 11 | All the results (figures and models) will be stored in the ```./storage``` folder by default. 12 | 13 | If you are not familiar with this environment, you can use the `analyze_env()` function in `utils.py` to quickly inspect the environment's state space, action space, reward range, etc. 14 | 15 | ### How to run 16 | 17 | To train and evaluate the controller, simply run 18 | 19 | ```bash 20 | python run.py --path config.yml 21 | ``` 22 | The script will load the configuration in the ```config.yml``` file and begin to train. 23 | 24 | Note that because the ABC optimization is time-consuming, boosting the dataset with the MPC controller can take a long time. 25 | 26 | If you want to load a saved dataset and a pre-trained dynamic model, note that you should normalize the dataset first, because the dynamic model needs the data distribution statistics. 27 | You can use the `norm_train_data()` method of the `DynamicModel` class. 28 | ### Configuration explanation 29 | 30 | The ```config.yml``` file contains four sets of configuration. 31 | 32 | The `model_config` part contains the parameters that determine the neural network architecture and the basic environment properties. 33 | 34 | The `training_config` part contains the parameters of the training process. 35 | 36 | The `dataset_config` part contains the dataset parameters. 37 | 38 | The `mpc_config` part contains the MPC algorithm parameters. 39 | 40 | The `exp_number` parameter in `training_config` is the index of your experiment; the file names of the figures saved in the `./storage` folder are derived from it. 41 | 42 | If you want to train your model from scratch, set the `load_model` parameter to `False`. If it is set to `True`, the trainer will load the model from `model_path`.
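The ```run.py``` script above covers training. As a rough, hypothetical sketch of how a trained dynamic model and the MPC controller could be rolled out in the environment afterwards (the classes and `mpc.act(state, model)` come from `dynamics.py`/`controller.py`; the action wrapping and loop details are assumptions):

```python
# Hedged evaluation sketch -- assumes model_config.load_model: True in config.yml
# so that DynamicModel restores the weights saved at model_path.
import gym
from quanser_robots.common import GentlyTerminating
from dynamics import DynamicModel
from controller import MPC
from utils import load_config

config = load_config("config.yml")
env = GentlyTerminating(gym.make("CartpoleSwingShort-v0"))
model = DynamicModel(config)            # pre-trained dynamics model (see normalization note)
mpc = MPC(env, config)

state = env.reset()
episode_reward = 0.0
for _ in range(config["dataset_config"]["n_max_steps"]):
    action = mpc.act(state, model)      # first action of the ABC-optimized sequence
    state, reward, done, _ = env.step([action])
    episode_reward += reward
    env.render()
    if done:
        break
print("episode reward:", episode_reward)
env.close()
```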
43 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/config.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1.ckpt" # the path to load the model 4 | n_states: 5 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 2 # hidden layer number 7 | size_hidden: 512 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 100 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 512 14 | save_model_flag: True 15 | save_model_path: "storage/exp_1.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 18 | exp_number: 1 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_1.pkl" 23 | n_max_steps: 2000 # maximum steps per episode 24 | n_random_episodes: 80 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 2 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_1.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 12 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 20 # max iterations for the ABC optimization 38 | gamma: 0.99 # reward discount coefficient 39 | action_low: -12 # lower bound of the solution space 40 | action_high: 12 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/controller.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from Hive import Hive 3 | from Hive import Utilities 4 | 5 | 6 | class MPC(object): 7 | def __init__(self, env, config): 8 | self.env = env 9 | mpc_config = config["mpc_config"] 10 | self.horizon = mpc_config["horizon"] 11 | self.numb_bees = mpc_config["numb_bees"] 12 | self.max_itrs = mpc_config["max_itrs"] 13 | self.gamma = mpc_config["gamma"] 14 | self.action_low = mpc_config["action_low"] 15 | self.action_high = mpc_config["action_high"] 16 | self.evaluator = Evaluator(self.gamma) 17 | 18 | def act(self, state, dynamic_model): 19 | ''' 20 | Optimize the action by Artificial Bee Colony algorithm 21 | :param state: (numpy array) current state 22 | :param dynamic_model: system dynamic model 23 | :return: (float) optimal action 24 | ''' 25 | self.evaluator.update(state, dynamic_model) 26 | optimizer = Hive.BeeHive( lower = [float(self.action_low)] * self.horizon, 27 | upper = [float(self.action_high)] * self.horizon, 28 | fun = self.evaluator.evaluate, 29 | numb_bees = self.numb_bees, 30 | max_itrs = self.max_itrs, 31 | verbose=False) 32 | cost = optimizer.run() 33 | #print("Solution: ",optimizer.solution[0]) 34 | #print("Fitness Value ABC: {0}".format(optimizer.best)) 35 | # Uncomment this if you want to see the performance of the optimizer 36 | 
#Utilities.ConvergencePlot(cost) 37 | return optimizer.solution[0] 38 | 39 | class Evaluator(object): 40 | def __init__(self, gamma=0.8): 41 | self.gamma = gamma 42 | 43 | def update(self, state, dynamic_model): 44 | self.state = state 45 | self.dynamic_model = dynamic_model 46 | 47 | def evaluate(self, actions): 48 | actions = np.array(actions) 49 | horizon = actions.shape[0] 50 | rewards = 0 51 | state_tmp = self.state.copy() 52 | for j in range(horizon): 53 | input_data = np.concatenate( (state_tmp,[actions[j]]) ) 54 | state_dt = self.dynamic_model.predict(input_data) 55 | state_tmp = state_tmp + state_dt[0] 56 | rewards -= (self.gamma ** j) * self.get_reward(state_tmp, actions[j]) 57 | return rewards 58 | 59 | # need to change this function according to different environment 60 | def get_reward(self,obs, action_n): 61 | x, sin_th, cos_th, x_dot, theta_dot = obs 62 | cos_th = min(max(cos_th, -1), 1) 63 | reward = -cos_th + 1 64 | return reward 65 | 66 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/run.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import gym 3 | import argparse 4 | from dynamics import * 5 | from controller import * 6 | from utils import * 7 | from quanser_robots.common import GentlyTerminating 8 | import time 9 | 10 | parser = argparse.ArgumentParser(description='Specify the configuraton file path') 11 | parser.add_argument('--path', required=False, type=str, default='config.yml', 12 | help='Specify the configuraton file path') 13 | 14 | 15 | args = parser.parse_args() 16 | 17 | config_path = args.path # "config.yml" 18 | config = load_config(config_path) 19 | print_config(config_path) 20 | 21 | env_id = "CartpoleSwingShort-v0" 22 | env = GentlyTerminating(gym.make(env_id)) 23 | 24 | model = DynamicModel(config) 25 | 26 | data_fac = DatasetFactory(env,config) 27 | data_fac.collect_random_dataset() 28 | 29 | loss = model.train(data_fac.random_trainset,data_fac.random_testset) 30 | model.plot_model_validation(env,n_sample=200) 31 | mpc = MPC(env,config) 32 | 33 | rewards_list = [] 34 | for itr in range(config["dataset_config"]["n_mpc_itrs"]): 35 | t = time.time() 36 | print("**********************************************") 37 | print("The reinforce process [%s], collecting data ..." 
% itr) 38 | rewards = data_fac.collect_mpc_dataset(mpc, model) 39 | trainset, testset = data_fac.make_dataset() 40 | rewards_list += rewards 41 | 42 | plt.close("all") 43 | plt.figure(figsize=(12, 5)) 44 | plt.title('Reward Trend with %s iteration' % itr) 45 | plt.plot(rewards_list) 46 | plt.savefig("storage/reward-" + str(model.exp_number) + ".png") 47 | print("Consume %s s in this iteration" % (time.time() - t)) 48 | loss = model.train(trainset, testset) 49 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/config-1.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1.ckpt" # the path to load the model 4 | n_states: 5 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 1 # hidden layer number 7 | size_hidden: 512 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 60 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 256 14 | save_model_flag: True 15 | save_model_path: "storage/exp_1.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 18 | exp_number: 1 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_1.pkl" 23 | n_max_steps: 1000 # maximum steps per episode 24 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_1.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 12 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 20 # max iterations for the ABC optimization 38 | gamma: 0.99 # reward discount coefficient 39 | action_low: -12 # lower bound of the solution space 40 | action_high: 12 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/config-2.yml: -------------------------------------------------------------------------------- 1 | # change the mpc horizon and network architecture w.r.t. 
config 1 to compare 2 | 3 | model_config: 4 | load_model: False # If set true, you must specify the model path, otherwise train a new model 5 | model_path: "storage/exp_2.ckpt" # the path to load the model 6 | n_states: 5 # environment states 7 | n_actions: 1 # how many controls we need 8 | n_hidden: 1 # hidden layer number 9 | size_hidden: 512 # hidden layer size 10 | use_cuda: True 11 | 12 | training_config: 13 | n_epochs: 60 # how many epoches to train the dynamic model 14 | learning_rate: 0.001 15 | batch_size: 256 16 | save_model_flag: True 17 | save_model_path: "storage/exp_2.ckpt" # the path to save the model 18 | save_loss_fig: True 19 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 20 | exp_number: 2 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_2.pkl" 25 | n_max_steps: 2000 # maximum steps per episode 26 | n_random_episodes: 80 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 2 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 30 | min_train_samples: 6000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_2.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 20 # how long of the horizon to predict 38 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 20 # max iterations for the ABC optimization 40 | gamma: 0.99 # reward discount coefficient 41 | action_low: -12 # lower bound of the solution space 42 | action_high: 12 # upper bound of the solution space 43 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/config-3.yml: -------------------------------------------------------------------------------- 1 | # change the n_mpc_episodes w.r.t. 
config 1 to compare 2 | 3 | model_config: 4 | load_model: False # If set true, you must specify the model path, otherwise train a new model 5 | model_path: "storage/exp_3.ckpt" # the path to load the model 6 | n_states: 5 # environment states 7 | n_actions: 1 # how many controls we need 8 | n_hidden: 1 # hidden layer number 9 | size_hidden: 512 # hidden layer size 10 | use_cuda: True 11 | 12 | training_config: 13 | n_epochs: 60 # how many epoches to train the dynamic model 14 | learning_rate: 0.001 15 | batch_size: 256 16 | save_model_flag: True 17 | save_model_path: "storage/exp_3.ckpt" # the path to save the model 18 | save_loss_fig: True 19 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 20 | exp_number: 3 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_3.pkl" 25 | n_max_steps: 2000 # maximum steps per episode 26 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 30 | min_train_samples: 6000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_3.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 12 # how long of the horizon to predict 38 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 20 # max iterations for the ABC optimization 40 | gamma: 0.99 # reward discount coefficient 41 | action_low: -12 # lower bound of the solution space 42 | action_high: 12 # upper bound of the solution space 43 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/config-4.yml: -------------------------------------------------------------------------------- 1 | # change the mpc_dataset_split w.r.t. 
config 3 to compare 2 | 3 | model_config: 4 | load_model: False # If set true, you must specify the model path, otherwise train a new model 5 | model_path: "storage/exp_4.ckpt" # the path to load the model 6 | n_states: 5 # environment states 7 | n_actions: 1 # how many controls we need 8 | n_hidden: 1 # hidden layer number 9 | size_hidden: 512 # hidden layer size 10 | use_cuda: True 11 | 12 | training_config: 13 | n_epochs: 60 # how many epoches to train the dynamic model 14 | learning_rate: 0.001 15 | batch_size: 256 16 | save_model_flag: True 17 | save_model_path: "storage/exp_4.ckpt" # the path to save the model 18 | save_loss_fig: True 19 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 20 | exp_number: 4 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_3.pkl" 25 | n_max_steps: 2000 # maximum steps per episode 26 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.8 # mpc dataset's portion in the training set 30 | min_train_samples: 6000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_4.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 12 # how long of the horizon to predict 38 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 20 # max iterations for the ABC optimization 40 | gamma: 0.99 # reward discount coefficient 41 | action_low: -12 # lower bound of the solution space 42 | action_high: 12 # upper bound of the solution space 43 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/config-5.yml: -------------------------------------------------------------------------------- 1 | # change the mpc horizon w.r.t. 
config 1 to compare 2 | 3 | model_config: 4 | load_model: False # If set true, you must specify the model path, otherwise train a new model 5 | model_path: "storage/exp_5.ckpt" # the path to load the model 6 | n_states: 5 # environment states 7 | n_actions: 1 # how many controls we need 8 | n_hidden: 1 # hidden layer number 9 | size_hidden: 512 # hidden layer size 10 | use_cuda: True 11 | 12 | training_config: 13 | n_epochs: 60 # how many epoches to train the dynamic model 14 | learning_rate: 0.001 15 | batch_size: 256 16 | save_model_flag: True 17 | save_model_path: "storage/exp_5.ckpt" # the path to save the model 18 | save_loss_fig: True 19 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 20 | exp_number: 5 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_5.pkl" 25 | n_max_steps: 2000 # maximum steps per episode 26 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 2 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 30 | min_train_samples: 6000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_5.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 25 # how long of the horizon to predict 38 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 20 # max iterations for the ABC optimization 40 | gamma: 0.99 # reward discount coefficient 41 | action_low: -12 # lower bound of the solution space 42 | action_high: 12 # upper bound of the solution space 43 | -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/loss-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleSwing/storage/loss-1.png -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/loss-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleSwing/storage/loss-2.png -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/model_error_exp_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleSwing/storage/model_error_exp_1.png -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/model_error_exp_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleSwing/storage/model_error_exp_2.png -------------------------------------------------------------------------------- 
/MPC/MPC-CartPoleSwing/storage/reward-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleSwing/storage/reward-1.png -------------------------------------------------------------------------------- /MPC/MPC-CartPoleSwing/storage/reward-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-CartPoleSwing/storage/reward-2.png -------------------------------------------------------------------------------- /MPC/MPC-Double/Hive/SelectionMethods.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- SELECTION METHODS 4 | 5 | __all__ = ["tournament", "disruptive"] 6 | 7 | # ---- MODULE DOCSTRING 8 | 9 | __doc__ = """ 10 | 11 | (C) Hive, Romain Wuilbercq, 2017 12 | _ 13 | /_/_ .'''. 14 | =O(_)))) ...' `. 15 | \_\ `. .'''X 16 | `..' 17 | .---. .---..-./`) ,---. ,---. .-''-. 18 | | | |_ _|\ .-.')| / | | .'_ _ \ 19 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 20 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 21 | | (_,_) .---. | _( )_ || (_,_)___| 22 | | _ _--. | | | \ (_ o._) /' \ .---. 23 | |( ' ) | | | | \ (_,_) / \ `-' / 24 | (_{;}_)| | | | \ / \ / 25 | '(_,_) '---' '---' `---` `'-..-' 26 | 27 | Description: 28 | ----------- 29 | 30 | SelectionMethods.py 31 | 32 | Defines a collection of selection methods to be used with Hive. 33 | 34 | """ 35 | 36 | # ---- IMPORT MODULES 37 | 38 | import random 39 | 40 | import numpy as np 41 | 42 | # ---- SELECTION METHOD(S) 43 | 44 | def tournament(values, crowd_size=None): 45 | """ 46 | 47 | Defines a selection process whereby a number of individuals 48 | from a colony/generation are selected to compete. 49 | 50 | Individuals with greater fitness values compared to the rest 51 | have higher chance to be kept for the next cycle/generation 52 | - i.e. survival of the fittest. This method prones elitism. 53 | 54 | A solution compete with a fixed number of randomly chosen individuals 55 | (i.e. "crowd_size") from the population. 56 | 57 | This function uses the "random.sample" function from the python base 58 | "random" module and the "np.where" function from the "numpy" module. 59 | 60 | Parameters: 61 | ---------- 62 | 63 | :param int crowd_size: number of individuals competing 64 | 65 | """ 66 | 67 | # computes battle score metrics 68 | scores = [] 69 | for i in range(len(values)): 70 | 71 | # selects a pool of opponents randomly 72 | if (crowd_size != None) and (type(crowd_size) is int): 73 | opponents = random.sample(values, crowd_size) 74 | else: 75 | opponents = values 76 | 77 | # battles against opponents 78 | scores.append( sum(np.where(values[i]>opponents, 1, 0)) ) 79 | 80 | # returns an array of normalized scores 81 | return scores / sum(scores) 82 | 83 | def disruptive(values): 84 | """ 85 | 86 | Defines a selection process whereby a better chance is given to 87 | individuals with the highest and lowest fitness values - i.e. those 88 | further away from a "norm". 89 | 90 | This method represents a good mechanism by which diversity can 91 | be passed onto the next generation/cycle and avoid too-early 92 | convergence - i.e. improves the exploration of the search domain. 
93 | 94 | This function uses the "np.mean" function from the "numpy" module. 95 | 96 | """ 97 | 98 | # computes mean fitness of population 99 | mean_ = np.mean(values) 100 | 101 | # computes score metrics 102 | scores = [] 103 | for i in range(len(values)): 104 | scores.append(abs(values[i] - mean_)) 105 | 106 | # returns an array of normalized scores 107 | return scores / sum(scores) 108 | 109 | # ---- END 110 | -------------------------------------------------------------------------------- /MPC/MPC-Double/Hive/Utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- MODULE DOCSTRING 4 | 5 | __doc__ = """ 6 | 7 | (C) Hive, Romain Wuilbercq, 2017 8 | _ 9 | /_/_ .'''. 10 | =O(_)))) ...' `. 11 | \_\ `. .'''X 12 | `..' 13 | .---. .---..-./`) ,---. ,---. .-''-. 14 | | | |_ _|\ .-.')| / | | .'_ _ \ 15 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 16 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 17 | | (_,_) .---. | _( )_ || (_,_)___| 18 | | _ _--. | | | \ (_ o._) /' \ .---. 19 | |( ' ) | | | | \ (_,_) / \ `-' / 20 | (_{;}_)| | | | \ / \ / 21 | '(_,_) '---' '---' `---` `'-..-' 22 | 23 | Description: 24 | ----------- 25 | 26 | A series of utility functions (such as plotting function etc...). 27 | 28 | """ 29 | 30 | # ---- IMPORT MODULES 31 | 32 | try: 33 | import matplotlib.pyplot as plt 34 | from matplotlib.font_manager import FontProperties 35 | except: 36 | raise ImportError("Install 'matplotlib' to plot convergence results.") 37 | 38 | # ---- CONVERGENCE PLOT 39 | 40 | def ConvergencePlot(cost): 41 | """ 42 | 43 | Monitors convergence. 44 | 45 | Parameters: 46 | ---------- 47 | 48 | :param dict cost: mean and best cost over cycles/generations as returned 49 | by an optimiser. 50 | 51 | """ 52 | plt.rc('font',family= 'Tibetan Machine Uni') 53 | fs=15 54 | font = FontProperties(); 55 | font.set_size( fs) #'larger'); 56 | labels = ["Best Bee's Cost", "Mean Bees' Cost"] 57 | plt.figure(figsize=(7, 4.5)); 58 | plt.plot(range(len(cost["best"])), cost["best"], label=labels[0]); 59 | plt.scatter(range(len(cost["mean"])), cost["mean"], color='red', label=labels[1]); 60 | plt.xlabel("Iteration",fontsize=fs); 61 | plt.ylabel("Cost",fontsize=fs); 62 | plt.xticks(fontsize=10) 63 | plt.yticks(fontsize=10) 64 | plt.legend(loc="best", prop = font); 65 | plt.xlim([0,len(cost["mean"])]); 66 | plt.grid(); 67 | plt.show(); 68 | 69 | # ---- END 70 | -------------------------------------------------------------------------------- /MPC/MPC-Double/Hive/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- MODULE DOCSTRING 4 | 5 | __doc__ = """ 6 | 7 | (C) Hive, Romain Wuilbercq, 2017 8 | _ 9 | /_/_ .'''. 10 | =O(_)))) ...' `. 11 | \_\ `. .'''X 12 | `..' 13 | .---. .---..-./`) ,---. ,---. .-''-. 14 | | | |_ _|\ .-.')| / | | .'_ _ \ 15 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 16 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 17 | | (_,_) .---. | _( )_ || (_,_)___| 18 | | _ _--. | | | \ (_ o._) /' \ .---. 19 | |( ' ) | | | | \ (_,_) / \ `-' / 20 | (_{;}_)| | | | \ / \ / 21 | '(_,_) '---' '---' `---` `'-..-' 22 | 23 | **Hive** is a simple implementation of a swarm-based optimisation 24 | algorithm called the Artificial Bee Colony (ABC) algorithm. 25 | 26 | The Artificial Bee Colony (ABC) algorithm is based on the intelligent foraging 27 | behaviour of honey bee swarm, proposed by Karaboga in 2005. 
28 | 29 | """ 30 | 31 | __author__ = "Romain Wuilbercq" 32 | 33 | # ---- END 34 | -------------------------------------------------------------------------------- /MPC/MPC-Double/README.md: -------------------------------------------------------------------------------- 1 | # MPC - Double 2 | This folder contains the implementation of the MPC algorithm and its evaluation on the Double CartPole (DoublePendulum-v0) environment. 3 | 4 | The implementation mainly follows the paper [here](https://ieeexplore.ieee.org/abstract/document/8463189). 5 | 6 | To optimize the MPC controller, we use the [Artificial Bee Colony](https://en.wikipedia.org/wiki/Artificial_bee_colony_algorithm) (ABC) optimization algorithm 7 | instead of the random shooting method used in the paper. The implementation of the ABC algorithm is based on this repo: [https://github.com/rwuilbercq/Hive](https://github.com/rwuilbercq/Hive) 8 | 9 | All the hyper-parameters and experiment settings are stored in the ```config.yml``` file. 10 | 11 | All the results (figures and models) will be stored in the ```./storage``` folder by default. 12 | 13 | If you are not familiar with this environment, you can use the `analyze_env()` function in `utils.py` to quickly inspect the environment's state space, action space, reward range, etc. 14 | 15 | ### How to run 16 | 17 | To train and evaluate the controller, simply run 18 | 19 | ```bash 20 | python run.py --path config.yml 21 | ``` 22 | The script will load the configuration in the ```config.yml``` file and begin to train. 23 | 24 | Note that because the ABC optimization is time-consuming, boosting the dataset with the MPC controller can take a long time. 25 | 26 | If you want to load a saved dataset and a pre-trained dynamic model, note that you should normalize the dataset first, because the dynamic model needs the data distribution statistics. 27 | You can use the `norm_train_data()` method of the `DynamicModel` class. 28 | ### Configuration explanation 29 | 30 | The ```config.yml``` file contains four sets of configuration. 31 | 32 | The `model_config` part contains the parameters that determine the neural network architecture and the basic environment properties. 33 | 34 | The `training_config` part contains the parameters of the training process. 35 | 36 | The `dataset_config` part contains the dataset parameters. 37 | 38 | The `mpc_config` part contains the MPC algorithm parameters. 39 | 40 | The `exp_number` parameter in `training_config` is the index of your experiment; the file names of the figures saved in the `./storage` folder are derived from it. 41 | 42 | If you want to train your model from scratch, set the `load_model` parameter to `False`. If it is set to `True`, the trainer will load the model from `model_path`.
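The reward that the MPC controller maximizes during its simulated rollouts is defined by `Evaluator.get_reward` in `controller.py`, and the code notes that it has to be adapted for each environment. Below is a purely illustrative sketch of swapping in a custom reward by subclassing `Evaluator`; the class names and the `get_reward(obs, action_n)` signature come from `controller.py`, while the reward expression itself is made up for illustration:

```python
# Illustrative only: subclass Evaluator to plug in a task-specific reward.
import numpy as np
from controller import MPC, Evaluator


class MyEvaluator(Evaluator):
    def get_reward(self, obs, action_n):
        # Example shaping: penalize both link angles and the cart displacement.
        x_c, th1, th2, _, _, _ = obs
        return np.float32(-(th1 ** 2 + th2 ** 2 + 0.01 * x_c ** 2))


# Usage (assuming env and config are already constructed as in run.py):
# mpc = MPC(env, config)
# mpc.evaluator = MyEvaluator(gamma=config["mpc_config"]["gamma"])
```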
43 | -------------------------------------------------------------------------------- /MPC/MPC-Double/config.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_5.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 1 # hidden layer number 7 | size_hidden: 512 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 60 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 256 14 | save_model_flag: True 15 | save_model_path: "storage/exp_5.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 18 | exp_number: 5 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_5.pkl" 23 | n_max_steps: 1000 # maximum steps per episode 24 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_5.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 25 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 20 # max iterations for the ABC optimization 38 | gamma: 0.99 # reward discount coefficient 39 | action_low: -12 # lower bound of the solution space 40 | action_high: 12 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-Double/controller.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from Hive import Hive 3 | from Hive import Utilities 4 | 5 | 6 | class MPC(object): 7 | def __init__(self, env, config): 8 | self.env = env 9 | mpc_config = config["mpc_config"] 10 | self.horizon = mpc_config["horizon"] 11 | self.numb_bees = mpc_config["numb_bees"] 12 | self.max_itrs = mpc_config["max_itrs"] 13 | self.gamma = mpc_config["gamma"] 14 | self.action_low = mpc_config["action_low"] 15 | self.action_high = mpc_config["action_high"] 16 | self.evaluator = Evaluator(self.gamma) 17 | 18 | def act(self, state, dynamic_model): 19 | ''' 20 | Optimize the action by Artificial Bee Colony algorithm 21 | :param state: (numpy array) current state 22 | :param dynamic_model: system dynamic model 23 | :return: (float) optimal action 24 | ''' 25 | self.evaluator.update(state, dynamic_model) 26 | optimizer = Hive.BeeHive( lower = [float(self.action_low)] * self.horizon, 27 | upper = [float(self.action_high)] * self.horizon, 28 | fun = self.evaluator.evaluate, 29 | numb_bees = self.numb_bees, 30 | max_itrs = self.max_itrs, 31 | verbose=False) 32 | cost = optimizer.run() 33 | #print("Solution: ",optimizer.solution[0]) 34 | #print("Fitness Value ABC: {0}".format(optimizer.best)) 35 | # Uncomment this if you want to see the performance of the optimizer 36 | 
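# (Note: the ABC optimizer searches over a whole sequence of `horizon` actions, but only the
# first element of the best sequence, optimizer.solution[0], is applied; the plan is then
# re-computed at the next time step -- standard receding-horizon MPC.)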
#Utilities.ConvergencePlot(cost) 37 | return optimizer.solution[0] 38 | 39 | class Evaluator(object): 40 | def __init__(self, gamma=0.8): 41 | self.gamma = gamma 42 | 43 | def update(self, state, dynamic_model): 44 | self.state = state 45 | self.dynamic_model = dynamic_model 46 | 47 | def evaluate(self, actions): 48 | actions = np.array(actions) 49 | horizon = actions.shape[0] 50 | rewards = 0 51 | state_tmp = self.state.copy() 52 | for j in range(horizon): 53 | input_data = np.concatenate( (state_tmp,[actions[j]]) ) 54 | state_dt = self.dynamic_model.predict(input_data) 55 | state_tmp = state_tmp + state_dt[0] 56 | rewards -= (self.gamma ** j) * self.get_reward(state_tmp, actions[j]) 57 | return rewards 58 | 59 | # need to change this function according to different environment 60 | def get_reward(self,obs, action_n): 61 | x_c, th1, th2, _, _, _ = obs 62 | rwd = -(th1 ** 2 + th2 ** 2) 63 | return np.float32(rwd) + 2 * 0.25 ** 2 64 | 65 | 66 | -------------------------------------------------------------------------------- /MPC/MPC-Double/run.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import gym 3 | import argparse 4 | from dynamics import * 5 | from controller import * 6 | from utils import * 7 | from quanser_robots.common import GentlyTerminating 8 | import time 9 | 10 | parser = argparse.ArgumentParser(description='Specify the configuraton file path') 11 | parser.add_argument('--path', required=False, type=str, default='config.yml', 12 | help='Specify the configuraton file path') 13 | 14 | 15 | args = parser.parse_args() 16 | 17 | config_path = args.path # "config.yml" 18 | config = load_config(config_path) 19 | print_config(config_path) 20 | 21 | env_id = "DoublePendulum-v0" 22 | env = GentlyTerminating(gym.make(env_id)) 23 | 24 | model = DynamicModel(config) 25 | 26 | data_fac = DatasetFactory(env,config) 27 | data_fac.collect_random_dataset() 28 | 29 | loss = model.train(data_fac.random_trainset,data_fac.random_testset) 30 | 31 | mpc = MPC(env,config) 32 | 33 | rewards_list = [] 34 | for itr in range(config["dataset_config"]["n_mpc_itrs"]): 35 | t = time.time() 36 | print("**********************************************") 37 | print("The reinforce process [%s], collecting data ..." 
% itr) 38 | rewards = data_fac.collect_mpc_dataset(mpc, model) 39 | trainset, testset = data_fac.make_dataset() 40 | rewards_list += rewards 41 | 42 | plt.close("all") 43 | plt.figure(figsize=(12, 5)) 44 | plt.title('Reward Trend with %s iteration' % itr) 45 | plt.plot(rewards_list) 46 | plt.savefig("storage/reward-" + str(model.exp_number) + ".png") 47 | print("Consume %s s in this iteration" % (time.time() - t)) 48 | loss = model.train(trainset, testset) -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/config-1.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 1 # hidden layer number 7 | size_hidden: 512 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 60 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 256 14 | save_model_flag: True 15 | save_model_path: "storage/exp_1.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 18 | exp_number: 1 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_1.pkl" 23 | n_max_steps: 20000 # maximum steps per episode 24 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_1.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 12 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 15 # max iterations for the ABC optimization 38 | gamma: 0.99 # reward discount coefficient 39 | action_low: -12 # lower bound of the solution space 40 | action_high: 12 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/config-2.yml: -------------------------------------------------------------------------------- 1 | # change the mpc horizon w.r.t. 
config 1 to compare 2 | 3 | model_config: 4 | load_model: False # If set true, you must specify the model path, otherwise train a new model 5 | model_path: "storage/exp_2.ckpt" # the path to load the model 6 | n_states: 6 # environment states 7 | n_actions: 1 # how many controls we need 8 | n_hidden: 1 # hidden layer number 9 | size_hidden: 512 # hidden layer size 10 | use_cuda: True 11 | 12 | training_config: 13 | n_epochs: 60 # how many epoches to train the dynamic model 14 | learning_rate: 0.001 15 | batch_size: 256 16 | save_model_flag: True 17 | save_model_path: "storage/exp_2.ckpt" # the path to save the model 18 | save_loss_fig: True 19 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 20 | exp_number: 2 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_2.pkl" 25 | n_max_steps: 1000 # maximum steps per episode 26 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 30 | min_train_samples: 6000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_2.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 5 # how long of the horizon to predict 38 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 20 # max iterations for the ABC optimization 40 | gamma: 0.99 # reward discount coefficient 41 | action_low: -12 # lower bound of the solution space 42 | action_high: 12 # upper bound of the solution space 43 | -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/config-3.yml: -------------------------------------------------------------------------------- 1 | # change the n_mpc_episodes w.r.t. 
config 1 to compare 2 | 3 | model_config: 4 | load_model: False # If set true, you must specify the model path, otherwise train a new model 5 | model_path: "storage/exp_3.ckpt" # the path to load the model 6 | n_states: 6 # environment states 7 | n_actions: 1 # how many controls we need 8 | n_hidden: 1 # hidden layer number 9 | size_hidden: 512 # hidden layer size 10 | use_cuda: True 11 | 12 | training_config: 13 | n_epochs: 60 # how many epoches to train the dynamic model 14 | learning_rate: 0.001 15 | batch_size: 256 16 | save_model_flag: True 17 | save_model_path: "storage/exp_3.ckpt" # the path to save the model 18 | save_loss_fig: True 19 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 20 | exp_number: 3 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_3.pkl" 25 | n_max_steps: 1000 # maximum steps per episode 26 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 8 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 30 | min_train_samples: 6000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_3.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 12 # how long of the horizon to predict 38 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 20 # max iterations for the ABC optimization 40 | gamma: 0.99 # reward discount coefficient 41 | action_low: -12 # lower bound of the solution space 42 | action_high: 12 # upper bound of the solution space 43 | -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/config-4.yml: -------------------------------------------------------------------------------- 1 | # change the mpc_dataset_split w.r.t. 
config 3 to compare 2 | 3 | model_config: 4 | load_model: False # If set true, you must specify the model path, otherwise train a new model 5 | model_path: "storage/exp_4.ckpt" # the path to load the model 6 | n_states: 6 # environment states 7 | n_actions: 1 # how many controls we need 8 | n_hidden: 1 # hidden layer number 9 | size_hidden: 512 # hidden layer size 10 | use_cuda: True 11 | 12 | training_config: 13 | n_epochs: 60 # how many epoches to train the dynamic model 14 | learning_rate: 0.001 15 | batch_size: 256 16 | save_model_flag: True 17 | save_model_path: "storage/exp_4.ckpt" # the path to save the model 18 | save_loss_fig: True 19 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 20 | exp_number: 4 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_3.pkl" 25 | n_max_steps: 1000 # maximum steps per episode 26 | n_random_episodes: 800 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 8 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.8 # mpc dataset's portion in the training set 30 | min_train_samples: 6000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_4.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 12 # how long of the horizon to predict 38 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 20 # max iterations for the ABC optimization 40 | gamma: 0.99 # reward discount coefficient 41 | action_low: -12 # lower bound of the solution space 42 | action_high: 12 # upper bound of the solution space 43 | -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/loss-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Double/storage/loss-1.png -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/loss-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Double/storage/loss-2.png -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/loss-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Double/storage/loss-3.png -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/loss-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Double/storage/loss-4.png -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/model_error_exp_1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Double/storage/model_error_exp_1.png -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/reward-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Double/storage/reward-1.png -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/reward-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Double/storage/reward-2.png -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/reward-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Double/storage/reward-3.png -------------------------------------------------------------------------------- /MPC/MPC-Double/storage/reward-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Double/storage/reward-4.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/Hive/SelectionMethods.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- SELECTION METHODS 4 | 5 | __all__ = ["tournament", "disruptive"] 6 | 7 | # ---- MODULE DOCSTRING 8 | 9 | __doc__ = """ 10 | 11 | (C) Hive, Romain Wuilbercq, 2017 12 | _ 13 | /_/_ .'''. 14 | =O(_)))) ...' `. 15 | \_\ `. .'''X 16 | `..' 17 | .---. .---..-./`) ,---. ,---. .-''-. 18 | | | |_ _|\ .-.')| / | | .'_ _ \ 19 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 20 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 21 | | (_,_) .---. | _( )_ || (_,_)___| 22 | | _ _--. | | | \ (_ o._) /' \ .---. 23 | |( ' ) | | | | \ (_,_) / \ `-' / 24 | (_{;}_)| | | | \ / \ / 25 | '(_,_) '---' '---' `---` `'-..-' 26 | 27 | Description: 28 | ----------- 29 | 30 | SelectionMethods.py 31 | 32 | Defines a collection of selection methods to be used with Hive. 33 | 34 | """ 35 | 36 | # ---- IMPORT MODULES 37 | 38 | import random 39 | 40 | import numpy as np 41 | 42 | # ---- SELECTION METHOD(S) 43 | 44 | def tournament(values, crowd_size=None): 45 | """ 46 | 47 | Defines a selection process whereby a number of individuals 48 | from a colony/generation are selected to compete. 49 | 50 | Individuals with greater fitness values compared to the rest 51 | have higher chance to be kept for the next cycle/generation 52 | - i.e. survival of the fittest. This method prones elitism. 53 | 54 | A solution compete with a fixed number of randomly chosen individuals 55 | (i.e. "crowd_size") from the population. 56 | 57 | This function uses the "random.sample" function from the python base 58 | "random" module and the "np.where" function from the "numpy" module. 
59 | 60 | Parameters: 61 | ---------- 62 | 63 | :param int crowd_size: number of individuals competing 64 | 65 | """ 66 | 67 | # computes battle score metrics 68 | scores = [] 69 | for i in range(len(values)): 70 | 71 | # selects a pool of opponents randomly 72 | if (crowd_size != None) and (type(crowd_size) is int): 73 | opponents = random.sample(values, crowd_size) 74 | else: 75 | opponents = values 76 | 77 | # battles against opponents 78 | scores.append( sum(np.where(values[i]>opponents, 1, 0)) ) 79 | 80 | # returns an array of normalized scores 81 | return scores / sum(scores) 82 | 83 | def disruptive(values): 84 | """ 85 | 86 | Defines a selection process whereby a better chance is given to 87 | individuals with the highest and lowest fitness values - i.e. those 88 | further away from a "norm". 89 | 90 | This method represents a good mechanism by which diversity can 91 | be passed onto the next generation/cycle and avoid too-early 92 | convergence - i.e. improves the exploration of the search domain. 93 | 94 | This function uses the "np.mean" function from the "numpy" module. 95 | 96 | """ 97 | 98 | # computes mean fitness of population 99 | mean_ = np.mean(values) 100 | 101 | # computes score metrics 102 | scores = [] 103 | for i in range(len(values)): 104 | scores.append(abs(values[i] - mean_)) 105 | 106 | # returns an array of normalized scores 107 | return scores / sum(scores) 108 | 109 | # ---- END 110 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/Hive/Utilities.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- MODULE DOCSTRING 4 | 5 | __doc__ = """ 6 | 7 | (C) Hive, Romain Wuilbercq, 2017 8 | _ 9 | /_/_ .'''. 10 | =O(_)))) ...' `. 11 | \_\ `. .'''X 12 | `..' 13 | .---. .---..-./`) ,---. ,---. .-''-. 14 | | | |_ _|\ .-.')| / | | .'_ _ \ 15 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 16 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 17 | | (_,_) .---. | _( )_ || (_,_)___| 18 | | _ _--. | | | \ (_ o._) /' \ .---. 19 | |( ' ) | | | | \ (_,_) / \ `-' / 20 | (_{;}_)| | | | \ / \ / 21 | '(_,_) '---' '---' `---` `'-..-' 22 | 23 | Description: 24 | ----------- 25 | 26 | A series of utility functions (such as plotting function etc...). 27 | 28 | """ 29 | 30 | # ---- IMPORT MODULES 31 | 32 | try: 33 | import matplotlib.pyplot as plt 34 | from matplotlib.font_manager import FontProperties 35 | except: 36 | raise ImportError("Install 'matplotlib' to plot convergence results.") 37 | 38 | # ---- CONVERGENCE PLOT 39 | 40 | def ConvergencePlot(cost): 41 | """ 42 | 43 | Monitors convergence. 44 | 45 | Parameters: 46 | ---------- 47 | 48 | :param dict cost: mean and best cost over cycles/generations as returned 49 | by an optimiser. 
50 | 51 | """ 52 | plt.rc('font',family= 'Tibetan Machine Uni') 53 | fs=15 54 | font = FontProperties(); 55 | font.set_size( fs) #'larger'); 56 | labels = ["Best Bee's Cost", "Mean Bees' Cost"] 57 | plt.figure(figsize=(7, 4.5)); 58 | plt.plot(range(len(cost["best"])), cost["best"], label=labels[0]); 59 | plt.scatter(range(len(cost["mean"])), cost["mean"], color='red', label=labels[1]); 60 | plt.xlabel("Iteration",fontsize=fs); 61 | plt.ylabel("Cost",fontsize=fs); 62 | plt.xticks(fontsize=10) 63 | plt.yticks(fontsize=10) 64 | plt.legend(loc="best", prop = font); 65 | plt.xlim([0,len(cost["mean"])]); 66 | plt.grid(); 67 | plt.savefig("mpc.png") 68 | plt.show(); 69 | 70 | # ---- END 71 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/Hive/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # ---- MODULE DOCSTRING 4 | 5 | __doc__ = """ 6 | 7 | (C) Hive, Romain Wuilbercq, 2017 8 | _ 9 | /_/_ .'''. 10 | =O(_)))) ...' `. 11 | \_\ `. .'''X 12 | `..' 13 | .---. .---..-./`) ,---. ,---. .-''-. 14 | | | |_ _|\ .-.')| / | | .'_ _ \ 15 | | | ( ' )/ `-' \| | | .'/ ( ` ) ' 16 | | '-(_{;}_)`-'`"`| | _ | |. (_ o _) | 17 | | (_,_) .---. | _( )_ || (_,_)___| 18 | | _ _--. | | | \ (_ o._) /' \ .---. 19 | |( ' ) | | | | \ (_,_) / \ `-' / 20 | (_{;}_)| | | | \ / \ / 21 | '(_,_) '---' '---' `---` `'-..-' 22 | 23 | **Hive** is a simple implementation of a swarm-based optimisation 24 | algorithm called the Artificial Bee Colony (ABC) algorithm. 25 | 26 | The Artificial Bee Colony (ABC) algorithm is based on the intelligent foraging 27 | behaviour of honey bee swarm, proposed by Karaboga in 2005. 28 | 29 | """ 30 | 31 | __author__ = "Romain Wuilbercq" 32 | 33 | # ---- END 34 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/README.md: -------------------------------------------------------------------------------- 1 | # MPC - Qube 2 | This folder contains the implementation of MPC algorithm and the evaluation on the Qube environment 3 | 4 | The implementation is mainly followed in this paper [here](https://ieeexplore.ieee.org/abstract/document/8463189) 5 | 6 | To optimize the MPC controller, we use the [Artificial Bee Colony](https://en.wikipedia.org/wiki/Artificial_bee_colony_algorithm) (ABC) optimization algorithm, 7 | instead of the original random shooting method in the paper. The implementation of ABC algorithm is based on this repo: [https://github.com/rwuilbercq/Hive](https://github.com/rwuilbercq/Hive) 8 | 9 | All the hyper-parameters and experiment setting are stored in the ```config.yml``` file 10 | 11 | All the results (figure and model) will be stored in the ```./storage``` folder by default 12 | 13 | If you are not familiar with this environment, you can use the  `analyze_env()`  function in the `utils.py` to help you quickly understand the environment's state space, action space, reward range, etc. 
14 | 15 | ### How to run 16 | 17 | To try our pre-trained model, simply run 18 | 19 | ```angularjs 20 | python run.py --path config.yml 21 | ``` 22 | The script will load the configurations in the ```config.yml``` file and begin to train 23 | 24 | Note that because of the long time of optimization, boost the data with MPC controller would take a long time 25 | 26 | If you want to load the dataset and a pre-trained dynamic model, note that you should normalize the dataset first, because the dynamic model need the data distribution information. 27 | You can use the `norm_train_data()` method in the `DynamicModel` class. 28 | 29 | ### Configuration explanation 30 | 31 | In the ```config.yml``` file, there are 4 sets of configuration. 32 | 33 | The `model_config`  part is the configuration of the parameters which determine the neural network architecture and the environment basis. 34 | 35 | The `training_config` part is the configuration of the training process parameters. 36 | 37 | The `dataset_config` part is the configuration of the dataset parameters. 38 | 39 | The `mpc_config` part is the configuration of the MPC algorithm parameters. 40 | 41 | The `exp_number` parameter in the `training_config` is the number of your experiment. The name of saved figure results in the `./storage` folder will be determined by this parameter. 42 | 43 | If you want to train your model from scratch, then set the `load_model` parameter to `False`. If set to `True`, the trainer will load the model from `model_path`. 44 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/config.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_7.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 1 # hidden layer number 7 | size_hidden: 500 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 100 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 512 14 | save_model_flag: True 15 | save_model_path: "storage/exp_7.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 18 | exp_number: 5 # experiment number 19 | save_loss_fig_frequency: 100 # how many every epochs to save the loss figure 20 | exp_number: 7 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_7.pkl" 25 | n_max_steps: 500 # maximum steps per episode 26 | n_random_episodes: 700 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.1 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 30 | min_train_samples: 8000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_7.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 5 # how long of the horizon to predict 38 | numb_bees: 4 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 10 # max iterations for the ABC optimization 40 | gamma: 0.999 # reward discount 
coefficient 41 | max_itrs: 40 # max iterations for the ABC optimization 42 | action_low: -5 # lower bound of the solution space 43 | action_high: 5 # upper bound of the solution space 44 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/controller.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from Hive import Hive 3 | from Hive import Utilities 4 | 5 | 6 | class MPC(object): 7 | def __init__(self, env, config): 8 | self.env = env 9 | mpc_config = config["mpc_config"] 10 | self.horizon = mpc_config["horizon"] 11 | self.numb_bees = mpc_config["numb_bees"] 12 | self.max_itrs = mpc_config["max_itrs"] 13 | self.gamma = mpc_config["gamma"] 14 | self.action_low = mpc_config["action_low"] 15 | self.action_high = mpc_config["action_high"] 16 | self.evaluator = Evaluator(self.gamma) 17 | 18 | def act(self, state, dynamic_model): 19 | ''' 20 | Optimize the action by Artificial Bee Colony algorithm 21 | :param state: (numpy array) current state 22 | :param dynamic_model: system dynamic model 23 | :return: (float) optimal action 24 | ''' 25 | self.evaluator.update(state, dynamic_model) 26 | optimizer = Hive.BeeHive( lower = [float(self.action_low)] * self.horizon, 27 | upper = [float(self.action_high)] * self.horizon, 28 | fun = self.evaluator.evaluate, 29 | numb_bees = self.numb_bees, 30 | max_itrs = self.max_itrs, 31 | verbose=False) 32 | cost = optimizer.run() 33 | #print("Solution: ",optimizer.solution[0]) 34 | #print("Fitness Value ABC: {0}".format(optimizer.best)) 35 | # Uncomment this if you want to see the performance of the optimizer 36 | #Utilities.ConvergencePlot(cost) 37 | return optimizer.solution[0] 38 | 39 | class Evaluator(object): 40 | def __init__(self, gamma=0.8): 41 | self.gamma = gamma 42 | 43 | def update(self, state, dynamic_model): 44 | self.state = state 45 | self.dynamic_model = dynamic_model 46 | 47 | def evaluate(self, actions): 48 | actions = np.array(actions) 49 | horizon = actions.shape[0] 50 | rewards = 0 51 | state_tmp = self.state.copy() 52 | for j in range(horizon): 53 | input_data = np.concatenate( (state_tmp,[actions[j]]) ) 54 | state_dt = self.dynamic_model.predict(input_data) 55 | state_tmp = state_tmp + state_dt[0] 56 | rewards -= (self.gamma ** j) * self.get_reward(state_tmp, actions[j]) 57 | return rewards 58 | 59 | def get_reward(self,obs, action_n): 60 | cos_th, sin_th, cos_al, sin_al, th_d, al_d = obs 61 | cos_th = min(max(cos_th, -1), 1) 62 | cos_al = min(max(cos_al, -1), 1) 63 | al=np.arccos(cos_al) 64 | th=np.arccos(cos_th) 65 | al_mod = al % (2 * np.pi) - np.pi 66 | action = action_n * 5 67 | cost = al_mod**2 + 5e-3*al_d**2 + 1e-1*th**2 + 2e-2*th_d**2 + 3e-3*action**2 68 | reward = np.exp(-cost)*0.02 69 | return reward 70 | 71 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/run.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import gym 3 | import torch.utils.data as data 4 | from dynamics import * 5 | from controller import * 6 | from utils import * 7 | from quanser_robots.common import GentlyTerminating 8 | import time 9 | 10 | # datasets: numpy array, size:[sample number, input dimension] 11 | # labels: numpy array, size:[sample number, output dimension] 12 | 13 | env_id ="Qube-v0" # "CartPole-v0" 14 | env = GentlyTerminating(gym.make(env_id)) 15 | config_path = "config.yml" 16 | config = load_config(config_path) 17 | 
print_config(config_path) 18 | 19 | model = DynamicModel(config) 20 | 21 | data_fac = DatasetFactory(env,config) 22 | data_fac.collect_random_dataset() 23 | 24 | loss = model.train(data_fac.random_trainset,data_fac.random_testset) 25 | 26 | mpc = MPC(env,config) 27 | 28 | rewards_list = [] 29 | for itr in range(config["dataset_config"]["n_mpc_itrs"]): 30 | t = time.time() 31 | print("**********************************************") 32 | print("The reinforce process [%s], collecting data ..." % itr) 33 | rewards = data_fac.collect_mpc_dataset(mpc, model) 34 | trainset, testset = data_fac.make_dataset() 35 | rewards_list += rewards 36 | 37 | plt.close("all") 38 | plt.figure(figsize=(12, 5)) 39 | plt.title('Reward Trend with %s iteration' % itr) 40 | plt.plot(rewards_list) 41 | plt.savefig("storage/reward-" + str(model.exp_number) + ".png") 42 | print("Consume %s s in this iteration" % (time.time() - t)) 43 | loss = model.train(trainset, testset) -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/Angle Error h_0 100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/Angle Error h_0 100.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/State Error h_0 100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/State Error h_0 100.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/config-1.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_1.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 1 # hidden layer number 7 | size_hidden: 256 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 500 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 64 14 | save_model_flag: True 15 | save_model_path: "storage/exp_1.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 50 # how many every epochs to save the loss figure 18 | exp_number: 1 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_1.pkl" 23 | n_max_steps: 500 # maximum steps per episode 24 | n_random_episodes: 30 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 6 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_1.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 12 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 
37 | max_itrs: 20 # max iterations for the ABC optimization 38 | gamma: 0.98 # reward discount coefficient 39 | action_low: -5 # lower bound of the solution space 40 | action_high: 5 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/config-2.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_2.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 1 # hidden layer number 7 | size_hidden: 256 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 1000 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 32 14 | save_model_flag: True 15 | save_model_path: "storage/exp_2.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 50 # how many every epochs to save the loss figure 18 | exp_number: 2 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_2.pkl" 23 | n_max_steps: 500 # maximum steps per episode 24 | n_random_episodes: 30 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 6 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_2.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 15 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 15 # max iterations for the ABC optimization 38 | gamma: 0.99 # reward discount coefficient 39 | action_low: -5 # lower bound of the solution space 40 | action_high: 5 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/config-4.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_4.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 2 # hidden layer number 7 | size_hidden: 128 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 1000 # how many epoches to train the dynamic model 12 | learning_rate: 0.0006 13 | batch_size: 64 14 | save_model_flag: True 15 | save_model_path: "storage/exp_4.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 100 # how many every epochs to save the loss figure 18 | exp_number: 4 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_4.pkl" 23 | n_max_steps: 500 # maximum steps per episode 24 | n_random_episodes: 30 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | 
n_mpc_episodes: 8 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_4.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 20 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 20 # max iterations for the ABC optimization 38 | gamma: 0.98 # reward discount coefficient 39 | action_low: -5 # lower bound of the solution space 40 | action_high: 5 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/config-5.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_5.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 2 # hidden layer number 7 | size_hidden: 128 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 1000 # how many epoches to train the dynamic model 12 | learning_rate: 0.0006 13 | batch_size: 64 14 | save_model_flag: True 15 | save_model_path: "storage/exp_5.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 100 # how many every epochs to save the loss figure 18 | exp_number: 5 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_5.pkl" 23 | n_max_steps: 500 # maximum steps per episode 24 | n_random_episodes: 30 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 8 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_5.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 30 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 30 # max iterations for the ABC optimization 38 | gamma: 0.98 # reward discount coefficient 39 | action_low: -5 # lower bound of the solution space 40 | action_high: 5 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/config-6.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_6.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 2 # hidden layer number 7 | size_hidden: 128 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 1000 # how many epoches to train the dynamic model 12 | learning_rate: 0.0006 13 | batch_size: 64 14 | save_model_flag: True 15 | save_model_path: 
"storage/exp_6.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 100 # how many every epochs to save the loss figure 18 | exp_number: 6 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_6.pkl" 23 | n_max_steps: 500 # maximum steps per episode 24 | n_random_episodes: 30 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 8 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_6.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 20 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 40 # max iterations for the ABC optimization 38 | gamma: 0.999 # reward discount coefficient 39 | action_low: -5 # lower bound of the solution space 40 | action_high: 5 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/config-7.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_7.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 1 # hidden layer number 7 | size_hidden: 500 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 100 # how many epoches to train the dynamic model 12 | learning_rate: 0.001 13 | batch_size: 512 14 | save_model_flag: True 15 | save_model_path: "storage/exp_7.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 10 # how many every epochs to save the loss figure 18 | exp_number: 5 # experiment number 19 | save_loss_fig_frequency: 100 # how many every epochs to save the loss figure 20 | exp_number: 7 # experiment number 21 | 22 | dataset_config: 23 | load_flag: False 24 | load_path: "storage/data_exp_7.pkl" 25 | n_max_steps: 500 # maximum steps per episode 26 | n_random_episodes: 700 # how many random episodes' data to fit the initial dynamic model 27 | testset_split: 0.1 # testset's portion in the random dataset, the rest portion is the training set 28 | n_mpc_episodes: 4 # how many episodes data sampled with the MPC controller 29 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 30 | min_train_samples: 8000 31 | n_mpc_itrs: 100 # the number to perform reinforce iteration 32 | save_flag: True # set True if you want to save all the dataset 33 | save_path: "storage/data_exp_7.pkl" 34 | 35 | # MPC controller configuration 36 | mpc_config: 37 | horizon: 15 # how long of the horizon to predict 38 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 39 | max_itrs: 15 # max iterations for the ABC optimization 40 | gamma: 0.98 # reward discount coefficient 41 | max_itrs: 40 # max iterations for the ABC optimization 42 | gamma: 0.8 # reward discount coefficient 43 | action_low: -5 # lower bound of the solution space 44 | action_high: 5 # upper bound of the 
solution space 45 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/config_3.yml: -------------------------------------------------------------------------------- 1 | model_config: 2 | load_model: False # If set true, you must specify the model path, otherwise train a new model 3 | model_path: "storage/exp_5.ckpt" # the path to load the model 4 | n_states: 6 # environment states 5 | n_actions: 1 # how many controls we need 6 | n_hidden: 2 # hidden layer number 7 | size_hidden: 128 # hidden layer size 8 | use_cuda: True 9 | 10 | training_config: 11 | n_epochs: 1000 # how many epoches to train the dynamic model 12 | learning_rate: 0.0006 13 | batch_size: 64 14 | save_model_flag: True 15 | save_model_path: "storage/exp_5.ckpt" # the path to save the model 16 | save_loss_fig: True 17 | save_loss_fig_frequency: 100 # how many every epochs to save the loss figure 18 | exp_number: 5 # experiment number 19 | 20 | dataset_config: 21 | load_flag: False 22 | load_path: "storage/data_exp_5.pkl" 23 | n_max_steps: 500 # maximum steps per episode 24 | n_random_episodes: 30 # how many random episodes' data to fit the initial dynamic model 25 | testset_split: 0.2 # testset's portion in the random dataset, the rest portion is the training set 26 | n_mpc_episodes: 8 # how many episodes data sampled with the MPC controller 27 | mpc_dataset_split: 0.5 # mpc dataset's portion in the training set 28 | min_train_samples: 6000 29 | n_mpc_itrs: 100 # the number to perform reinforce iteration 30 | save_flag: True # set True if you want to save all the dataset 31 | save_path: "storage/data_exp_5.pkl" 32 | 33 | # MPC controller configuration 34 | mpc_config: 35 | horizon: 20 # how long of the horizon to predict 36 | numb_bees: 8 # ABC optimization algorithm param: number of the bees 37 | max_itrs: 40 # max iterations for the ABC optimization 38 | gamma: 0.999 # reward discount coefficient 39 | action_low: -5 # lower bound of the solution space 40 | action_high: 5 # upper bound of the solution space 41 | -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/loss-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/loss-1.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/loss-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/loss-2.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/loss-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/loss-3.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/loss-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/loss-4.png 
-------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/loss-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/loss-5.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/loss-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/loss-6.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/loss-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/loss-7.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/mpc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/mpc.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/reward-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/reward-1.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/reward-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/reward-2.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/reward-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/reward-3.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/reward-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/reward-4.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/reward-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/reward-5.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/reward-6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/reward-6.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/storage/reward-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/MPC/MPC-Qube/storage/reward-7.png -------------------------------------------------------------------------------- /MPC/MPC-Qube/test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import gym 3 | import torch.utils.data as data 4 | from dynamics import * 5 | from controller import * 6 | from utils import * 7 | from quanser_robots.common import GentlyTerminating 8 | import time 9 | 10 | def test(mpc, model): 11 | reward_episodes = [] 12 | for i in range(data_fac.n_mpc_episodes): 13 | data_tmp = [] 14 | label_tmp = [] 15 | reward_episode = 0 16 | state_old = data_fac.env.reset() 17 | for j in range(data_fac.n_max_steps): 18 | env.render() 19 | action = mpc.act(state_old, model) 20 | action = np.array([action]) 21 | data_tmp.append(np.concatenate((state_old, action))) 22 | state_new, reward, done, info = data_fac.env.step(action) 23 | reward_episode += reward 24 | label_tmp.append(state_new - state_old) 25 | if done: 26 | break 27 | state_old = state_new 28 | reward_episodes.append(reward_episode) 29 | print(f"Episode [{i}/{data_fac.n_mpc_episodes}], Reward: {reward_episode:.8f}") 30 | return reward_episodes 31 | 32 | env_id ="Qube-v0" # "CartPole-v0" 33 | env = GentlyTerminating(gym.make(env_id)) 34 | config_path = "config.yml" 35 | config = load_config(config_path) 36 | print_config(config_path) 37 | 38 | config["model_config"]["load_model"] = True 39 | config["dataset_config"]["load_flag"] = True 40 | 41 | model = DynamicModel(config) 42 | 43 | data_fac = DatasetFactory(env,config) 44 | model.norm_train_data(data_fac.all_dataset["data"],data_fac.all_dataset["label"]) 45 | 46 | mpc = MPC(env,config) 47 | 48 | rewards_list = [] 49 | for itr in range(config["dataset_config"]["n_mpc_itrs"]): 50 | t = time.time() 51 | print("**********************************************") 52 | print("The reinforce process [%s], collecting data ..." % itr) 53 | rewards = test(mpc, model) 54 | rewards_list += rewards 55 | plt.close("all") 56 | plt.figure(figsize=(12, 5)) 57 | plt.title('Reward Trend with %s iteration' % itr) 58 | plt.plot(rewards_list) 59 | plt.savefig("storage/reward-" + str(model.exp_number) + "_test.png") 60 | print("Consume %s s in this iteration" % (time.time() - t)) 61 | loss = model.trai 62 | -------------------------------------------------------------------------------- /MPC/README.md: -------------------------------------------------------------------------------- 1 | # MPC - Model Predictive Control 2 | 3 | This folder contains the implementation of MPC algorithm and the evaluation of it. 4 | 5 | The implementation is mainly followed in this paper [here](https://ieeexplore.ieee.org/abstract/document/8463189) 6 | 7 | To optimize the MPC controller, we use the [Artificial Bee Colony](https://en.wikipedia.org/wiki/Artificial_bee_colony_algorithm) (ABC) optimization algorithm, 8 | instead of the original random shooting method in the paper. 
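For reference, the random shooting baseline from the paper samples candidate action sequences uniformly over the solution space and keeps the best one under the learned dynamics. The sketch below is illustrative only and is not part of this repo; it reuses the `Evaluator.evaluate()` cost interface from `controller.py`, and the function and parameter names are made up for the example.

```python
import numpy as np

def random_shooting(evaluator, horizon, n_candidates, action_low, action_high):
    """Illustrative random shooting: sample action sequences uniformly, score
    each with evaluator.evaluate (which returns a cost, lower is better, as in
    controller.py), and return the first action of the best sequence."""
    candidates = np.random.uniform(action_low, action_high,
                                   size=(n_candidates, horizon))
    costs = [evaluator.evaluate(seq) for seq in candidates]
    best_seq = candidates[int(np.argmin(costs))]
    return best_seq[0]
```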
The implementation of the ABC algorithm is based on this repo: [https://github.com/rwuilbercq/Hive](https://github.com/rwuilbercq/Hive) 9 | 10 | Choose the environment folder and follow the instructions to run everything. 11 | 12 | A Jupyter notebook example is in the ```./MPC-CartPoleStab``` folder. 13 | 14 | ## Overview of the experiment results: 15 | 16 | 17 | The best results in different environments: 18 | 19 | | Environment | Horizon | Numb\_bees | Max\_itrs | Gamma | Episode reward | 20 | | -------- | -----: | :----: | :----: | :----: | :----: | 21 | | Qube | 30 | 8 | 30 | 0.98 | 4.0 | 22 | | CartPole Swingup | 20 | 8 | 20 | 0.99 | 2000 | 23 | | CartPole Stab | 12 | 8 | 20 | 0.99 | 19999 | 24 | | Double CartPole | 5 | 8 | 20 | 0.99 | 91 | 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning Course Project 2 | ## Note: This repo is deprecated; for a newer and faster implementation of MPC, please go to the following repo: [https://github.com/liuzuxin/MPC_template-model_predictive_control_for_reinforcement_learning](https://github.com/liuzuxin/MPC_template-model_predictive_control_for_reinforcement_learning) 3 | 4 | Technische Universität Darmstadt, winter semester 2018/2019 5 | 6 | Supervisors: Jan Peters, Riad Akrour 7 | 8 | This repository contains the PyTorch implementation of Deep Q-Network and Model Predictive Control (MPC), 9 | and their evaluation on the [quanser robot platform](https://git.ias.informatik.tu-darmstadt.de/quanser/clients). 10 | 11 | 12 | 13 | 14 | 15 | 16 | ## Authors 17 | + Zuxin Liu (Implemented the algorithms, cleaned the code, ran experiments and wrote the report) 18 | + Yunhao Li (Ran experiments and wrote the report) 19 | + Junfei Xiao (Ran experiments and wrote the report) 20 | 21 | ## Algorithms 22 | + [DQN](https://arxiv.org/abs/1312.5602) 23 | + [MPC](https://ieeexplore.ieee.org/abstract/document/8463189) 24 | 25 | ## Platforms 26 | + [Qube](https://git.ias.informatik.tu-darmstadt.de/quanser/clients/tree/master/quanser_robots/qube) 27 | + [Double Pendulum](https://git.ias.informatik.tu-darmstadt.de/quanser/clients/tree/master/quanser_robots/double_pendulum) 28 | + [Cartpole Swing-up](https://git.ias.informatik.tu-darmstadt.de/quanser/clients/tree/master/quanser_robots/cartpole) 29 | + [Cartpole Stab](https://git.ias.informatik.tu-darmstadt.de/quanser/clients/tree/master/quanser_robots/cartpole) 30 | 31 | ## Installation 32 | For the installation of the Quanser robot simulation environment, please see [this page](https://git.ias.informatik.tu-darmstadt.de/quanser/clients) 33 | 34 | For the implementation of the algorithms, the following packages are required: 35 | 36 | + python = 3.6.2 37 | + pytorch = 1.0.1 38 | + numpy = 1.12.1 39 | + matplotlib = 2.1.1 40 | + gym 41 | 42 | You can simply create the same environment as ours by using [Anaconda](https://www.anaconda.com/). 43 | All the required packages are included in the ```environment.yaml``` file. You can create the environment with the following command 44 | 45 | ```bash 46 | conda env create -f environment.yaml 47 | ``` 48 | Then, activate your environment with 49 | 50 | ``` 51 | source activate pytorch 52 | ``` 53 | 54 | ## How to run 55 | 56 | 1. Choose the algorithm you want to use and change to the corresponding folder (DQN or MPC) 57 | 2. Choose the environment you want to evaluate and change to the folder (CartPoleStab, Double, Qube or Swing) 58 | 3. 
Edit the configuration file ```config.yml``` with the parameters you want, then follow the instructions in that folder 59 | -------------------------------------------------------------------------------- /Resources/DQN/Playing Atari with Deep Reinforcement Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/DQN/Playing Atari with Deep Reinforcement Learning.pdf -------------------------------------------------------------------------------- /Resources/DQN/Q-Learning in Continuous State Action Spaces.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/DQN/Q-Learning in Continuous State Action Spaces.pdf -------------------------------------------------------------------------------- /Resources/DQN/README.md: -------------------------------------------------------------------------------- 1 | # Resources - DQN 2 | 3 | This folder contains some of the resources we used for DQN. 4 | -------------------------------------------------------------------------------- /Resources/MPC/Approximate Dynamic Programming with Gaussian Processes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/MPC/Approximate Dynamic Programming with Gaussian Processes.pdf -------------------------------------------------------------------------------- /Resources/MPC/Constrained model predictive control: Stability and optimality.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/MPC/Constrained model predictive control: Stability and optimality.pdf -------------------------------------------------------------------------------- /Resources/MPC/Neural Network Dynamics for Model based Deep Rl with Model free fine tuning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/MPC/Neural Network Dynamics for Model based Deep Rl with Model free fine tuning.pdf -------------------------------------------------------------------------------- /Resources/MPC/README.md: -------------------------------------------------------------------------------- 1 | # Resources - MPC 2 | 3 | This folder contains some of the resources we used for MPC. 4 | -------------------------------------------------------------------------------- /Resources/README.md: -------------------------------------------------------------------------------- 1 | # Resources 2 | 3 | This folder contains some of the resources we used in this project. 
4 | 5 | The `./figures` folder contains some GIF results from simulation and from the real environment. -------------------------------------------------------------------------------- /Resources/figures/README.md: -------------------------------------------------------------------------------- 1 | # Resources - figures 2 | 3 | This folder contains some GIF results from simulation and from the real environment. -------------------------------------------------------------------------------- /Resources/figures/qube-after-fine-tuning.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/figures/qube-after-fine-tuning.gif -------------------------------------------------------------------------------- /Resources/figures/qube-before-fine-tuning.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/figures/qube-before-fine-tuning.gif -------------------------------------------------------------------------------- /Resources/figures/qube.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/figures/qube.gif -------------------------------------------------------------------------------- /Resources/figures/stabe.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/figures/stabe.gif -------------------------------------------------------------------------------- /Resources/figures/swing.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/figures/swing.gif -------------------------------------------------------------------------------- /Resources/figures/swing_interesting.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/liuzuxin/Deep-Q-Network-and-Model-Predictive-Control-Project/3032a127c445b4821aad7ca38daade5caa8210a7/Resources/figures/swing_interesting.gif --------------------------------------------------------------------------------