├── .gitmodules ├── LICENSE ├── NVIDIA_CLA_v1.0.1.pdf ├── README.md ├── ddqn_main.py ├── delayed_env.py ├── delayed_q_diagram.png ├── dqn_agents.py ├── environment.yml ├── example_sweep.yml ├── gym_modifications ├── acrobot.py └── cartpole.py ├── init_main.py ├── pretrained_agents ├── 2xcbo7mg_Acrobot-v1_ddqn_delay.h5 └── i06rfoxy_cartpole_ddqn_no_delay.h5 ├── requirements.txt └── third_party └── gym.patch /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/gym"] 2 | path = third_party/gym 3 | url = git@github.com:openai/gym.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 Gal Dalal 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /NVIDIA_CLA_v1.0.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galdl/rl_delay_basic/2f88729d2339f468326cc38cd04e792617f544e7/NVIDIA_CLA_v1.0.1.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Acting in Delayed Environments with Non-Stationary Markov Policies 2 | This repository contains the implementation of the Delayed, Augmented, and Oblivious agents from the paper: 3 | "[Acting in Delayed Environments with Non-Stationary Markov Policies](https://arxiv.org/pdf/2101.11992)", Esther Derman\*, Gal Dalal\*, Shie Mannor (*equal contribution), published in ICLR 2021. 4 | 5 | The agent here supports the Cartpole and Acrobot environments from OpenAI Gym. The Atari-supported agent can be found [here](https://github.com/galdl/rl_delay_atari). 6 | 7 | 8 | 9 | 10 | ## Installation 11 | 1. Tested with Python 3.7; a Conda virtual env is encouraged. Other Python versions and/or environments should also work. 12 | 2. Clone the project and cd to the project dir. 13 | 3. Create a virtual env:\ Option 1 -- Tensorflow 2.2: Run `pip install -r requirements.txt` (other versions of the packages in requirements.txt should also be fine).\ Option 2 -- Tensorflow 1.14: Run `conda env create -f environment.yml` to directly create a virtual env called `tf_14`. 16 | 4. To enable support of the noisy Cartpole and Acrobot experiments, modify the original gym cartpole.py and acrobot.py:\ Option 1 -- via pip install: 18 | ```bash 19 | cd third_party 20 | git submodule sync && git submodule update --init --recursive 21 | cd gym 22 | git apply ../gym.patch 23 | pip install -e . 24 | ``` 25 | Option 2 -- manually:\ 4a. Find the gym location in site-packages, e.g., "/home/username/anaconda3/envs/rl_delay_env/lib/python3.7/site-packages/gym/envs/classic_control/cartpole.py"\ 4b. Overwrite the above file with "rl_delay_basic/gym_modifications/cartpole.py". Repeat the same process for "rl_delay_basic/gym_modifications/acrobot.py". 28 | 29 | ## Hyperparameters: 30 | The parameters used for the experiments in the paper are the default ones appearing in init_main.py. They are the same for all types of agents (delayed, augmented, oblivious), both noisy and non-noisy, and all delay values. The only exception is epsilon_decay: 0.999 for Cartpole and 0.9999 for Acrobot. 31 | 32 | ## Wandb sweep: 33 | Using wandb, you can easily run multiple experiments for different agents, delay values, hyperparameters, etc. An example sweep file is included in the project: example_sweep.yml. A sweep can be created via "wandb sweep example_sweep.yml", and multiple workers can be started with "wandb agent your-sweep-id"; example commands are given at the end of this README. For more details see https://docs.wandb.ai/guides/sweeps/quickstart. 34 | 35 | 36 | ## Citing the Project 37 | 38 | To cite this repository in publications: 39 | 40 | ``` 41 | @article{derman2021acting, 42 | title={Acting in delayed environments with non-stationary markov policies}, 43 | author={Derman, Esther and Dalal, Gal and Mannor, Shie}, 44 | journal={International Conference on Learning Representations (ICLR)}, 45 | year={2021} 46 | } 47 | ``` 48 | 49 | Happy delaying!
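
## Example commands

A minimal sketch of the launch commands described above, not an official part of the original instructions. It assumes `wandb login` has already been run; `your-sweep-id` stands for the id printed by `wandb sweep`, and the single-run variant assumes the defaults defined in init_main.py are picked up by `wandb.init`:

```bash
# single run with the default hyperparameters from init_main.py
python ddqn_main.py

# grid sweep over the agents/delays listed in example_sweep.yml;
# the first command prints a sweep id to pass to each worker
wandb sweep example_sweep.yml
wandb agent your-sweep-id   # launch on as many machines/GPUs as desired
```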
50 | -------------------------------------------------------------------------------- /ddqn_main.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import os 4 | import numpy as np 5 | from dqn_agents import DDQNAgent, DDQNPlanningAgent, update_loss, reshape_state 6 | from init_main import init_main 7 | import wandb 8 | from tqdm import tqdm 9 | 10 | import socket 11 | 12 | # # possible cuda fix for mac 13 | # os.environ['KMP_DUPLICATE_LIB_OK']='True' 14 | 15 | # # don't use GPU (if running, e.g., on mac) 16 | # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 17 | # os.environ["CUDA_VISIBLE_DEVICES"] = "" 18 | 19 | AVERAGE_OVER_LAST_EP = 0.1 20 | # EPISODES = 3500 21 | SAVE_PATH = 'pretrained_agents' 22 | EP_LEN_LIMIT = int(1e4) 23 | EVAL_FREQ = 5 24 | 25 | 26 | def init_episode(delayed_env, agent, augment_state, state_size): 27 | ep_reward = 0 28 | ep_reshaped_reward = 0 29 | state = delayed_env.reset() 30 | state = massage_state(state, augment_state, delayed_env, state_size) 31 | agent.clear_action_buffer() 32 | loss_dict = {} 33 | loss_count = 0 34 | ep_step = 0 35 | return ep_reward, ep_reshaped_reward, state, loss_dict, loss_count, ep_step 36 | 37 | def routinely_save_agent(e, env_name): 38 | agent_name = env_name + '_ddqn_delay.h5' 39 | if e % 349 == 0: 40 | if not os.path.isdir(SAVE_PATH): 41 | os.makedirs(SAVE_PATH) 42 | agent_full_name = wandb.run.id + '_' + agent_name 43 | agent_path = os.path.join(SAVE_PATH, agent_full_name) 44 | agent.save(agent_path) 45 | print('saved agent to {}'.format(agent_path)) 46 | 47 | 48 | def agent_act(config, agent, state, delayed_env, eval=False): 49 | if config.agent_type == 'delayed': 50 | action = agent.act(state, pending_actions=delayed_env.get_pending_actions(), eval=eval) 51 | else: 52 | action = agent.act(state, eval) 53 | return action 54 | 55 | def massage_state(state, augment_state, delayed_env, state_size): 56 | if augment_state: 57 | state = np.concatenate((state, delayed_env.get_pending_actions())) 58 | state = reshape_state(state, delayed_env.is_atari_env, state_size) 59 | return state 60 | 61 | if __name__ == "__main__": 62 | config, delayed_env, state_size, action_size, done, batch_size = init_main() 63 | 64 | score_vec = [] 65 | # for non-atari (i.e. 
cartpole) env, run on CPU 66 | # if not delayed_env.is_atari_env: 67 | 68 | 69 | kwargs = { 70 | 'action_size': action_size, 71 | 'is_atari_env': delayed_env.is_atari_env, 72 | 'is_delayed_agent': config.is_delayed_agent, 73 | 'delay_value': config.delay_value, 74 | 'epsilon_min': config.epsilon_min, 75 | 'epsilon_decay': config.epsilon_decay, 76 | 'learning_rate': config.learning_rate, 77 | 'epsilon': config.epsilon, 78 | 'use_m_step_reward': config.use_m_step_reward, 79 | 'use_latest_reward': config.use_latest_reward 80 | } 81 | 82 | # if not config.double_q: 83 | # agent = DQNAgent(state_size=state_size, **kwargs) 84 | # else: 85 | augment_state = False 86 | # wandb.config.update({'augment_state': False}, allow_val_change=True) 87 | if config.agent_type == 'delayed': 88 | agent = DDQNPlanningAgent(state_size=state_size, env=delayed_env, 89 | use_learned_forward_model=config.use_learned_forward_model, **kwargs) 90 | else: 91 | if config.agent_type == 'augmented': 92 | # wandb.config.update({'augment_state': True}, allow_val_change=True) 93 | augment_state = True 94 | state_size += config.delay_value 95 | # third option is 'oblivious' 96 | agent = DDQNAgent(state_size=state_size, **kwargs) 97 | 98 | episode = 0 99 | ep_reward, ep_reshaped_reward, state, loss_dict, loss_count, ep_step = init_episode(delayed_env, agent, 100 | augment_state, state_size) 101 | total_steps_delay_dependent = int(100000 + config.delay_value * 10000) 102 | # eval_done = False 103 | for step_num in tqdm(range(total_steps_delay_dependent)): 104 | # if episode % EVAL_FREQ == 0: 105 | # while not eval_done: 106 | # action = agent_act(config, agent, state, delayed_env, eval=True) 107 | # next_state, eval_reward, eval_done, _ = delayed_env.step(action) 108 | # state = massage_state(next_state, config, delayed_env, state_size) 109 | # ep_reward += eval_reward 110 | # wandb.log({'reward_eval': ep_reward}, step=step_num) 111 | # episode += 1 112 | # else: 113 | # for step in range(EP_LEN_LIMIT): 114 | # delayed_env.orig_env.render() 115 | action = agent_act(config, agent, state, delayed_env, eval=False) 116 | next_state, reward, done, _ = delayed_env.step(action) 117 | ep_reward += reward 118 | if config.use_reward_shaping and not delayed_env.is_atari_env: 119 | reward = delayed_env.get_shaped_reward(next_state, reward) 120 | ep_reshaped_reward += reward 121 | next_state = massage_state(next_state, augment_state, delayed_env, state_size) 122 | can_memorize = ep_step > config.delay_value or not delayed_env.pretrained_agent_loaded 123 | if can_memorize: # otherwise, we're using expert samples initially which is unfair 124 | agent.memorize(state, action, reward, next_state, done) 125 | state = next_state 126 | if config.double_q and step_num % config.target_network_update_freq == 0: 127 | agent.update_target_model() 128 | if len(agent.memory) > batch_size and step_num % config.train_freq == 0: 129 | batch_loss_dict = agent.replay(batch_size) 130 | update_loss(loss_dict, batch_loss_dict) 131 | loss_count += 1 132 | ep_step += 1 133 | if done: 134 | routinely_save_agent(episode, config.env_name) 135 | wandb_dict = {'reward': ep_reward, 'ep_reshaped_reward': ep_reshaped_reward} 136 | if 'f_model_loss' in loss_dict: 137 | f_model_loss = loss_dict['f_model_loss'] / loss_count 138 | wandb_dict['f_model_loss'] = f_model_loss 139 | wandb.log(wandb_dict, step=step_num) 140 | score_vec.append(ep_reward) 141 | episode += 1 142 | ep_reward, ep_reshaped_reward, state, loss_dict, loss_count, ep_step = init_episode(delayed_env, agent, 
augment_state, 143 | state_size) 144 | 145 | tot_ep_num = len(score_vec) 146 | avg_over = round(tot_ep_num * AVERAGE_OVER_LAST_EP) 147 | final_avg_score = np.mean(score_vec[-avg_over:]) 148 | wandb.log({'final_score': final_avg_score}) 149 | -------------------------------------------------------------------------------- /delayed_env.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from collections import deque 4 | from dqn_agents import DQNAgent 5 | import numpy as np 6 | from dqn_agents import reshape_state 7 | from numpy import sin, cos, pi 8 | 9 | CARTPOLE_TRAINED_NON_DELAYED_AGENT_PATH = 'pretrained_agents/i06rfoxy_cartpole_ddqn_no_delay.h5' 10 | 11 | class DelayedEnv: 12 | def __init__(self, orig_env, delay_value): 13 | self.orig_env = orig_env 14 | self.env_name = str(self.orig_env) 15 | self.is_atari_env = 'AtariEnv' in self.env_name 16 | self.pending_actions = deque() 17 | self.delay_value = delay_value 18 | self.state_size = orig_env.observation_space.shape#[0] 19 | if not self.is_atari_env: 20 | self.state_size = self.state_size[0] 21 | self.action_size = orig_env.action_space.n 22 | self.stored_init_state = None 23 | self.trained_non_delayed_agent = DQNAgent(state_size=self.state_size, 24 | action_size=self.action_size, is_delayed_agent=False, 25 | delay_value=0, epsilon=0, is_atari_env=self.is_atari_env) 26 | self.pretrained_agent_loaded = False 27 | 28 | if 'CartPole' in self.env_name: # for other envs this is not necessary 29 | self.trained_non_delayed_agent.load(CARTPOLE_TRAINED_NON_DELAYED_AGENT_PATH) 30 | self.pretrained_agent_loaded = True 31 | 32 | def step(self, action): 33 | if self.delay_value > 0: 34 | self.pending_actions.append(action) 35 | if len(self.pending_actions) - 1 >= self.delay_value: 36 | executed_action = self.pending_actions.popleft() 37 | else: 38 | curr_state = reshape_state(self.get_curr_state(), self.is_atari_env, self.state_size) 39 | executed_action = self.trained_non_delayed_agent.act(curr_state) 40 | else: 41 | executed_action = action 42 | return self.orig_env.step(executed_action) 43 | 44 | def reset(self): 45 | self.pending_actions.clear() 46 | return self.orig_env.reset() 47 | 48 | def get_shaped_reward(self, state, orig_reward): 49 | reward = orig_reward 50 | if 'CartPole' in self.env_name: 51 | x, x_dot, theta, theta_dot = state 52 | r1 = (self.orig_env.x_threshold - abs(x)) / self.orig_env.x_threshold - 0.8 53 | r2 = (self.orig_env.theta_threshold_radians - abs( 54 | theta)) / self.orig_env.theta_threshold_radians - 0.5 55 | reward = r1 + r2 56 | if 'MountainCar' in self.env_name: 57 | # # Adjust reward based on car position 58 | # reward = state[0] + 0.5 59 | # # Adjust reward for task completion 60 | # if state[0] >= 0.5: 61 | # reward += 1 62 | position = state[0] 63 | reward = (position - self.orig_env.goal_position) / ((self.orig_env.max_position - self.orig_env.min_position) * 10) 64 | # print(position, self.goal_position) 65 | if position >= 0.1: 66 | reward += 10 67 | elif position >= 0.25: 68 | reward += 50 69 | elif position >= 0.5: 70 | reward += 100 71 | return reward 72 | 73 | def get_pending_actions(self): 74 | if len(self.pending_actions) == 0 and self.delay_value > 0: 75 | # reconstruct anticipated trajectory using the oracle 76 | self.store_initial_state() 77 | curr_state = self.get_curr_state() 78 | for i in range(self.delay_value): 79 | curr_state = reshape_state(curr_state, self.is_atari_env, self.state_size) 80 | estimated_action = 
self.trained_non_delayed_agent.act(curr_state) 81 | self.pending_actions.append(estimated_action) 82 | curr_state = self.get_next_state(state=None, action=estimated_action) 83 | self.restore_initial_state() 84 | 85 | return self.pending_actions 86 | 87 | def store_initial_state(self): 88 | if self.is_atari_env: 89 | self.stored_init_state = self.orig_env.clone_state() 90 | else: 91 | self.stored_init_state = self.orig_env.unwrapped.state 92 | 93 | def restore_initial_state(self): 94 | if self.is_atari_env: 95 | self.orig_env.restore_state(self.stored_init_state) 96 | else: 97 | self.orig_env.unwrapped.state = self.stored_init_state 98 | 99 | def get_curr_state(self): 100 | if self.is_atari_env: 101 | curr_state = self.orig_env.ale.getScreenRGB2() 102 | else: 103 | curr_state = self.orig_env.unwrapped.state 104 | if 'Acrobot' in self.env_name: 105 | curr_state = np.array([cos(curr_state[0]), sin(curr_state[0]), cos(curr_state[1]), sin(curr_state[1]), 106 | curr_state[2], curr_state[3]]) 107 | return curr_state 108 | 109 | def get_next_state(self, state, action): 110 | next_state, _, _, _ = self.orig_env.step(action) 111 | self.orig_env._elapsed_steps -= 1 112 | return next_state 113 | 114 | def reset_to_state(self, state): 115 | self.orig_env.unwrapped.state = state 116 | # -------------------------------------------------------------------------------- /delayed_q_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galdl/rl_delay_basic/2f88729d2339f468326cc38cd04e792617f544e7/delayed_q_diagram.png -------------------------------------------------------------------------------- /dqn_agents.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from collections import deque 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Conv2D, MaxPool2D, Flatten 6 | from copy import deepcopy 7 | import random 8 | from keras.optimizers import Adam 9 | from keras import backend as K 10 | import tensorflow as tf 11 | import numpy as np 12 | 13 | def reshape_state(state, is_atari_env, state_size): 14 | reshaped = state 15 | if not is_atari_env: 16 | reshaped = np.reshape(state, [1, state_size]) 17 | else: 18 | if len(state.shape) < 4: 19 | reshaped = np.expand_dims(state, axis=0) 20 | return reshaped 21 | 22 | def update_loss(loss, sample_loss): 23 | if loss is not None and sample_loss is not None: 24 | for key, val in sample_loss.items(): 25 | if key in loss: 26 | loss[key] += val 27 | else: 28 | loss[key] = val 29 | 30 | def concatenate_state_action(state, action): 31 | out = np.concatenate((state[0], [action])) 32 | out = np.reshape(out, [1, len(out)]) 33 | return out 34 | 35 | class DQNAgent: 36 | def __init__(self, state_size, action_size, is_atari_env, is_delayed_agent=False, delay_value=0, epsilon_min=0.001, 37 | epsilon_decay=0.999, learning_rate=0.001, epsilon=1.0, use_m_step_reward=False, use_latest_reward=True, 38 | loss='mse', **kwargs): 39 | self.state_size = state_size 40 | self.action_size = action_size 41 | self.is_atari_env = is_atari_env 42 | mem_len = 50000 if self.is_atari_env else 2000 43 | self.memory = deque(maxlen=mem_len) 44 | self.gamma = 0.95 # discount rate 45 | self.epsilon = epsilon # exploration rate 46 | self.epsilon_min = epsilon_min 47 | self.epsilon_decay = epsilon_decay #0.995 48 | self.learning_rate = learning_rate 49 | self.sample_buffer = deque() 50 | self.is_delayed_agent = is_delayed_agent 
= is_delayed_agent 51 | self.delay_value = delay_value 52 | self.model = self._build_model(loss=loss) 53 | self.use_m_step_reward = use_m_step_reward 54 | self.use_latest_reward = use_latest_reward 55 | 56 | 57 | def _huber_loss(self, y_true, y_pred, clip_delta=1.0): 58 | """Huber loss for Q Learning 59 | References: https://en.wikipedia.org/wiki/Huber_loss 60 | https://www.tensorflow.org/api_docs/python/tf/losses/huber_loss 61 | """ 62 | error = y_true - y_pred 63 | cond = K.abs(error) <= clip_delta 64 | 65 | squared_loss = 0.5 * K.square(error) 66 | quadratic_loss = 0.5 * K.square(clip_delta) + clip_delta * (K.abs(error) - clip_delta) 67 | 68 | return K.mean(tf.where(cond, squared_loss, quadratic_loss)) 69 | 70 | def _build_model(self, loss=None, input_size=None, output_size=None): 71 | loss = self._huber_loss if loss == 'huber' else loss 72 | input_size = self.state_size if input_size is None else input_size 73 | output_size = self.action_size if output_size is None else output_size 74 | 75 | # Neural Net for Deep-Q learning Model 76 | model = Sequential() 77 | if self.is_atari_env: 78 | model.add(Conv2D(32, 8, strides=(4,4), input_shape=input_size, activation='relu')) 79 | model.add(MaxPool2D()) 80 | model.add(Conv2D(64, 4, strides=(2,2), activation='relu')) 81 | model.add(MaxPool2D()) 82 | model.add(Conv2D(64, 3, strides=(1,1), activation='relu')) 83 | model.add(MaxPool2D()) 84 | model.add(Flatten()) 85 | model.add(Dense(64, activation='relu')) 86 | model.add(Dense(64, activation='relu')) 87 | model.add(Dense(output_size, activation='linear')) 88 | else: 89 | model.add(Dense(24, input_dim=input_size, activation='relu')) 90 | model.add(Dense(24, activation='relu')) 91 | model.add(Dense(output_size, activation='linear')) 92 | 93 | model.compile(loss=loss, 94 | optimizer=Adam(lr=self.learning_rate)) 95 | return model 96 | 97 | def memorize(self, state, action, reward, next_state, done): 98 | if self.is_delayed_agent: 99 | # for earlier time than delay_value, the data is problematic (non-delayed response) 100 | # Construct modified tuple by keeping old s_t with new a_{t+m}, r_{t+m} s_{t+m+1} 101 | new_tuple = (state, action, reward, next_state, done) 102 | self.sample_buffer.append(new_tuple) 103 | if len(self.sample_buffer) - 1 >= self.delay_value: 104 | old_tuple = self.sample_buffer.popleft() 105 | modified_tuple = list(deepcopy(old_tuple)) 106 | modified_tuple[1] = action 107 | modified_tuple[2] = self.m_step_reward(first_reward=old_tuple[2]) 108 | # trying to use s_{t+1} instead of s_{t+m} as in the original ICML2020 submission 109 | # modified_tuple[3] = next_state 110 | modified_tuple = tuple(modified_tuple) 111 | self.memory.append(modified_tuple) 112 | else: 113 | self.memory.append((state, action, reward, next_state, done)) 114 | 115 | def act(self, state, eval=False): 116 | if not eval and np.random.rand() <= self.epsilon: 117 | return random.randrange(self.action_size) 118 | act_values = self.model.predict(state) 119 | return np.argmax(act_values[0]) # returns action 120 | 121 | def m_step_reward(self, first_reward): 122 | if not self.use_m_step_reward: 123 | if self.use_latest_reward: 124 | return self.sample_buffer[-1][2] 125 | else: 126 | return first_reward 127 | else: 128 | discounted_rew = first_reward 129 | for i in range(self.delay_value): 130 | discounted_rew += self.gamma ** (i + 1) * self.sample_buffer[i][2] 131 | return discounted_rew 132 | 133 | def effective_gamma(self): 134 | return self.gamma if not self.use_m_step_reward else (self.gamma ** (self.delay_value + 1)) 135 | 136 |
def replay(self, batch_size): 137 | minibatch = random.sample(self.memory, batch_size) 138 | for state, action, reward, next_state, done in minibatch: 139 | target = reward 140 | if not done: 141 | target = (reward + self.effective_gamma() * 142 | np.amax(self.model.predict(next_state)[0])) 143 | target_f = self.model.predict(state) 144 | target_f[0][action] = target 145 | # self.model.fit(state, target_f, epochs=1, verbose=0, 146 | # callbacks=[WandbCallback()]) 147 | self.model.fit(state, target_f, epochs=1, verbose=0) 148 | if self.epsilon > self.epsilon_min: 149 | self.epsilon *= self.epsilon_decay 150 | 151 | 152 | def load(self, name): 153 | self.model.load_weights(name) 154 | 155 | def save(self, name): 156 | self.model.save_weights(name) 157 | 158 | def clear_action_buffer(self): 159 | self.sample_buffer.clear() 160 | 161 | 162 | class DDQNAgent(DQNAgent): 163 | def __init__(self, state_size, action_size, is_atari_env, is_delayed_agent=False, delay_value=0, epsilon_min=0.001, 164 | epsilon_decay=0.999, learning_rate=0.001, epsilon=1.0, use_m_step_reward=False, use_latest_reward=True): 165 | super().__init__(state_size, action_size, is_atari_env=is_atari_env, is_delayed_agent=is_delayed_agent, delay_value=delay_value, 166 | epsilon_min=epsilon_min, epsilon_decay=epsilon_decay, learning_rate=learning_rate, 167 | epsilon=epsilon, use_m_step_reward=use_m_step_reward, use_latest_reward=use_latest_reward, 168 | loss='huber') 169 | # self.model = self._build_model() 170 | self.target_model = self._build_model(loss='huber') 171 | self.update_target_model() 172 | 173 | 174 | def update_target_model(self): 175 | # copy weights from model to target_model 176 | self.target_model.set_weights(self.model.get_weights()) 177 | 178 | def train_model(self, batch): 179 | state_vec, action_vec, reward_vec, next_state_vec, done_vec = batch 180 | target = self.model.predict(state_vec) 181 | # a = self.model.predict(next_state)[0] 182 | t = self.target_model.predict(next_state_vec)#[0] 183 | not_done_arr = np.invert(np.asarray(done_vec)) 184 | new_targets = reward_vec + not_done_arr * self.effective_gamma() * np.amax(t, axis=1) 185 | for i in range(len(batch[0])): 186 | target[i][action_vec[i]] = new_targets[i] 187 | # target[0][action] = reward + self.gamma * t[np.argmax(a)] 188 | train_history = self.model.fit(state_vec, target, epochs=1, verbose=0) 189 | q_loss = train_history.history['loss'][0] 190 | loss_dict = {'q_loss': q_loss} 191 | return loss_dict 192 | 193 | def _create_batch(self, indices): 194 | state_vec, action_vec, reward_vec, next_state_vec, done_vec = [], [], [], [], [] 195 | for i in indices: 196 | data = self.memory[i] 197 | state, action, reward, next_state, done = data 198 | state_vec.append(np.array(state, copy=False)) 199 | action_vec.append(action) 200 | reward_vec.append(reward) 201 | next_state_vec.append(np.array(next_state, copy=False)) 202 | done_vec.append(done) 203 | return np.concatenate(state_vec, axis=0), action_vec, reward_vec, np.concatenate(next_state_vec, axis=0), done_vec 204 | 205 | def replay(self, batch_size): 206 | loss = {} 207 | indices = np.random.choice(len(self.memory), batch_size) 208 | batch = self._create_batch(indices) 209 | sample_loss = self.train_model(batch) 210 | update_loss(loss, sample_loss) 211 | if self.epsilon > self.epsilon_min: 212 | self.epsilon *= self.epsilon_decay 213 | return loss 214 | 215 | class DDQNPlanningAgent(DDQNAgent): 216 | def __init__(self, state_size, action_size, is_atari_env, is_delayed_agent=False, delay_value=0, 
epsilon_min=0.001, 217 | epsilon_decay=0.999, learning_rate=0.001, epsilon=1.0, use_m_step_reward=False, 218 | use_latest_reward=True, env=None, use_learned_forward_model=True): 219 | super().__init__(state_size, action_size, is_atari_env=is_atari_env, is_delayed_agent=is_delayed_agent, delay_value=delay_value, 220 | epsilon_min=epsilon_min, epsilon_decay=epsilon_decay, learning_rate=learning_rate, 221 | epsilon=epsilon, use_m_step_reward=use_m_step_reward, use_latest_reward=use_latest_reward) 222 | self.use_learned_forward_model = use_learned_forward_model 223 | if self.use_learned_forward_model: 224 | keras_forward_model = self._build_model(loss='mse', input_size=self.state_size + 1, output_size=self.state_size) 225 | self.forward_model = ForwardModel(keras_forward_model) 226 | else: 227 | self.forward_model = env 228 | 229 | def train_model(self, batch): 230 | loss_dict = super().train_model(batch) 231 | if self.use_learned_forward_model and self.delay_value > 0: 232 | state_vec, action_vec, _, next_state_vec, _ = batch 233 | act_t = np.asarray([action_vec]).transpose() 234 | concat_vec = np.concatenate((state_vec, act_t), axis=1) 235 | train_history = self.forward_model.keras_model.fit(concat_vec, next_state_vec, epochs=1, verbose=0) 236 | f_model_loss = train_history.history['loss'][0] 237 | loss_dict['f_model_loss'] = f_model_loss 238 | return loss_dict 239 | 240 | def act(self, state, pending_actions, eval): 241 | if not eval and np.random.rand() <= self.epsilon: 242 | return random.randrange(self.action_size) 243 | last_state = state 244 | if self.delay_value > 0: 245 | if not self.use_learned_forward_model: 246 | self.forward_model.store_initial_state() 247 | # initial_state = deepcopy(state) 248 | for curr_action in pending_actions: 249 | last_state = self.forward_model.get_next_state(state=last_state, action=curr_action) 250 | if not self.use_learned_forward_model: 251 | self.forward_model.restore_initial_state() 252 | last_state_r = reshape_state(last_state, self.is_atari_env, self.state_size) 253 | act_values = self.model.predict(last_state_r) 254 | return np.argmax(act_values[0]) # returns best action for last state 255 | 256 | def memorize(self, state, action, reward, next_state, done): 257 | # for earlier time than delay_value, the data is problematic (non-delayed response) 258 | # Construct modified tuple by keeping old s_t with new a_{t+m}, r_{t+m} s_{t+m+1} 259 | new_tuple = (state, action, reward, next_state, done) 260 | self.sample_buffer.append(new_tuple) 261 | if len(self.sample_buffer) - 1 >= self.delay_value: 262 | old_tuple = self.sample_buffer.popleft() 263 | modified_tuple = list(deepcopy(old_tuple)) 264 | # build time-coherent tuple from new tuple and old action 265 | modified_tuple[0] = state 266 | # modified_tuple[1] = action 267 | modified_tuple[2] = reward #self.m_step_reward(first_reward=old_tuple[2]) 268 | modified_tuple[3] = next_state 269 | modified_tuple = tuple(modified_tuple) 270 | self.memory.append(modified_tuple) 271 | 272 | class ForwardModel: 273 | def __init__(self, keras_model): 274 | self.keras_model = keras_model 275 | 276 | def get_next_state(self, state, action): 277 | input = concatenate_state_action(state, action) 278 | return self.keras_model.predict(input) 279 | 280 | def reset_to_state(self, state): 281 | # not necessary here. 
Only used if the forwrad_model is the actual env instance 282 | pass -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: tf_14 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _tflow_select=2.1.0=gpu 9 | - absl-py=0.10.0=py37_0 10 | - astor=0.8.1=py37_0 11 | - blas=1.0=mkl 12 | - c-ares=1.16.1=h7b6447c_0 13 | - ca-certificates=2021.5.25=h06a4308_1 14 | - certifi=2021.5.30=py37h06a4308_0 15 | - cudatoolkit=10.1.243=h6bb024c_0 16 | - cudnn=7.6.5=cuda10.1_0 17 | - cupti=10.1.168=0 18 | - gast=0.4.0=py_0 19 | - google-pasta=0.2.0=py_0 20 | - grpcio=1.31.0=py37hf8bcb03_0 21 | - h5py=2.10.0=py37hd6299e0_1 22 | - hdf5=1.10.6=hb1b8bf9_0 23 | - importlib-metadata=2.0.0=py_1 24 | - intel-openmp=2020.2=254 25 | - keras-applications=1.0.8=py_1 26 | - keras-preprocessing=1.1.0=py_1 27 | - ld_impl_linux-64=2.33.1=h53a641e_7 28 | - libblas=3.8.0=14_mkl 29 | - libcblas=3.8.0=14_mkl 30 | - libffi=3.3=he6710b0_2 31 | - libgcc-ng=9.1.0=hdf63c60_0 32 | - libgfortran-ng=7.3.0=hdf63c60_0 33 | - liblapack=3.8.0=14_mkl 34 | - libprotobuf=3.13.0.1=hd408876_0 35 | - libstdcxx-ng=9.1.0=hdf63c60_0 36 | - markdown=3.3.2=py37_0 37 | - mkl=2019.4=243 38 | - mkl-service=2.3.0=py37he904b0f_0 39 | - mkl_fft=1.2.0=py37h23d657b_0 40 | - mkl_random=1.1.0=py37hd6b4f25_0 41 | - ncurses=6.2=he6710b0_1 42 | - openssl=1.1.1k=h27cfd23_0 43 | - pip=21.0.1=py37h06a4308_0 44 | - protobuf=3.13.0.1=py37he6710b0_1 45 | - python=3.7.10=hdb3f193_0 46 | - python_abi=3.7=1_cp37m 47 | - readline=8.1=h27cfd23_0 48 | - scipy=1.5.2=py37h0b6359f_0 49 | - setuptools=52.0.0=py37h06a4308_0 50 | - six=1.15.0=py_0 51 | - sqlite=3.35.2=hdfb4753_0 52 | - tensorboard=1.14.0=py37hf484d3e_0 53 | - tensorflow=1.14.0=gpu_py37h74c33d7_0 54 | - tensorflow-base=1.14.0=gpu_py37he45bfe2_0 55 | - tensorflow-estimator=1.14.0=py_0 56 | - tensorflow-gpu=1.14.0=h0d30ee6_0 57 | - termcolor=1.1.0=py37_1 58 | - tk=8.6.10=hbc83047_0 59 | - unzip=6.0=h611a1e1_0 60 | - werkzeug=1.0.1=py_0 61 | - wheel=0.36.2=pyhd3eb1b0_0 62 | - wrapt=1.12.1=py37h7b6447c_1 63 | - xz=5.2.5=h7b6447c_0 64 | - zip=3.0=h1adfe0e_0 65 | - zipp=3.3.1=py_0 66 | - zlib=1.2.11=h7b6447c_3 67 | - pip: 68 | - atari-py==0.2.9 69 | - backcall==0.2.0 70 | - chardet==4.0.0 71 | - click==8.0.1 72 | - cloudpickle==1.6.0 73 | - configparser==5.0.2 74 | - cvxopt==1.2.6 75 | - decorator==5.0.9 76 | - docker-pycreds==0.4.0 77 | - gitdb==4.0.7 78 | - gitpython==3.1.18 79 | - gym==0.18.3 80 | - idna==2.10 81 | - ipdb==0.13.9 82 | - ipython==7.25.0 83 | - ipython-genutils==0.2.0 84 | - jedi==0.18.0 85 | - keras==2.3.1 86 | - matplotlib-inline==0.1.2 87 | - nashpy==0.0.20 88 | - numpy==1.20.1 89 | - nvidia-htop==1.0.2 90 | - opencv-python==4.5.2.54 91 | - opencv-python-headless==4.5.2.54 92 | - parso==0.8.2 93 | - pathtools==0.1.2 94 | - pexpect==4.8.0 95 | - pickleshare==0.7.5 96 | - pillow==8.2.0 97 | - promise==2.3 98 | - prompt-toolkit==3.0.19 99 | - psutil==5.8.0 100 | - ptyprocess==0.7.0 101 | - pyglet==1.5.15 102 | - pygments==2.9.0 103 | - python-dateutil==2.8.1 104 | - pyyaml==5.4.1 105 | - requests==2.25.1 106 | - sentry-sdk==1.1.0 107 | - shortuuid==1.0.1 108 | - smmap==4.0.0 109 | - subprocess32==3.5.4 110 | - theano==1.0.5 111 | - toml==0.10.2 112 | - tqdm==4.61.1 113 | - traitlets==5.0.5 114 | - typing-extensions==3.10.0.0 115 | - unrar==0.4 116 | - urllib3==1.26.6 117 | - wandb==0.10.32 118 | - 
wcwidth==0.2.5 -------------------------------------------------------------------------------- /example_sweep.yml: -------------------------------------------------------------------------------- 1 | program: ddqn_main.py 2 | method: grid 3 | metric: 4 | goal: maximize 5 | name: final_score 6 | parameters: 7 | env_name: 8 | values: ['CartPole-v1'] #'CartPole-v1', 'Acrobot-v1' 9 | agent_type: 10 | values: ['delayed'] 11 | use_learned_forward_model: 12 | values: [False, True] 13 | delay_value: 14 | values: [0, 5, 15, 25] 15 | physical_noise_std_ratio: 16 | values: [0.1] 17 | seed: 18 | values: [1, 2, 3] 19 | use_reward_shaping: 20 | values: [True] 21 | epsilon_decay: 22 | values: [0.999] # 0.9999 for acrobot 23 | epsilon_min: 24 | values: [0.001] 25 | learning_rate: 26 | values: [0.005] 27 | double_q: 28 | values: [True] 29 | target_network_update_freq: 30 | values: [300] 31 | total_steps: 32 | values: [250000] 33 | 34 | -------------------------------------------------------------------------------- /gym_modifications/acrobot.py: -------------------------------------------------------------------------------- 1 | """classic Acrobot task""" 2 | import numpy as np 3 | from numpy import sin, cos, pi 4 | 5 | from gym import core, spaces 6 | from gym.utils import seeding 7 | 8 | __copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" 9 | __credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann", 10 | "William Dabney", "Jonathan P. How"] 11 | __license__ = "BSD 3-Clause" 12 | __author__ = "Christoph Dann " 13 | 14 | # SOURCE: 15 | # https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py 16 | 17 | 18 | class AcrobotEnv(core.Env): 19 | 20 | """ 21 | Acrobot is a 2-link pendulum with only the second joint actuated. 22 | Initially, both links point downwards. The goal is to swing the 23 | end-effector at a height at least the length of one link above the base. 24 | Both links can swing freely and can pass by each other, i.e., they don't 25 | collide when they have the same angle. 26 | **STATE:** 27 | The state consists of the sin() and cos() of the two rotational joint 28 | angles and the joint angular velocities : 29 | [cos(theta1) sin(theta1) cos(theta2) sin(theta2) thetaDot1 thetaDot2]. 30 | For the first link, an angle of 0 corresponds to the link pointing downwards. 31 | The angle of the second link is relative to the angle of the first link. 32 | An angle of 0 corresponds to having the same angle between the two links. 33 | A state of [1, 0, 1, 0, ..., ...] means that both links point downwards. 34 | **ACTIONS:** 35 | The action is either applying +1, 0 or -1 torque on the joint between 36 | the two pendulum links. 37 | .. note:: 38 | The dynamics equations were missing some terms in the NIPS paper which 39 | are present in the book. R. Sutton confirmed in personal correspondence 40 | that the experimental results shown in the paper and the book were 41 | generated with the equations shown in the book. 42 | However, there is the option to run the domain with the paper equations 43 | by setting book_or_nips = 'nips' 44 | **REFERENCE:** 45 | .. seealso:: 46 | R. Sutton: Generalization in Reinforcement Learning: 47 | Successful Examples Using Sparse Coarse Coding (NIPS 1996) 48 | .. seealso:: 49 | R. Sutton and A. G. Barto: 50 | Reinforcement learning: An introduction. 51 | Cambridge: MIT press, 1998. 52 | .. 
warning:: 53 | This version of the domain uses the Runge-Kutta method for integrating 54 | the system dynamics and is more realistic, but also considerably harder 55 | than the original version which employs Euler integration, 56 | see the AcrobotLegacy class. 57 | """ 58 | 59 | metadata = { 60 | 'render.modes': ['human', 'rgb_array'], 61 | 'video.frames_per_second' : 15 62 | } 63 | 64 | dt = .2 65 | 66 | LINK_LENGTH_1 = 1. # [m] 67 | LINK_LENGTH_2 = 1. # [m] 68 | LINK_MASS_1 = 1. #: [kg] mass of link 1 69 | LINK_MASS_2 = 1. #: [kg] mass of link 2 70 | LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 71 | LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 72 | LINK_MOI = 1. #: moments of inertia for both links 73 | 74 | MAX_VEL_1 = 4 * pi 75 | MAX_VEL_2 = 9 * pi 76 | 77 | AVAIL_TORQUE = [-1., 0., +1] 78 | 79 | torque_noise_max = 0. 80 | 81 | #: use dynamics equations from the nips paper or the book 82 | book_or_nips = "book" 83 | action_arrow = None 84 | domain_fig = None 85 | actions_num = 3 86 | 87 | def __init__(self, physical_noise_std_ratio=0): 88 | self.viewer = None 89 | high = np.array([1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32) 90 | low = -high 91 | self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32) 92 | self.action_space = spaces.Discrete(3) 93 | self.state = None 94 | self.seed() 95 | #external edit: add normal noise to the parameters, with std proportional to the original values 96 | self.physical_noise_std_ratio = physical_noise_std_ratio 97 | 98 | def seed(self, seed=None): 99 | self.np_random, seed = seeding.np_random(seed) 100 | return [seed] 101 | 102 | def reset(self): 103 | self.state = self.np_random.uniform(low=-0.1, high=0.1, size=(4,)) 104 | return self._get_ob() 105 | 106 | def step(self, a): 107 | s = self.state 108 | torque = self.AVAIL_TORQUE[a] 109 | 110 | # Add noise to the force action 111 | if self.torque_noise_max > 0: 112 | torque += self.np_random.uniform(-self.torque_noise_max, self.torque_noise_max) 113 | 114 | # Now, augment the state with our force action so it can be passed to 115 | # _dsdt 116 | s_augmented = np.append(s, torque) 117 | 118 | ns = rk4(self._dsdt, s_augmented, [0, self.dt]) 119 | # only care about final timestep of integration returned by integrator 120 | ns = ns[-1] 121 | ns = ns[:4] # omit action 122 | # ODEINT IS TOO SLOW! 123 | # ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, [0, self.dt]) 124 | # self.s_continuous = ns_continuous[-1] # We only care about the state 125 | # at the ''final timestep'', self.dt 126 | 127 | ns[0] = wrap(ns[0], -pi, pi) 128 | ns[1] = wrap(ns[1], -pi, pi) 129 | ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1) 130 | ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2) 131 | self.state = ns 132 | terminal = self._terminal() 133 | reward = -1. if not terminal else 0. 134 | return (self._get_ob(), reward, terminal, {}) 135 | 136 | def _get_ob(self): 137 | s = self.state 138 | return np.array([cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]]) 139 | 140 | def _terminal(self): 141 | s = self.state 142 | return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.) 
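# Illustrative usage sketch (not part of the original gym file): once this file
# replaces gym/envs/classic_control/acrobot.py, the noisy Acrobot variant can be
# constructed directly. The 0.1 noise ratio below mirrors physical_noise_std_ratio
# in example_sweep.yml and is otherwise an arbitrary choice:
#   env = AcrobotEnv(physical_noise_std_ratio=0.1)
#   obs = env.reset()
#   obs, reward, done, info = env.step(env.action_space.sample())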
143 | 144 | def _perturb_param(self, param): 145 | return param + np.random.normal(0, param * self.physical_noise_std_ratio) 146 | 147 | def _dsdt(self, s_augmented, t): 148 | # m1 = self.LINK_MASS_1 149 | m1 = self._perturb_param(self.LINK_MASS_1) 150 | # m2 = self.LINK_MASS_2 151 | m2 = self._perturb_param(self.LINK_MASS_2) 152 | # l1 = self.LINK_LENGTH_1 153 | l1 = self._perturb_param(self.LINK_LENGTH_1) 154 | 155 | lc1 = self.LINK_COM_POS_1 156 | lc2 = self.LINK_COM_POS_2 157 | I1 = self.LINK_MOI 158 | I2 = self.LINK_MOI 159 | 160 | g = 9.8 161 | a = s_augmented[-1] 162 | s = s_augmented[:-1] 163 | theta1 = s[0] 164 | theta2 = s[1] 165 | dtheta1 = s[2] 166 | dtheta2 = s[3] 167 | d1 = m1 * lc1 ** 2 + m2 * \ 168 | (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2 169 | d2 = m2 * (lc2 ** 2 + l1 * lc2 * cos(theta2)) + I2 170 | phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.) 171 | phi1 = - m2 * l1 * lc2 * dtheta2 ** 2 * sin(theta2) \ 172 | - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) \ 173 | + (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2) + phi2 174 | if self.book_or_nips == "nips": 175 | # the following line is consistent with the description in the 176 | # paper 177 | ddtheta2 = (a + d2 / d1 * phi1 - phi2) / \ 178 | (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) 179 | else: 180 | # the following line is consistent with the java implementation and the 181 | # book 182 | ddtheta2 = (a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * sin(theta2) - phi2) \ 183 | / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) 184 | ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 185 | return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.) 186 | 187 | def render(self, mode='human'): 188 | from gym.envs.classic_control import rendering 189 | 190 | s = self.state 191 | 192 | if self.viewer is None: 193 | self.viewer = rendering.Viewer(500,500) 194 | bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2 # 2.2 for default 195 | self.viewer.set_bounds(-bound,bound,-bound,bound) 196 | 197 | if s is None: return None 198 | 199 | p1 = [-self.LINK_LENGTH_1 * 200 | cos(s[0]), self.LINK_LENGTH_1 * sin(s[0])] 201 | 202 | p2 = [p1[0] - self.LINK_LENGTH_2 * cos(s[0] + s[1]), 203 | p1[1] + self.LINK_LENGTH_2 * sin(s[0] + s[1])] 204 | 205 | xys = np.array([[0,0], p1, p2])[:,::-1] 206 | thetas = [s[0]- pi/2, s[0]+s[1]-pi/2] 207 | link_lengths = [self.LINK_LENGTH_1, self.LINK_LENGTH_2] 208 | 209 | self.viewer.draw_line((-2.2, 1), (2.2, 1)) 210 | for ((x,y),th,llen) in zip(xys, thetas, link_lengths): 211 | l,r,t,b = 0, llen, .1, -.1 212 | jtransform = rendering.Transform(rotation=th, translation=(x,y)) 213 | link = self.viewer.draw_polygon([(l,b), (l,t), (r,t), (r,b)]) 214 | link.add_attr(jtransform) 215 | link.set_color(0,.8, .8) 216 | circ = self.viewer.draw_circle(.1) 217 | circ.set_color(.8, .8, 0) 218 | circ.add_attr(jtransform) 219 | 220 | return self.viewer.render(return_rgb_array = mode=='rgb_array') 221 | 222 | def close(self): 223 | if self.viewer: 224 | self.viewer.close() 225 | self.viewer = None 226 | 227 | def wrap(x, m, M): 228 | """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which 229 | truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n 230 | For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 
231 | 232 | Args: 233 | x: a scalar 234 | m: minimum possible value in range 235 | M: maximum possible value in range 236 | 237 | Returns: 238 | x: a scalar, wrapped 239 | """ 240 | diff = M - m 241 | while x > M: 242 | x = x - diff 243 | while x < m: 244 | x = x + diff 245 | return x 246 | 247 | def bound(x, m, M=None): 248 | """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* 249 | have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1]. 250 | 251 | Args: 252 | x: scalar 253 | 254 | Returns: 255 | x: scalar, bound between min (m) and Max (M) 256 | """ 257 | if M is None: 258 | M = m[1] 259 | m = m[0] 260 | # bound x between min (m) and Max (M) 261 | return min(max(x, m), M) 262 | 263 | 264 | def rk4(derivs, y0, t, *args, **kwargs): 265 | """ 266 | Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta. 267 | This is a toy implementation which may be useful if you find 268 | yourself stranded on a system w/o scipy. Otherwise use 269 | :func:`scipy.integrate`. 270 | 271 | Args: 272 | derivs: the derivative of the system and has the signature ``dy = derivs(yi, ti)`` 273 | y0: initial state vector 274 | t: sample times 275 | args: additional arguments passed to the derivative function 276 | kwargs: additional keyword arguments passed to the derivative function 277 | 278 | Example 1 :: 279 | ## 2D system 280 | def derivs6(x,t): 281 | d1 = x[0] + 2*x[1] 282 | d2 = -3*x[0] + 4*x[1] 283 | return (d1, d2) 284 | dt = 0.0005 285 | t = arange(0.0, 2.0, dt) 286 | y0 = (1,2) 287 | yout = rk4(derivs6, y0, t) 288 | Example 2:: 289 | ## 1D system 290 | alpha = 2 291 | def derivs(x,t): 292 | return -alpha*x + exp(-t) 293 | y0 = 1 294 | yout = rk4(derivs, y0, t) 295 | If you have access to scipy, you should probably be using the 296 | scipy.integrate tools rather than this function. 297 | 298 | Returns: 299 | yout: Runge-Kutta approximation of the ODE 300 | """ 301 | 302 | try: 303 | Ny = len(y0) 304 | except TypeError: 305 | yout = np.zeros((len(t),), np.float_) 306 | else: 307 | yout = np.zeros((len(t), Ny), np.float_) 308 | 309 | yout[0] = y0 310 | 311 | 312 | for i in np.arange(len(t) - 1): 313 | 314 | thist = t[i] 315 | dt = t[i + 1] - thist 316 | dt2 = dt / 2.0 317 | y0 = yout[i] 318 | 319 | k1 = np.asarray(derivs(y0, thist, *args, **kwargs)) 320 | k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs)) 321 | k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs)) 322 | k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs)) 323 | yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) 324 | return yout 325 | -------------------------------------------------------------------------------- /gym_modifications/cartpole.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classic cart-pole system implemented by Rich Sutton et al. 3 | Copied from http://incompleteideas.net/sutton/book/code/pole.c 4 | permalink: https://perma.cc/C9ZM-652R 5 | """ 6 | 7 | import math 8 | import gym 9 | from gym import spaces, logger 10 | from gym.utils import seeding 11 | import numpy as np 12 | 13 | 14 | class CartPoleEnv(gym.Env): 15 | """ 16 | Description: 17 | A pole is attached by an un-actuated joint to a cart, which moves along 18 | a frictionless track. The pendulum starts upright, and the goal is to 19 | prevent it from falling over by increasing and reducing the cart's 20 | velocity. 
21 | 22 | Source: 23 | This environment corresponds to the version of the cart-pole problem 24 | described by Barto, Sutton, and Anderson 25 | 26 | Observation: 27 | Type: Box(4) 28 | Num Observation Min Max 29 | 0 Cart Position -4.8 4.8 30 | 1 Cart Velocity -Inf Inf 31 | 2 Pole Angle -24 deg 24 deg 32 | 3 Pole Velocity At Tip -Inf Inf 33 | 34 | Actions: 35 | Type: Discrete(2) 36 | Num Action 37 | 0 Push cart to the left 38 | 1 Push cart to the right 39 | 40 | Note: The amount the velocity that is reduced or increased is not 41 | fixed; it depends on the angle the pole is pointing. This is because 42 | the center of gravity of the pole increases the amount of energy needed 43 | to move the cart underneath it 44 | 45 | Reward: 46 | Reward is 1 for every step taken, including the termination step 47 | 48 | Starting State: 49 | All observations are assigned a uniform random value in [-0.05..0.05] 50 | 51 | Episode Termination: 52 | Pole Angle is more than 12 degrees. 53 | Cart Position is more than 2.4 (center of the cart reaches the edge of 54 | the display). 55 | Episode length is greater than 200. 56 | Solved Requirements: 57 | Considered solved when the average reward is greater than or equal to 58 | 195.0 over 100 consecutive trials. 59 | """ 60 | 61 | metadata = { 62 | 'render.modes': ['human', 'rgb_array'], 63 | 'video.frames_per_second': 50 64 | } 65 | 66 | def __init__(self, physical_noise_std_ratio=0): 67 | self.gravity = 9.8 68 | self.masscart = 1.0 69 | self.masspole = 0.1 70 | self.total_mass = (self.masspole + self.masscart) 71 | self.length = 0.5 # actually half the pole's length 72 | self.polemass_length = (self.masspole * self.length) 73 | self.force_mag = 10.0 74 | self.tau = 0.02 # seconds between state updates 75 | self.kinematics_integrator = 'euler' 76 | 77 | # external edit: add normal noise to the parameters, with std proportional to the original values 78 | self.physical_noise_std_ratio = physical_noise_std_ratio 79 | 80 | # Angle at which to fail the episode 81 | self.theta_threshold_radians = 12 * 2 * math.pi / 360 82 | self.x_threshold = 2.4 83 | 84 | # Angle limit set to 2 * theta_threshold_radians so failing observation 85 | # is still within bounds. 
86 | high = np.array([self.x_threshold * 2, 87 | np.finfo(np.float32).max, 88 | self.theta_threshold_radians * 2, 89 | np.finfo(np.float32).max], 90 | dtype=np.float32) 91 | 92 | self.action_space = spaces.Discrete(2) 93 | self.observation_space = spaces.Box(-high, high, dtype=np.float32) 94 | 95 | self.seed() 96 | self.viewer = None 97 | self.state = None 98 | 99 | self.steps_beyond_done = None 100 | 101 | def seed(self, seed=None): 102 | self.np_random, seed = seeding.np_random(seed) 103 | return [seed] 104 | 105 | def _perturb_param(self, param): 106 | return param + np.random.normal(0, param * self.physical_noise_std_ratio) 107 | 108 | def step(self, action): 109 | masspole = self._perturb_param(self.masspole) 110 | masscart = self._perturb_param(self.masscart) 111 | total_mass = (masspole + masscart) 112 | length = self._perturb_param(self.length) 113 | 114 | err_msg = "%r (%s) invalid" % (action, type(action)) 115 | assert self.action_space.contains(action), err_msg 116 | 117 | x, x_dot, theta, theta_dot = self.state 118 | force = self.force_mag if action == 1 else -self.force_mag 119 | costheta = math.cos(theta) 120 | sintheta = math.sin(theta) 121 | 122 | # For the interested reader: 123 | # https://coneural.org/florian/papers/05_cart_pole.pdf 124 | temp = (force + self.polemass_length * theta_dot ** 2 * sintheta) / total_mass 125 | thetaacc = (self.gravity * sintheta - costheta * temp) / ( 126 | length * (4.0 / 3.0 - masspole * costheta ** 2 / total_mass)) 127 | xacc = temp - self.polemass_length * thetaacc * costheta / total_mass 128 | 129 | if self.kinematics_integrator == 'euler': 130 | x = x + self.tau * x_dot 131 | x_dot = x_dot + self.tau * xacc 132 | theta = theta + self.tau * theta_dot 133 | theta_dot = theta_dot + self.tau * thetaacc 134 | else: # semi-implicit euler 135 | x_dot = x_dot + self.tau * xacc 136 | x = x + self.tau * x_dot 137 | theta_dot = theta_dot + self.tau * thetaacc 138 | theta = theta + self.tau * theta_dot 139 | 140 | self.state = (x, x_dot, theta, theta_dot) 141 | 142 | done = bool( 143 | x < -self.x_threshold 144 | or x > self.x_threshold 145 | or theta < -self.theta_threshold_radians 146 | or theta > self.theta_threshold_radians 147 | ) 148 | 149 | if not done: 150 | reward = 1.0 151 | elif self.steps_beyond_done is None: 152 | # Pole just fell! 153 | self.steps_beyond_done = 0 154 | reward = 1.0 155 | else: 156 | if self.steps_beyond_done == 0: 157 | logger.warn( 158 | "You are calling 'step()' even though this " 159 | "environment has already returned done = True. You " 160 | "should always call 'reset()' once you receive 'done = " 161 | "True' -- any further steps are undefined behavior." 
162 | ) 163 | self.steps_beyond_done += 1 164 | reward = 0.0 165 | 166 | return np.array(self.state), reward, done, {} 167 | 168 | def reset(self): 169 | self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,)) 170 | self.steps_beyond_done = None 171 | return np.array(self.state) 172 | 173 | def render(self, mode='human'): 174 | screen_width = 600 175 | screen_height = 400 176 | 177 | world_width = self.x_threshold * 2 178 | scale = screen_width / world_width 179 | carty = 100 # TOP OF CART 180 | polewidth = 10.0 181 | polelen = scale * (2 * self.length) 182 | cartwidth = 50.0 183 | cartheight = 30.0 184 | 185 | if self.viewer is None: 186 | from gym.envs.classic_control import rendering 187 | self.viewer = rendering.Viewer(screen_width, screen_height) 188 | l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2 189 | axleoffset = cartheight / 4.0 190 | cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 191 | self.carttrans = rendering.Transform() 192 | cart.add_attr(self.carttrans) 193 | self.viewer.add_geom(cart) 194 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2 195 | pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 196 | pole.set_color(.8, .6, .4) 197 | self.poletrans = rendering.Transform(translation=(0, axleoffset)) 198 | pole.add_attr(self.poletrans) 199 | pole.add_attr(self.carttrans) 200 | self.viewer.add_geom(pole) 201 | self.axle = rendering.make_circle(polewidth / 2) 202 | self.axle.add_attr(self.poletrans) 203 | self.axle.add_attr(self.carttrans) 204 | self.axle.set_color(.5, .5, .8) 205 | self.viewer.add_geom(self.axle) 206 | self.track = rendering.Line((0, carty), (screen_width, carty)) 207 | self.track.set_color(0, 0, 0) 208 | self.viewer.add_geom(self.track) 209 | 210 | self._pole_geom = pole 211 | 212 | if self.state is None: 213 | return None 214 | 215 | # Edit the pole polygon vertex 216 | pole = self._pole_geom 217 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2 218 | pole.v = [(l, b), (l, t), (r, t), (r, b)] 219 | 220 | x = self.state 221 | cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART 222 | self.carttrans.set_translation(cartx, carty) 223 | self.poletrans.set_rotation(-x[2]) 224 | 225 | return self.viewer.render(return_rgb_array=mode == 'rgb_array') 226 | 227 | def close(self): 228 | if self.viewer: 229 | self.viewer.close() 230 | self.viewer = None 231 | -------------------------------------------------------------------------------- /init_main.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | import gym 3 | from delayed_env import DelayedEnv 4 | import wandb 5 | import warnings 6 | 7 | # from keras import backend as K 8 | from tensorflow.python.keras import backend as K 9 | 10 | import tensorflow as tf 11 | 12 | 13 | config = tf.compat.v1.ConfigProto(allow_soft_placement=True) 14 | config.gpu_options.per_process_gpu_memory_fraction = 0.9 15 | config.gpu_options.allow_growth = True 16 | sess = tf.compat.v1.Session(config=config) 17 | K.set_session(sess) 18 | 19 | 20 | def init_main(): 21 | hyperparameter_defaults = dict( 22 | is_delayed_agent=False, 23 | double_q=True, 24 | delay_value=5, 25 | epsilon_decay=0.999, # Cartpole: 0.999, Acrobot: 0.9999, MountainCar: 0.99999 26 | epsilon_min=0.001, #0.001 27 | learning_rate=0.005, # Cartpole & Acrobot: 0.005, #mountainCar: 0.0001 28 | seed=1, 29 | epsilon=1.0, 30 | use_m_step_reward=False, 
31 | use_latest_reward=False, 32 | use_reward_shaping=True, 33 | physical_noise_std_ratio=0.1, # default: 0.1 34 | env_name='CartPole-v1', #'CartPole-v1', 'Acrobot-v1', 'MountainCar-v0' 35 | train_freq=1, 36 | target_network_update_freq=300, 37 | use_learned_forward_model=True, 38 | agent_type='delayed', #'delayed', 'augmented', 'oblivious' 39 | # total_steps=3000, replaced with a delay-dependent function 40 | ) 41 | # Pass your defaults to wandb.init 42 | wandb.init(config=hyperparameter_defaults) 43 | config = wandb.config 44 | if 'CartPole' in config.env_name or 'Acrobot' in config.env_name: 45 | try: 46 | orig_env = gym.make(config.env_name, physical_noise_std_ratio=config.physical_noise_std_ratio) 47 | except TypeError as e: 48 | warnings.warn('{} gym env has not been modified as needed to support added noise. See README.md for ' 49 | 'instructions.\nRunning original noiseless version instead.'.format(config.env_name)) 50 | orig_env = gym.make(config.env_name) 51 | else: 52 | orig_env = gym.make(config.env_name) 53 | # orig_env = DiscretizeActions(orig_env) # for mujoco envs 54 | delayed_env = DelayedEnv(orig_env, config.delay_value) 55 | state_size = orig_env.observation_space.shape#[0] 56 | if not delayed_env.is_atari_env: 57 | state_size = state_size[0] 58 | action_size = orig_env.action_space.n 59 | done = False 60 | batch_size = 32 61 | return config, delayed_env, state_size, action_size, done, batch_size 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /pretrained_agents/2xcbo7mg_Acrobot-v1_ddqn_delay.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galdl/rl_delay_basic/2f88729d2339f468326cc38cd04e792617f544e7/pretrained_agents/2xcbo7mg_Acrobot-v1_ddqn_delay.h5 -------------------------------------------------------------------------------- /pretrained_agents/i06rfoxy_cartpole_ddqn_no_delay.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galdl/rl_delay_basic/2f88729d2339f468326cc38cd04e792617f544e7/pretrained_agents/i06rfoxy_cartpole_ddqn_no_delay.h5 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.18.5 2 | wandb==0.10.32 3 | tensorflow==2.2.0 4 | keras==2.4.3 5 | gym==0.18.3 6 | tqdm==4.61.1 -------------------------------------------------------------------------------- /third_party/gym.patch: -------------------------------------------------------------------------------- 1 | diff --git a/gym/envs/classic_control/acrobot.py b/gym/envs/classic_control/acrobot.py 2 | index 91321f3..afa2522 100644 3 | --- a/gym/envs/classic_control/acrobot.py 4 | +++ b/gym/envs/classic_control/acrobot.py 5 | @@ -14,6 +14,7 @@ __author__ = "Christoph Dann " 6 | # SOURCE: 7 | # https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py 8 | 9 | + 10 | class AcrobotEnv(core.Env): 11 | 12 | """ 13 | @@ -83,7 +84,7 @@ class AcrobotEnv(core.Env): 14 | domain_fig = None 15 | actions_num = 3 16 | 17 | - def __init__(self): 18 | + def __init__(self, physical_noise_std_ratio=0): 19 | self.viewer = None 20 | high = np.array([1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32) 21 | low = -high 22 | @@ -91,6 +92,8 @@ class AcrobotEnv(core.Env): 23 | self.action_space = spaces.Discrete(3) 24 | self.state = None 25 | self.seed() 26 | + #external 
edit: add normal noise to the parameters, with std proportional to the original values 27 | + self.physical_noise_std_ratio = physical_noise_std_ratio 28 | 29 | def seed(self, seed=None): 30 | self.np_random, seed = seeding.np_random(seed) 31 | @@ -138,14 +141,22 @@ class AcrobotEnv(core.Env): 32 | s = self.state 33 | return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.) 34 | 35 | + def _perturb_param(self, param): 36 | + return param + np.random.normal(0, param * self.physical_noise_std_ratio) 37 | + 38 | def _dsdt(self, s_augmented, t): 39 | - m1 = self.LINK_MASS_1 40 | - m2 = self.LINK_MASS_2 41 | - l1 = self.LINK_LENGTH_1 42 | + # m1 = self.LINK_MASS_1 43 | + m1 = self._perturb_param(self.LINK_MASS_1) 44 | + # m2 = self.LINK_MASS_2 45 | + m2 = self._perturb_param(self.LINK_MASS_2) 46 | + # l1 = self.LINK_LENGTH_1 47 | + l1 = self._perturb_param(self.LINK_LENGTH_1) 48 | + 49 | lc1 = self.LINK_COM_POS_1 50 | lc2 = self.LINK_COM_POS_2 51 | I1 = self.LINK_MOI 52 | I2 = self.LINK_MOI 53 | + 54 | g = 9.8 55 | a = s_augmented[-1] 56 | s = s_augmented[:-1] 57 | diff --git a/gym/envs/classic_control/cartpole.py b/gym/envs/classic_control/cartpole.py 58 | index 78d806b..d3c6ba2 100644 59 | --- a/gym/envs/classic_control/cartpole.py 60 | +++ b/gym/envs/classic_control/cartpole.py 61 | @@ -25,17 +25,17 @@ class CartPoleEnv(gym.Env): 62 | 63 | Observation: 64 | Type: Box(4) 65 | - Num Observation Min Max 66 | - 0 Cart Position -4.8 4.8 67 | - 1 Cart Velocity -Inf Inf 68 | - 2 Pole Angle -0.418 rad (-24 deg) 0.418 rad (24 deg) 69 | - 3 Pole Angular Velocity -Inf Inf 70 | + Num Observation Min Max 71 | + 0 Cart Position -4.8 4.8 72 | + 1 Cart Velocity -Inf Inf 73 | + 2 Pole Angle -24 deg 24 deg 74 | + 3 Pole Velocity At Tip -Inf Inf 75 | 76 | Actions: 77 | Type: Discrete(2) 78 | - Num Action 79 | - 0 Push cart to the left 80 | - 1 Push cart to the right 81 | + Num Action 82 | + 0 Push cart to the left 83 | + 1 Push cart to the right 84 | 85 | Note: The amount the velocity that is reduced or increased is not 86 | fixed; it depends on the angle the pole is pointing. This is because 87 | @@ -54,7 +54,7 @@ class CartPoleEnv(gym.Env): 88 | the display). 89 | Episode length is greater than 200. 90 | Solved Requirements: 91 | - Considered solved when the average return is greater than or equal to 92 | + Considered solved when the average reward is greater than or equal to 93 | 195.0 over 100 consecutive trials. 
94 | """ 95 | 96 | @@ -63,7 +63,7 @@ class CartPoleEnv(gym.Env): 97 | 'video.frames_per_second': 50 98 | } 99 | 100 | - def __init__(self): 101 | + def __init__(self, physical_noise_std_ratio=0): 102 | self.gravity = 9.8 103 | self.masscart = 1.0 104 | self.masspole = 0.1 105 | @@ -74,6 +74,9 @@ class CartPoleEnv(gym.Env): 106 | self.tau = 0.02 # seconds between state updates 107 | self.kinematics_integrator = 'euler' 108 | 109 | + # external edit: add normal noise to the parameters, with std proportional to the original values 110 | + self.physical_noise_std_ratio = physical_noise_std_ratio 111 | + 112 | # Angle at which to fail the episode 113 | self.theta_threshold_radians = 12 * 2 * math.pi / 360 114 | self.x_threshold = 2.4 115 | @@ -99,7 +102,15 @@ class CartPoleEnv(gym.Env): 116 | self.np_random, seed = seeding.np_random(seed) 117 | return [seed] 118 | 119 | + def _perturb_param(self, param): 120 | + return param + np.random.normal(0, param * self.physical_noise_std_ratio) 121 | + 122 | def step(self, action): 123 | + masspole = self._perturb_param(self.masspole) 124 | + masscart = self._perturb_param(self.masscart) 125 | + total_mass = (masspole + masscart) 126 | + length = self._perturb_param(self.length) 127 | + 128 | err_msg = "%r (%s) invalid" % (action, type(action)) 129 | assert self.action_space.contains(action), err_msg 130 | 131 | @@ -110,9 +121,10 @@ class CartPoleEnv(gym.Env): 132 | 133 | # For the interested reader: 134 | # https://coneural.org/florian/papers/05_cart_pole.pdf 135 | - temp = (force + self.polemass_length * theta_dot ** 2 * sintheta) / self.total_mass 136 | - thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta ** 2 / self.total_mass)) 137 | - xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass 138 | + temp = (force + self.polemass_length * theta_dot ** 2 * sintheta) / total_mass 139 | + thetaacc = (self.gravity * sintheta - costheta * temp) / ( 140 | + length * (4.0 / 3.0 - masspole * costheta ** 2 / total_mass)) 141 | + xacc = temp - self.polemass_length * thetaacc * costheta / total_mass 142 | 143 | if self.kinematics_integrator == 'euler': 144 | x = x + self.tau * x_dot 145 | @@ -163,7 +175,7 @@ class CartPoleEnv(gym.Env): 146 | screen_height = 400 147 | 148 | world_width = self.x_threshold * 2 149 | - scale = screen_width/world_width 150 | + scale = screen_width / world_width 151 | carty = 100 # TOP OF CART 152 | polewidth = 10.0 153 | polelen = scale * (2 * self.length) 154 | @@ -186,7 +198,7 @@ class CartPoleEnv(gym.Env): 155 | pole.add_attr(self.poletrans) 156 | pole.add_attr(self.carttrans) 157 | self.viewer.add_geom(pole) 158 | - self.axle = rendering.make_circle(polewidth/2) 159 | + self.axle = rendering.make_circle(polewidth / 2) 160 | self.axle.add_attr(self.poletrans) 161 | self.axle.add_attr(self.carttrans) 162 | self.axle.set_color(.5, .5, .8) 163 | --------------------------------------------------------------------------------