├── .gitmodules ├── LICENSE ├── NVIDIA_CLA_v1.0.1.pdf ├── README.md ├── ddqn_main.py ├── delayed_env.py ├── delayed_q_diagram.png ├── dqn_agents.py ├── environment.yml ├── example_sweep.yml ├── gym_modifications ├── acrobot.py └── cartpole.py ├── init_main.py ├── pretrained_agents ├── 2xcbo7mg_Acrobot-v1_ddqn_delay.h5 └── i06rfoxy_cartpole_ddqn_no_delay.h5 ├── requirements.txt └── third_party └── gym.patch /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "third_party/gym"] 2 | path = third_party/gym 3 | url = git@github.com:openai/gym.git 4 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2020 Gal Dalal 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /NVIDIA_CLA_v1.0.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galdl/rl_delay_basic/2f88729d2339f468326cc38cd04e792617f544e7/NVIDIA_CLA_v1.0.1.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Acting in Delayed Environments with Non-Stationary Markov Policies 2 | This repository contains the implementation of the Delayed, Augmented, and Oblivious agents from the paper: 3 | "[Acting in Delayed Environments with Non-Stationary Markov Policies](https://arxiv.org/pdf/2101.11992)", Esther Derman\*, Gal Dalal\*, Shie Mannor (*equal contribution), published in ICLR 2021. 4 | 5 | The agent here supports the Cartpole and Acrobot environments from OpenAI Gym. The Atari-supported agent can be found [here](https://github.com/galdl/rl_delay_atari). 6 | 7 | 8 | 9 | 10 | ## Installation 11 | 1. Tested with Python 3.7; a Conda virtual env is encouraged. Other Python versions and/or environments should also work. 12 | 2. Clone the project and cd to the project dir. 13 | 3. Create a virtual env:\ Option 1 -- Tensorflow 2.2: Run `pip install -r requirements.txt` (other versions of the packages in requirements.txt should also be fine).\ Option 2 -- Tensorflow 1.14: Run `conda env create -f environment.yml` to directly create a virtual env called `tf_14`. 16 | 4. To enable support of the noisy Cartpole and Acrobot experiments, modify the original gym cartpole.py and acrobot.py:\ Option 1 -- via pip install: 18 | ```bash 19 | cd third_party 20 | git submodule sync && git submodule update --init --recursive 21 | cd gym 22 | git apply ../gym.patch 23 | pip install -e . 24 | ``` 25 | Option 2 -- manually:\ 4a. Find the gym location in site-packages, e.g., "/home/username/anaconda3/envs/rl_delay_env/lib/python3.7/site-packages/gym/envs/classic_control/cartpole.py"\ 4b. Overwrite the above file with "rl_delay_basic/gym_modifications/cartpole.py". Repeat the same process for "rl_delay_basic/gym_modifications/acrobot.py". 28 | 29 | ## Hyperparameters: 30 | The parameters used for the experiments in the paper are the default ones appearing in init_main.py. They are the same for all types of agents (delayed, augmented, oblivious), both noisy and non-noisy, and all delay values. The only exception is epsilon_decay: 0.999 for Cartpole and 0.9999 for Acrobot. 31 | 32 | ## Wandb sweep: 33 | Using wandb, you can easily run multiple experiments for different agents, delay values, hyperparameters, etc. An example sweep file is included in the project: example_sweep.yml. A sweep can be created via "wandb sweep example_sweep.yml", and multiple workers can be started with "wandb agent your-sweep-id"; example commands are given at the end of this README. For more details see https://docs.wandb.ai/guides/sweeps/quickstart. 34 | 35 | 36 | ## Citing the Project 37 | 38 | To cite this repository in publications: 39 | 40 | ``` 41 | @article{derman2021acting, 42 | title={Acting in delayed environments with non-stationary markov policies}, 43 | author={Derman, Esther and Dalal, Gal and Mannor, Shie}, 44 | journal={International Conference on Learning Representations (ICLR)}, 45 | year={2021} 46 | } 47 | ``` 48 | 49 | Happy delaying!
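
## Example commands

A minimal sketch of the launch commands described above, not an official part of the original instructions. It assumes `wandb login` has already been run; `your-sweep-id` stands for the id printed by `wandb sweep`, and the single-run variant assumes the defaults defined in init_main.py are picked up by `wandb.init`:

```bash
# single run with the default hyperparameters from init_main.py
python ddqn_main.py

# grid sweep over the agents/delays listed in example_sweep.yml;
# the first command prints a sweep id to pass to each worker
wandb sweep example_sweep.yml
wandb agent your-sweep-id   # launch on as many machines/GPUs as desired
```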
50 | -------------------------------------------------------------------------------- /ddqn_main.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | import os 4 | import numpy as np 5 | from dqn_agents import DDQNAgent, DDQNPlanningAgent, update_loss, reshape_state 6 | from init_main import init_main 7 | import wandb 8 | from tqdm import tqdm 9 | 10 | import socket 11 | 12 | # # possible cuda fix for mac 13 | # os.environ['KMP_DUPLICATE_LIB_OK']='True' 14 | 15 | # # don't use GPU (if running, e.g., on mac) 16 | # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" 17 | # os.environ["CUDA_VISIBLE_DEVICES"] = "" 18 | 19 | AVERAGE_OVER_LAST_EP = 0.1 20 | # EPISODES = 3500 21 | SAVE_PATH = 'pretrained_agents' 22 | EP_LEN_LIMIT = int(1e4) 23 | EVAL_FREQ = 5 24 | 25 | 26 | def init_episode(delayed_env, agent, augment_state, state_size): 27 | ep_reward = 0 28 | ep_reshaped_reward = 0 29 | state = delayed_env.reset() 30 | state = massage_state(state, augment_state, delayed_env, state_size) 31 | agent.clear_action_buffer() 32 | loss_dict = {} 33 | loss_count = 0 34 | ep_step = 0 35 | return ep_reward, ep_reshaped_reward, state, loss_dict, loss_count, ep_step 36 | 37 | def routinely_save_agent(e, env_name): 38 | agent_name = env_name + '_ddqn_delay.h5' 39 | if e % 349 == 0: 40 | if not os.path.isdir(SAVE_PATH): 41 | os.makedirs(SAVE_PATH) 42 | agent_full_name = wandb.run.id + '_' + agent_name 43 | agent_path = os.path.join(SAVE_PATH, agent_full_name) 44 | agent.save(agent_path) 45 | print('saved agent to {}'.format(agent_path)) 46 | 47 | 48 | def agent_act(config, agent, state, delayed_env, eval=False): 49 | if config.agent_type == 'delayed': 50 | action = agent.act(state, pending_actions=delayed_env.get_pending_actions(), eval=eval) 51 | else: 52 | action = agent.act(state, eval) 53 | return action 54 | 55 | def massage_state(state, augment_state, delayed_env, state_size): 56 | if augment_state: 57 | state = np.concatenate((state, delayed_env.get_pending_actions())) 58 | state = reshape_state(state, delayed_env.is_atari_env, state_size) 59 | return state 60 | 61 | if __name__ == "__main__": 62 | config, delayed_env, state_size, action_size, done, batch_size = init_main() 63 | 64 | score_vec = [] 65 | # for non-atari (i.e. 
cartpole) env, run on CPU 66 | # if not delayed_env.is_atari_env: 67 | 68 | 69 | kwargs = { 70 | 'action_size': action_size, 71 | 'is_atari_env': delayed_env.is_atari_env, 72 | 'is_delayed_agent': config.is_delayed_agent, 73 | 'delay_value': config.delay_value, 74 | 'epsilon_min': config.epsilon_min, 75 | 'epsilon_decay': config.epsilon_decay, 76 | 'learning_rate': config.learning_rate, 77 | 'epsilon': config.epsilon, 78 | 'use_m_step_reward': config.use_m_step_reward, 79 | 'use_latest_reward': config.use_latest_reward 80 | } 81 | 82 | # if not config.double_q: 83 | # agent = DQNAgent(state_size=state_size, **kwargs) 84 | # else: 85 | augment_state = False 86 | # wandb.config.update({'augment_state': False}, allow_val_change=True) 87 | if config.agent_type == 'delayed': 88 | agent = DDQNPlanningAgent(state_size=state_size, env=delayed_env, 89 | use_learned_forward_model=config.use_learned_forward_model, **kwargs) 90 | else: 91 | if config.agent_type == 'augmented': 92 | # wandb.config.update({'augment_state': True}, allow_val_change=True) 93 | augment_state = True 94 | state_size += config.delay_value 95 | # third option is 'oblivious' 96 | agent = DDQNAgent(state_size=state_size, **kwargs) 97 | 98 | episode = 0 99 | ep_reward, ep_reshaped_reward, state, loss_dict, loss_count, ep_step = init_episode(delayed_env, agent, 100 | augment_state, state_size) 101 | total_steps_delay_dependent = int(100000 + config.delay_value * 10000) 102 | # eval_done = False 103 | for step_num in tqdm(range(total_steps_delay_dependent)): 104 | # if episode % EVAL_FREQ == 0: 105 | # while not eval_done: 106 | # action = agent_act(config, agent, state, delayed_env, eval=True) 107 | # next_state, eval_reward, eval_done, _ = delayed_env.step(action) 108 | # state = massage_state(next_state, config, delayed_env, state_size) 109 | # ep_reward += eval_reward 110 | # wandb.log({'reward_eval': ep_reward}, step=step_num) 111 | # episode += 1 112 | # else: 113 | # for step in range(EP_LEN_LIMIT): 114 | # delayed_env.orig_env.render() 115 | action = agent_act(config, agent, state, delayed_env, eval=False) 116 | next_state, reward, done, _ = delayed_env.step(action) 117 | ep_reward += reward 118 | if config.use_reward_shaping and not delayed_env.is_atari_env: 119 | reward = delayed_env.get_shaped_reward(next_state, reward) 120 | ep_reshaped_reward += reward 121 | next_state = massage_state(next_state, augment_state, delayed_env, state_size) 122 | can_memorize = ep_step > config.delay_value or not delayed_env.pretrained_agent_loaded 123 | if can_memorize: # otherwise, we're using expert samples initially which is unfair 124 | agent.memorize(state, action, reward, next_state, done) 125 | state = next_state 126 | if config.double_q and step_num % config.target_network_update_freq == 0: 127 | agent.update_target_model() 128 | if len(agent.memory) > batch_size and step_num % config.train_freq == 0: 129 | batch_loss_dict = agent.replay(batch_size) 130 | update_loss(loss_dict, batch_loss_dict) 131 | loss_count += 1 132 | ep_step += 1 133 | if done: 134 | routinely_save_agent(episode, config.env_name) 135 | wandb_dict = {'reward': ep_reward, 'ep_reshaped_reward': ep_reshaped_reward} 136 | if 'f_model_loss' in loss_dict: 137 | f_model_loss = loss_dict['f_model_loss'] / loss_count 138 | wandb_dict['f_model_loss'] = f_model_loss 139 | wandb.log(wandb_dict, step=step_num) 140 | score_vec.append(ep_reward) 141 | episode += 1 142 | ep_reward, ep_reshaped_reward, state, loss_dict, loss_count, ep_step = init_episode(delayed_env, agent, 
augment_state, 143 | state_size) 144 | 145 | tot_ep_num = len(score_vec) 146 | avg_over = round(tot_ep_num * AVERAGE_OVER_LAST_EP) 147 | final_avg_score = np.mean(score_vec[-avg_over:]) 148 | wandb.log({'final_score': final_avg_score}) 149 | -------------------------------------------------------------------------------- /delayed_env.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from collections import deque 4 | from dqn_agents import DQNAgent 5 | import numpy as np 6 | from dqn_agents import reshape_state 7 | from numpy import sin, cos, pi 8 | 9 | CARTPOLE_TRAINED_NON_DELAYED_AGENT_PATH = 'pretrained_agents/i06rfoxy_cartpole_ddqn_no_delay.h5' 10 | 11 | class DelayedEnv: 12 | def __init__(self, orig_env, delay_value): 13 | self.orig_env = orig_env 14 | self.env_name = str(self.orig_env) 15 | self.is_atari_env = 'AtariEnv' in self.env_name 16 | self.pending_actions = deque() 17 | self.delay_value = delay_value 18 | self.state_size = orig_env.observation_space.shape#[0] 19 | if not self.is_atari_env: 20 | self.state_size = self.state_size[0] 21 | self.action_size = orig_env.action_space.n 22 | self.stored_init_state = None 23 | self.trained_non_delayed_agent = DQNAgent(state_size=self.state_size, 24 | action_size=self.action_size, is_delayed_agent=False, 25 | delay_value=0, epsilon=0, is_atari_env=self.is_atari_env) 26 | self.pretrained_agent_loaded = False 27 | 28 | if 'CartPole' in self.env_name: # for other envs this is not necessary 29 | self.trained_non_delayed_agent.load(CARTPOLE_TRAINED_NON_DELAYED_AGENT_PATH) 30 | self.pretrained_agent_loaded = True 31 | 32 | def step(self, action): 33 | if self.delay_value > 0: 34 | self.pending_actions.append(action) 35 | if len(self.pending_actions) - 1 >= self.delay_value: 36 | executed_action = self.pending_actions.popleft() 37 | else: 38 | curr_state = reshape_state(self.get_curr_state(), self.is_atari_env, self.state_size) 39 | executed_action = self.trained_non_delayed_agent.act(curr_state) 40 | else: 41 | executed_action = action 42 | return self.orig_env.step(executed_action) 43 | 44 | def reset(self): 45 | self.pending_actions.clear() 46 | return self.orig_env.reset() 47 | 48 | def get_shaped_reward(self, state, orig_reward): 49 | reward = orig_reward 50 | if 'CartPole' in self.env_name: 51 | x, x_dot, theta, theta_dot = state 52 | r1 = (self.orig_env.x_threshold - abs(x)) / self.orig_env.x_threshold - 0.8 53 | r2 = (self.orig_env.theta_threshold_radians - abs( 54 | theta)) / self.orig_env.theta_threshold_radians - 0.5 55 | reward = r1 + r2 56 | if 'MountainCar' in self.env_name: 57 | # # Adjust reward based on car position 58 | # reward = state[0] + 0.5 59 | # # Adjust reward for task completion 60 | # if state[0] >= 0.5: 61 | # reward += 1 62 | position = state[0] 63 | reward = (position - self.orig_env.goal_position) / ((self.orig_env.max_position - self.orig_env.min_position) * 10) 64 | # print(position, self.goal_position) 65 | if position >= 0.1: 66 | reward += 10 67 | elif position >= 0.25: 68 | reward += 50 69 | elif position >= 0.5: 70 | reward += 100 71 | return reward 72 | 73 | def get_pending_actions(self): 74 | if len(self.pending_actions) == 0 and self.delay_value > 0: 75 | # reconstruct anticipated trajectory using the oracle 76 | self.store_initial_state() 77 | curr_state = self.get_curr_state() 78 | for i in range(self.delay_value): 79 | curr_state = reshape_state(curr_state, self.is_atari_env, self.state_size) 80 | estimated_action = 
self.trained_non_delayed_agent.act(curr_state) 81 | self.pending_actions.append(estimated_action) 82 | curr_state = self.get_next_state(state=None, action=estimated_action) 83 | self.restore_initial_state() 84 | 85 | return self.pending_actions 86 | 87 | def store_initial_state(self): 88 | if self.is_atari_env: 89 | self.stored_init_state = self.orig_env.clone_state() 90 | else: 91 | self.stored_init_state = self.orig_env.unwrapped.state 92 | 93 | def restore_initial_state(self): 94 | if self.is_atari_env: 95 | self.orig_env.restore_state(self.stored_init_state) 96 | else: 97 | self.orig_env.unwrapped.state = self.stored_init_state 98 | 99 | def get_curr_state(self): 100 | if self.is_atari_env: 101 | curr_state = self.orig_env.ale.getScreenRGB2() 102 | else: 103 | curr_state = self.orig_env.unwrapped.state 104 | if 'Acrobot' in self.env_name: 105 | curr_state = np.array([cos(curr_state[0]), sin(curr_state[0]), cos(curr_state[1]), sin(curr_state[1]), 106 | curr_state[2], curr_state[3]]) 107 | return curr_state 108 | 109 | def get_next_state(self, state, action): 110 | next_state, _, _, _ = self.orig_env.step(action) 111 | self.orig_env._elapsed_steps -= 1 112 | return next_state 113 | 114 | def reset_to_state(self, state): 115 | self.orig_env.unwrapped.state = state 116 | # -------------------------------------------------------------------------------- /delayed_q_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galdl/rl_delay_basic/2f88729d2339f468326cc38cd04e792617f544e7/delayed_q_diagram.png -------------------------------------------------------------------------------- /dqn_agents.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | 3 | from collections import deque 4 | from keras.models import Sequential 5 | from keras.layers import Dense, Conv2D, MaxPool2D, Flatten 6 | from copy import deepcopy 7 | import random 8 | from keras.optimizers import Adam 9 | from keras import backend as K 10 | import tensorflow as tf 11 | import numpy as np 12 | 13 | def reshape_state(state, is_atari_env, state_size): 14 | reshaped = state 15 | if not is_atari_env: 16 | reshaped = np.reshape(state, [1, state_size]) 17 | else: 18 | if len(state.shape) < 4: 19 | reshaped = np.expand_dims(state, axis=0) 20 | return reshaped 21 | 22 | def update_loss(loss, sample_loss): 23 | if loss is not None and sample_loss is not None: 24 | for key, val in sample_loss.items(): 25 | if key in loss: 26 | loss[key] += val 27 | else: 28 | loss[key] = val 29 | 30 | def concatenate_state_action(state, action): 31 | out = np.concatenate((state[0], [action])) 32 | out = np.reshape(out, [1, len(out)]) 33 | return out 34 | 35 | class DQNAgent: 36 | def __init__(self, state_size, action_size, is_atari_env, is_delayed_agent=False, delay_value=0, epsilon_min=0.001, 37 | epsilon_decay=0.999, learning_rate=0.001, epsilon=1.0, use_m_step_reward=False, use_latest_reward=True, 38 | loss='mse', **kwargs): 39 | self.state_size = state_size 40 | self.action_size = action_size 41 | self.is_atari_env = is_atari_env 42 | mem_len = 50000 if self.is_atari_env else 2000 43 | self.memory = deque(maxlen=mem_len) 44 | self.gamma = 0.95 # discount rate 45 | self.epsilon = epsilon # exploration rate 46 | self.epsilon_min = epsilon_min 47 | self.epsilon_decay = epsilon_decay #0.995 48 | self.learning_rate = learning_rate 49 | self.sample_buffer = deque() 50 | self.is_delayed_agent = is_delayed_agent 
= is_delayed_agent 51 | self.delay_value = delay_value 52 | self.model = self._build_model(loss=loss) 53 | self.use_m_step_reward = use_m_step_reward 54 | self.use_latest_reward = use_latest_reward 55 | 56 | 57 | def _huber_loss(self, y_true, y_pred, clip_delta=1.0): 58 | """Huber loss for Q Learning 59 | References: https://en.wikipedia.org/wiki/Huber_loss 60 | https://www.tensorflow.org/api_docs/python/tf/losses/huber_loss 61 | """ 62 | error = y_true - y_pred 63 | cond = K.abs(error) <= clip_delta 64 | 65 | squared_loss = 0.5 * K.square(error) 66 | quadratic_loss = 0.5 * K.square(clip_delta) + clip_delta * (K.abs(error) - clip_delta) 67 | 68 | return K.mean(tf.where(cond, squared_loss, quadratic_loss)) 69 | 70 | def _build_model(self, loss=None, input_size=None, output_size=None): 71 | loss = self._huber_loss if loss == 'huber' else loss 72 | input_size = self.state_size if input_size is None else input_size 73 | output_size = self.action_size if output_size is None else output_size 74 | 75 | # Neural Net for Deep-Q learning Model 76 | model = Sequential() 77 | if self.is_atari_env: 78 | model.add(Conv2D(32, 8, strides=(4,4), input_shape=input_size, activation='relu')) 79 | model.add(MaxPool2D()) 80 | model.add(Conv2D(64, 4, strides=(2,2), activation='relu')) 81 | model.add(MaxPool2D()) 82 | model.add(Conv2D(64, 3, strides=(1,1), activation='relu')) 83 | model.add(MaxPool2D()) 84 | model.add(Flatten()) 85 | model.add(Dense(64, activation='relu')) 86 | model.add(Dense(64, activation='relu')) 87 | model.add(Dense(output_size, activation='linear')) 88 | else: 89 | model.add(Dense(24, input_dim=input_size, activation='relu')) 90 | model.add(Dense(24, activation='relu')) 91 | model.add(Dense(output_size, activation='linear')) 92 | 93 | model.compile(loss=loss, 94 | optimizer=Adam(lr=self.learning_rate)) 95 | return model 96 | 97 | def memorize(self, state, action, reward, next_state, done): 98 | if self.is_delayed_agent: 99 | # for earlier time than delay_value, the data is problematic (non-delayed response) 100 | # Construct modified tuple by keeping old s_t with new a_{t+m}, r_{t+m} s_{t+m+1} 101 | new_tuple = (state, action, reward, next_state, done) 102 | self.sample_buffer.append(new_tuple) 103 | if len(self.sample_buffer) - 1 >= self.delay_value: 104 | old_tuple = self.sample_buffer.popleft() 105 | modified_tuple = list(deepcopy(old_tuple)) 106 | modified_tuple[1] = action 107 | modified_tuple[2] = self.m_step_reward(first_reward=old_tuple[2]) 108 | # trying to use s_{t+1} instead of s_{t+m} as in the original ICML2020 submission 109 | # modified_tuple[3] = next_state 110 | modified_tuple = tuple(modified_tuple) 111 | self.memory.append(modified_tuple) 112 | else: 113 | self.memory.append((state, action, reward, next_state, done)) 114 | 115 | def act(self, state, eval=False): 116 | if not eval and np.random.rand() <= self.epsilon: 117 | return random.randrange(self.action_size) 118 | act_values = self.model.predict(state) 119 | return np.argmax(act_values[0]) # returns action 120 | 121 | def m_step_reward(self, first_reward): 122 | if not self.use_m_step_reward: 123 | if self.use_latest_reward: 124 | return self.sample_buffer[-1][2] 125 | else: 126 | return first_reward 127 | else: 128 | discounted_rew = first_reward 129 | for i in range(self.delay_value): 130 | discounted_rew += self.gamma ** (i + 1) * self.sample_buffer[i][2] 131 | return discounted_rew 132 | 133 | def effective_gamma(self): 134 | return self.gamma if not self.use_m_step_reward else (self.gamma ** (self.delay_value + 1)) 135 | 136 |
def replay(self, batch_size): 137 | minibatch = random.sample(self.memory, batch_size) 138 | for state, action, reward, next_state, done in minibatch: 139 | target = reward 140 | if not done: 141 | target = (reward + self.effective_gamma() * 142 | np.amax(self.model.predict(next_state)[0])) 143 | target_f = self.model.predict(state) 144 | target_f[0][action] = target 145 | # self.model.fit(state, target_f, epochs=1, verbose=0, 146 | # callbacks=[WandbCallback()]) 147 | self.model.fit(state, target_f, epochs=1, verbose=0) 148 | if self.epsilon > self.epsilon_min: 149 | self.epsilon *= self.epsilon_decay 150 | 151 | 152 | def load(self, name): 153 | self.model.load_weights(name) 154 | 155 | def save(self, name): 156 | self.model.save_weights(name) 157 | 158 | def clear_action_buffer(self): 159 | self.sample_buffer.clear() 160 | 161 | 162 | class DDQNAgent(DQNAgent): 163 | def __init__(self, state_size, action_size, is_atari_env, is_delayed_agent=False, delay_value=0, epsilon_min=0.001, 164 | epsilon_decay=0.999, learning_rate=0.001, epsilon=1.0, use_m_step_reward=False, use_latest_reward=True): 165 | super().__init__(state_size, action_size, is_atari_env=is_atari_env, is_delayed_agent=is_delayed_agent, delay_value=delay_value, 166 | epsilon_min=epsilon_min, epsilon_decay=epsilon_decay, learning_rate=learning_rate, 167 | epsilon=epsilon, use_m_step_reward=use_m_step_reward, use_latest_reward=use_latest_reward, 168 | loss='huber') 169 | # self.model = self._build_model() 170 | self.target_model = self._build_model(loss='huber') 171 | self.update_target_model() 172 | 173 | 174 | def update_target_model(self): 175 | # copy weights from model to target_model 176 | self.target_model.set_weights(self.model.get_weights()) 177 | 178 | def train_model(self, batch): 179 | state_vec, action_vec, reward_vec, next_state_vec, done_vec = batch 180 | target = self.model.predict(state_vec) 181 | # a = self.model.predict(next_state)[0] 182 | t = self.target_model.predict(next_state_vec)#[0] 183 | not_done_arr = np.invert(np.asarray(done_vec)) 184 | new_targets = reward_vec + not_done_arr * self.effective_gamma() * np.amax(t, axis=1) 185 | for i in range(len(batch[0])): 186 | target[i][action_vec[i]] = new_targets[i] 187 | # target[0][action] = reward + self.gamma * t[np.argmax(a)] 188 | train_history = self.model.fit(state_vec, target, epochs=1, verbose=0) 189 | q_loss = train_history.history['loss'][0] 190 | loss_dict = {'q_loss': q_loss} 191 | return loss_dict 192 | 193 | def _create_batch(self, indices): 194 | state_vec, action_vec, reward_vec, next_state_vec, done_vec = [], [], [], [], [] 195 | for i in indices: 196 | data = self.memory[i] 197 | state, action, reward, next_state, done = data 198 | state_vec.append(np.array(state, copy=False)) 199 | action_vec.append(action) 200 | reward_vec.append(reward) 201 | next_state_vec.append(np.array(next_state, copy=False)) 202 | done_vec.append(done) 203 | return np.concatenate(state_vec, axis=0), action_vec, reward_vec, np.concatenate(next_state_vec, axis=0), done_vec 204 | 205 | def replay(self, batch_size): 206 | loss = {} 207 | indices = np.random.choice(len(self.memory), batch_size) 208 | batch = self._create_batch(indices) 209 | sample_loss = self.train_model(batch) 210 | update_loss(loss, sample_loss) 211 | if self.epsilon > self.epsilon_min: 212 | self.epsilon *= self.epsilon_decay 213 | return loss 214 | 215 | class DDQNPlanningAgent(DDQNAgent): 216 | def __init__(self, state_size, action_size, is_atari_env, is_delayed_agent=False, delay_value=0, 
epsilon_min=0.001, 217 | epsilon_decay=0.999, learning_rate=0.001, epsilon=1.0, use_m_step_reward=False, 218 | use_latest_reward=True, env=None, use_learned_forward_model=True): 219 | super().__init__(state_size, action_size, is_atari_env=is_atari_env, is_delayed_agent=is_delayed_agent, delay_value=delay_value, 220 | epsilon_min=epsilon_min, epsilon_decay=epsilon_decay, learning_rate=learning_rate, 221 | epsilon=epsilon, use_m_step_reward=use_m_step_reward, use_latest_reward=use_latest_reward) 222 | self.use_learned_forward_model = use_learned_forward_model 223 | if self.use_learned_forward_model: 224 | keras_forward_model = self._build_model(loss='mse', input_size=self.state_size + 1, output_size=self.state_size) 225 | self.forward_model = ForwardModel(keras_forward_model) 226 | else: 227 | self.forward_model = env 228 | 229 | def train_model(self, batch): 230 | loss_dict = super().train_model(batch) 231 | if self.use_learned_forward_model and self.delay_value > 0: 232 | state_vec, action_vec, _, next_state_vec, _ = batch 233 | act_t = np.asarray([action_vec]).transpose() 234 | concat_vec = np.concatenate((state_vec, act_t), axis=1) 235 | train_history = self.forward_model.keras_model.fit(concat_vec, next_state_vec, epochs=1, verbose=0) 236 | f_model_loss = train_history.history['loss'][0] 237 | loss_dict['f_model_loss'] = f_model_loss 238 | return loss_dict 239 | 240 | def act(self, state, pending_actions, eval): 241 | if not eval and np.random.rand() <= self.epsilon: 242 | return random.randrange(self.action_size) 243 | last_state = state 244 | if self.delay_value > 0: 245 | if not self.use_learned_forward_model: 246 | self.forward_model.store_initial_state() 247 | # initial_state = deepcopy(state) 248 | for curr_action in pending_actions: 249 | last_state = self.forward_model.get_next_state(state=last_state, action=curr_action) 250 | if not self.use_learned_forward_model: 251 | self.forward_model.restore_initial_state() 252 | last_state_r = reshape_state(last_state, self.is_atari_env, self.state_size) 253 | act_values = self.model.predict(last_state_r) 254 | return np.argmax(act_values[0]) # returns best action for last state 255 | 256 | def memorize(self, state, action, reward, next_state, done): 257 | # for earlier time than delay_value, the data is problematic (non-delayed response) 258 | # Construct modified tuple by keeping old s_t with new a_{t+m}, r_{t+m} s_{t+m+1} 259 | new_tuple = (state, action, reward, next_state, done) 260 | self.sample_buffer.append(new_tuple) 261 | if len(self.sample_buffer) - 1 >= self.delay_value: 262 | old_tuple = self.sample_buffer.popleft() 263 | modified_tuple = list(deepcopy(old_tuple)) 264 | # build time-coherent tuple from new tuple and old action 265 | modified_tuple[0] = state 266 | # modified_tuple[1] = action 267 | modified_tuple[2] = reward #self.m_step_reward(first_reward=old_tuple[2]) 268 | modified_tuple[3] = next_state 269 | modified_tuple = tuple(modified_tuple) 270 | self.memory.append(modified_tuple) 271 | 272 | class ForwardModel: 273 | def __init__(self, keras_model): 274 | self.keras_model = keras_model 275 | 276 | def get_next_state(self, state, action): 277 | input = concatenate_state_action(state, action) 278 | return self.keras_model.predict(input) 279 | 280 | def reset_to_state(self, state): 281 | # not necessary here. 
Only used if the forwrad_model is the actual env instance 282 | pass -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: tf_14 2 | channels: 3 | - anaconda 4 | - conda-forge 5 | - defaults 6 | dependencies: 7 | - _libgcc_mutex=0.1=main 8 | - _tflow_select=2.1.0=gpu 9 | - absl-py=0.10.0=py37_0 10 | - astor=0.8.1=py37_0 11 | - blas=1.0=mkl 12 | - c-ares=1.16.1=h7b6447c_0 13 | - ca-certificates=2021.5.25=h06a4308_1 14 | - certifi=2021.5.30=py37h06a4308_0 15 | - cudatoolkit=10.1.243=h6bb024c_0 16 | - cudnn=7.6.5=cuda10.1_0 17 | - cupti=10.1.168=0 18 | - gast=0.4.0=py_0 19 | - google-pasta=0.2.0=py_0 20 | - grpcio=1.31.0=py37hf8bcb03_0 21 | - h5py=2.10.0=py37hd6299e0_1 22 | - hdf5=1.10.6=hb1b8bf9_0 23 | - importlib-metadata=2.0.0=py_1 24 | - intel-openmp=2020.2=254 25 | - keras-applications=1.0.8=py_1 26 | - keras-preprocessing=1.1.0=py_1 27 | - ld_impl_linux-64=2.33.1=h53a641e_7 28 | - libblas=3.8.0=14_mkl 29 | - libcblas=3.8.0=14_mkl 30 | - libffi=3.3=he6710b0_2 31 | - libgcc-ng=9.1.0=hdf63c60_0 32 | - libgfortran-ng=7.3.0=hdf63c60_0 33 | - liblapack=3.8.0=14_mkl 34 | - libprotobuf=3.13.0.1=hd408876_0 35 | - libstdcxx-ng=9.1.0=hdf63c60_0 36 | - markdown=3.3.2=py37_0 37 | - mkl=2019.4=243 38 | - mkl-service=2.3.0=py37he904b0f_0 39 | - mkl_fft=1.2.0=py37h23d657b_0 40 | - mkl_random=1.1.0=py37hd6b4f25_0 41 | - ncurses=6.2=he6710b0_1 42 | - openssl=1.1.1k=h27cfd23_0 43 | - pip=21.0.1=py37h06a4308_0 44 | - protobuf=3.13.0.1=py37he6710b0_1 45 | - python=3.7.10=hdb3f193_0 46 | - python_abi=3.7=1_cp37m 47 | - readline=8.1=h27cfd23_0 48 | - scipy=1.5.2=py37h0b6359f_0 49 | - setuptools=52.0.0=py37h06a4308_0 50 | - six=1.15.0=py_0 51 | - sqlite=3.35.2=hdfb4753_0 52 | - tensorboard=1.14.0=py37hf484d3e_0 53 | - tensorflow=1.14.0=gpu_py37h74c33d7_0 54 | - tensorflow-base=1.14.0=gpu_py37he45bfe2_0 55 | - tensorflow-estimator=1.14.0=py_0 56 | - tensorflow-gpu=1.14.0=h0d30ee6_0 57 | - termcolor=1.1.0=py37_1 58 | - tk=8.6.10=hbc83047_0 59 | - unzip=6.0=h611a1e1_0 60 | - werkzeug=1.0.1=py_0 61 | - wheel=0.36.2=pyhd3eb1b0_0 62 | - wrapt=1.12.1=py37h7b6447c_1 63 | - xz=5.2.5=h7b6447c_0 64 | - zip=3.0=h1adfe0e_0 65 | - zipp=3.3.1=py_0 66 | - zlib=1.2.11=h7b6447c_3 67 | - pip: 68 | - atari-py==0.2.9 69 | - backcall==0.2.0 70 | - chardet==4.0.0 71 | - click==8.0.1 72 | - cloudpickle==1.6.0 73 | - configparser==5.0.2 74 | - cvxopt==1.2.6 75 | - decorator==5.0.9 76 | - docker-pycreds==0.4.0 77 | - gitdb==4.0.7 78 | - gitpython==3.1.18 79 | - gym==0.18.3 80 | - idna==2.10 81 | - ipdb==0.13.9 82 | - ipython==7.25.0 83 | - ipython-genutils==0.2.0 84 | - jedi==0.18.0 85 | - keras==2.3.1 86 | - matplotlib-inline==0.1.2 87 | - nashpy==0.0.20 88 | - numpy==1.20.1 89 | - nvidia-htop==1.0.2 90 | - opencv-python==4.5.2.54 91 | - opencv-python-headless==4.5.2.54 92 | - parso==0.8.2 93 | - pathtools==0.1.2 94 | - pexpect==4.8.0 95 | - pickleshare==0.7.5 96 | - pillow==8.2.0 97 | - promise==2.3 98 | - prompt-toolkit==3.0.19 99 | - psutil==5.8.0 100 | - ptyprocess==0.7.0 101 | - pyglet==1.5.15 102 | - pygments==2.9.0 103 | - python-dateutil==2.8.1 104 | - pyyaml==5.4.1 105 | - requests==2.25.1 106 | - sentry-sdk==1.1.0 107 | - shortuuid==1.0.1 108 | - smmap==4.0.0 109 | - subprocess32==3.5.4 110 | - theano==1.0.5 111 | - toml==0.10.2 112 | - tqdm==4.61.1 113 | - traitlets==5.0.5 114 | - typing-extensions==3.10.0.0 115 | - unrar==0.4 116 | - urllib3==1.26.6 117 | - wandb==0.10.32 118 | - 
wcwidth==0.2.5 -------------------------------------------------------------------------------- /example_sweep.yml: -------------------------------------------------------------------------------- 1 | program: ddqn_main.py 2 | method: grid 3 | metric: 4 | goal: maximize 5 | name: final_score 6 | parameters: 7 | env_name: 8 | values: ['CartPole-v1'] #'CartPole-v1', 'Acrobot-v1' 9 | agent_type: 10 | values: ['delayed'] 11 | use_learned_forward_model: 12 | values: [False, True] 13 | delay_value: 14 | values: [0, 5, 15, 25] 15 | physical_noise_std_ratio: 16 | values: [0.1] 17 | seed: 18 | values: [1, 2, 3] 19 | use_reward_shaping: 20 | values: [True] 21 | epsilon_decay: 22 | values: [0.999] # 0.9999 for acrobot 23 | epsilon_min: 24 | values: [0.001] 25 | learning_rate: 26 | values: [0.005] 27 | double_q: 28 | values: [True] 29 | target_network_update_freq: 30 | values: [300] 31 | total_steps: 32 | values: [250000] 33 | 34 | -------------------------------------------------------------------------------- /gym_modifications/acrobot.py: -------------------------------------------------------------------------------- 1 | """classic Acrobot task""" 2 | import numpy as np 3 | from numpy import sin, cos, pi 4 | 5 | from gym import core, spaces 6 | from gym.utils import seeding 7 | 8 | __copyright__ = "Copyright 2013, RLPy http://acl.mit.edu/RLPy" 9 | __credits__ = ["Alborz Geramifard", "Robert H. Klein", "Christoph Dann", 10 | "William Dabney", "Jonathan P. How"] 11 | __license__ = "BSD 3-Clause" 12 | __author__ = "Christoph Dann " 13 | 14 | # SOURCE: 15 | # https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py 16 | 17 | 18 | class AcrobotEnv(core.Env): 19 | 20 | """ 21 | Acrobot is a 2-link pendulum with only the second joint actuated. 22 | Initially, both links point downwards. The goal is to swing the 23 | end-effector at a height at least the length of one link above the base. 24 | Both links can swing freely and can pass by each other, i.e., they don't 25 | collide when they have the same angle. 26 | **STATE:** 27 | The state consists of the sin() and cos() of the two rotational joint 28 | angles and the joint angular velocities : 29 | [cos(theta1) sin(theta1) cos(theta2) sin(theta2) thetaDot1 thetaDot2]. 30 | For the first link, an angle of 0 corresponds to the link pointing downwards. 31 | The angle of the second link is relative to the angle of the first link. 32 | An angle of 0 corresponds to having the same angle between the two links. 33 | A state of [1, 0, 1, 0, ..., ...] means that both links point downwards. 34 | **ACTIONS:** 35 | The action is either applying +1, 0 or -1 torque on the joint between 36 | the two pendulum links. 37 | .. note:: 38 | The dynamics equations were missing some terms in the NIPS paper which 39 | are present in the book. R. Sutton confirmed in personal correspondence 40 | that the experimental results shown in the paper and the book were 41 | generated with the equations shown in the book. 42 | However, there is the option to run the domain with the paper equations 43 | by setting book_or_nips = 'nips' 44 | **REFERENCE:** 45 | .. seealso:: 46 | R. Sutton: Generalization in Reinforcement Learning: 47 | Successful Examples Using Sparse Coarse Coding (NIPS 1996) 48 | .. seealso:: 49 | R. Sutton and A. G. Barto: 50 | Reinforcement learning: An introduction. 51 | Cambridge: MIT press, 1998. 52 | .. 
warning:: 53 | This version of the domain uses the Runge-Kutta method for integrating 54 | the system dynamics and is more realistic, but also considerably harder 55 | than the original version which employs Euler integration, 56 | see the AcrobotLegacy class. 57 | """ 58 | 59 | metadata = { 60 | 'render.modes': ['human', 'rgb_array'], 61 | 'video.frames_per_second' : 15 62 | } 63 | 64 | dt = .2 65 | 66 | LINK_LENGTH_1 = 1. # [m] 67 | LINK_LENGTH_2 = 1. # [m] 68 | LINK_MASS_1 = 1. #: [kg] mass of link 1 69 | LINK_MASS_2 = 1. #: [kg] mass of link 2 70 | LINK_COM_POS_1 = 0.5 #: [m] position of the center of mass of link 1 71 | LINK_COM_POS_2 = 0.5 #: [m] position of the center of mass of link 2 72 | LINK_MOI = 1. #: moments of inertia for both links 73 | 74 | MAX_VEL_1 = 4 * pi 75 | MAX_VEL_2 = 9 * pi 76 | 77 | AVAIL_TORQUE = [-1., 0., +1] 78 | 79 | torque_noise_max = 0. 80 | 81 | #: use dynamics equations from the nips paper or the book 82 | book_or_nips = "book" 83 | action_arrow = None 84 | domain_fig = None 85 | actions_num = 3 86 | 87 | def __init__(self, physical_noise_std_ratio=0): 88 | self.viewer = None 89 | high = np.array([1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32) 90 | low = -high 91 | self.observation_space = spaces.Box(low=low, high=high, dtype=np.float32) 92 | self.action_space = spaces.Discrete(3) 93 | self.state = None 94 | self.seed() 95 | #external edit: add normal noise to the parameters, with std proportional to the original values 96 | self.physical_noise_std_ratio = physical_noise_std_ratio 97 | 98 | def seed(self, seed=None): 99 | self.np_random, seed = seeding.np_random(seed) 100 | return [seed] 101 | 102 | def reset(self): 103 | self.state = self.np_random.uniform(low=-0.1, high=0.1, size=(4,)) 104 | return self._get_ob() 105 | 106 | def step(self, a): 107 | s = self.state 108 | torque = self.AVAIL_TORQUE[a] 109 | 110 | # Add noise to the force action 111 | if self.torque_noise_max > 0: 112 | torque += self.np_random.uniform(-self.torque_noise_max, self.torque_noise_max) 113 | 114 | # Now, augment the state with our force action so it can be passed to 115 | # _dsdt 116 | s_augmented = np.append(s, torque) 117 | 118 | ns = rk4(self._dsdt, s_augmented, [0, self.dt]) 119 | # only care about final timestep of integration returned by integrator 120 | ns = ns[-1] 121 | ns = ns[:4] # omit action 122 | # ODEINT IS TOO SLOW! 123 | # ns_continuous = integrate.odeint(self._dsdt, self.s_continuous, [0, self.dt]) 124 | # self.s_continuous = ns_continuous[-1] # We only care about the state 125 | # at the ''final timestep'', self.dt 126 | 127 | ns[0] = wrap(ns[0], -pi, pi) 128 | ns[1] = wrap(ns[1], -pi, pi) 129 | ns[2] = bound(ns[2], -self.MAX_VEL_1, self.MAX_VEL_1) 130 | ns[3] = bound(ns[3], -self.MAX_VEL_2, self.MAX_VEL_2) 131 | self.state = ns 132 | terminal = self._terminal() 133 | reward = -1. if not terminal else 0. 134 | return (self._get_ob(), reward, terminal, {}) 135 | 136 | def _get_ob(self): 137 | s = self.state 138 | return np.array([cos(s[0]), sin(s[0]), cos(s[1]), sin(s[1]), s[2], s[3]]) 139 | 140 | def _terminal(self): 141 | s = self.state 142 | return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.) 
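# Illustrative usage sketch (not part of the original gym file): once this file
# replaces gym/envs/classic_control/acrobot.py, the noisy Acrobot variant can be
# constructed directly. The 0.1 noise ratio below mirrors physical_noise_std_ratio
# in example_sweep.yml and is otherwise an arbitrary choice:
#   env = AcrobotEnv(physical_noise_std_ratio=0.1)
#   obs = env.reset()
#   obs, reward, done, info = env.step(env.action_space.sample())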
143 | 144 | def _perturb_param(self, param): 145 | return param + np.random.normal(0, param * self.physical_noise_std_ratio) 146 | 147 | def _dsdt(self, s_augmented, t): 148 | # m1 = self.LINK_MASS_1 149 | m1 = self._perturb_param(self.LINK_MASS_1) 150 | # m2 = self.LINK_MASS_2 151 | m2 = self._perturb_param(self.LINK_MASS_2) 152 | # l1 = self.LINK_LENGTH_1 153 | l1 = self._perturb_param(self.LINK_LENGTH_1) 154 | 155 | lc1 = self.LINK_COM_POS_1 156 | lc2 = self.LINK_COM_POS_2 157 | I1 = self.LINK_MOI 158 | I2 = self.LINK_MOI 159 | 160 | g = 9.8 161 | a = s_augmented[-1] 162 | s = s_augmented[:-1] 163 | theta1 = s[0] 164 | theta2 = s[1] 165 | dtheta1 = s[2] 166 | dtheta2 = s[3] 167 | d1 = m1 * lc1 ** 2 + m2 * \ 168 | (l1 ** 2 + lc2 ** 2 + 2 * l1 * lc2 * cos(theta2)) + I1 + I2 169 | d2 = m2 * (lc2 ** 2 + l1 * lc2 * cos(theta2)) + I2 170 | phi2 = m2 * lc2 * g * cos(theta1 + theta2 - pi / 2.) 171 | phi1 = - m2 * l1 * lc2 * dtheta2 ** 2 * sin(theta2) \ 172 | - 2 * m2 * l1 * lc2 * dtheta2 * dtheta1 * sin(theta2) \ 173 | + (m1 * lc1 + m2 * l1) * g * cos(theta1 - pi / 2) + phi2 174 | if self.book_or_nips == "nips": 175 | # the following line is consistent with the description in the 176 | # paper 177 | ddtheta2 = (a + d2 / d1 * phi1 - phi2) / \ 178 | (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) 179 | else: 180 | # the following line is consistent with the java implementation and the 181 | # book 182 | ddtheta2 = (a + d2 / d1 * phi1 - m2 * l1 * lc2 * dtheta1 ** 2 * sin(theta2) - phi2) \ 183 | / (m2 * lc2 ** 2 + I2 - d2 ** 2 / d1) 184 | ddtheta1 = -(d2 * ddtheta2 + phi1) / d1 185 | return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.) 186 | 187 | def render(self, mode='human'): 188 | from gym.envs.classic_control import rendering 189 | 190 | s = self.state 191 | 192 | if self.viewer is None: 193 | self.viewer = rendering.Viewer(500,500) 194 | bound = self.LINK_LENGTH_1 + self.LINK_LENGTH_2 + 0.2 # 2.2 for default 195 | self.viewer.set_bounds(-bound,bound,-bound,bound) 196 | 197 | if s is None: return None 198 | 199 | p1 = [-self.LINK_LENGTH_1 * 200 | cos(s[0]), self.LINK_LENGTH_1 * sin(s[0])] 201 | 202 | p2 = [p1[0] - self.LINK_LENGTH_2 * cos(s[0] + s[1]), 203 | p1[1] + self.LINK_LENGTH_2 * sin(s[0] + s[1])] 204 | 205 | xys = np.array([[0,0], p1, p2])[:,::-1] 206 | thetas = [s[0]- pi/2, s[0]+s[1]-pi/2] 207 | link_lengths = [self.LINK_LENGTH_1, self.LINK_LENGTH_2] 208 | 209 | self.viewer.draw_line((-2.2, 1), (2.2, 1)) 210 | for ((x,y),th,llen) in zip(xys, thetas, link_lengths): 211 | l,r,t,b = 0, llen, .1, -.1 212 | jtransform = rendering.Transform(rotation=th, translation=(x,y)) 213 | link = self.viewer.draw_polygon([(l,b), (l,t), (r,t), (r,b)]) 214 | link.add_attr(jtransform) 215 | link.set_color(0,.8, .8) 216 | circ = self.viewer.draw_circle(.1) 217 | circ.set_color(.8, .8, 0) 218 | circ.add_attr(jtransform) 219 | 220 | return self.viewer.render(return_rgb_array = mode=='rgb_array') 221 | 222 | def close(self): 223 | if self.viewer: 224 | self.viewer.close() 225 | self.viewer = None 226 | 227 | def wrap(x, m, M): 228 | """Wraps ``x`` so m <= x <= M; but unlike ``bound()`` which 229 | truncates, ``wrap()`` wraps x around the coordinate system defined by m,M.\n 230 | For example, m = -180, M = 180 (degrees), x = 360 --> returns 0. 
231 | 232 | Args: 233 | x: a scalar 234 | m: minimum possible value in range 235 | M: maximum possible value in range 236 | 237 | Returns: 238 | x: a scalar, wrapped 239 | """ 240 | diff = M - m 241 | while x > M: 242 | x = x - diff 243 | while x < m: 244 | x = x + diff 245 | return x 246 | 247 | def bound(x, m, M=None): 248 | """Either have m as scalar, so bound(x,m,M) which returns m <= x <= M *OR* 249 | have m as length 2 vector, bound(x,m, ) returns m[0] <= x <= m[1]. 250 | 251 | Args: 252 | x: scalar 253 | 254 | Returns: 255 | x: scalar, bound between min (m) and Max (M) 256 | """ 257 | if M is None: 258 | M = m[1] 259 | m = m[0] 260 | # bound x between min (m) and Max (M) 261 | return min(max(x, m), M) 262 | 263 | 264 | def rk4(derivs, y0, t, *args, **kwargs): 265 | """ 266 | Integrate 1D or ND system of ODEs using 4-th order Runge-Kutta. 267 | This is a toy implementation which may be useful if you find 268 | yourself stranded on a system w/o scipy. Otherwise use 269 | :func:`scipy.integrate`. 270 | 271 | Args: 272 | derivs: the derivative of the system and has the signature ``dy = derivs(yi, ti)`` 273 | y0: initial state vector 274 | t: sample times 275 | args: additional arguments passed to the derivative function 276 | kwargs: additional keyword arguments passed to the derivative function 277 | 278 | Example 1 :: 279 | ## 2D system 280 | def derivs6(x,t): 281 | d1 = x[0] + 2*x[1] 282 | d2 = -3*x[0] + 4*x[1] 283 | return (d1, d2) 284 | dt = 0.0005 285 | t = arange(0.0, 2.0, dt) 286 | y0 = (1,2) 287 | yout = rk4(derivs6, y0, t) 288 | Example 2:: 289 | ## 1D system 290 | alpha = 2 291 | def derivs(x,t): 292 | return -alpha*x + exp(-t) 293 | y0 = 1 294 | yout = rk4(derivs, y0, t) 295 | If you have access to scipy, you should probably be using the 296 | scipy.integrate tools rather than this function. 297 | 298 | Returns: 299 | yout: Runge-Kutta approximation of the ODE 300 | """ 301 | 302 | try: 303 | Ny = len(y0) 304 | except TypeError: 305 | yout = np.zeros((len(t),), np.float_) 306 | else: 307 | yout = np.zeros((len(t), Ny), np.float_) 308 | 309 | yout[0] = y0 310 | 311 | 312 | for i in np.arange(len(t) - 1): 313 | 314 | thist = t[i] 315 | dt = t[i + 1] - thist 316 | dt2 = dt / 2.0 317 | y0 = yout[i] 318 | 319 | k1 = np.asarray(derivs(y0, thist, *args, **kwargs)) 320 | k2 = np.asarray(derivs(y0 + dt2 * k1, thist + dt2, *args, **kwargs)) 321 | k3 = np.asarray(derivs(y0 + dt2 * k2, thist + dt2, *args, **kwargs)) 322 | k4 = np.asarray(derivs(y0 + dt * k3, thist + dt, *args, **kwargs)) 323 | yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4) 324 | return yout 325 | -------------------------------------------------------------------------------- /gym_modifications/cartpole.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classic cart-pole system implemented by Rich Sutton et al. 3 | Copied from http://incompleteideas.net/sutton/book/code/pole.c 4 | permalink: https://perma.cc/C9ZM-652R 5 | """ 6 | 7 | import math 8 | import gym 9 | from gym import spaces, logger 10 | from gym.utils import seeding 11 | import numpy as np 12 | 13 | 14 | class CartPoleEnv(gym.Env): 15 | """ 16 | Description: 17 | A pole is attached by an un-actuated joint to a cart, which moves along 18 | a frictionless track. The pendulum starts upright, and the goal is to 19 | prevent it from falling over by increasing and reducing the cart's 20 | velocity. 
21 | 22 | Source: 23 | This environment corresponds to the version of the cart-pole problem 24 | described by Barto, Sutton, and Anderson 25 | 26 | Observation: 27 | Type: Box(4) 28 | Num Observation Min Max 29 | 0 Cart Position -4.8 4.8 30 | 1 Cart Velocity -Inf Inf 31 | 2 Pole Angle -24 deg 24 deg 32 | 3 Pole Velocity At Tip -Inf Inf 33 | 34 | Actions: 35 | Type: Discrete(2) 36 | Num Action 37 | 0 Push cart to the left 38 | 1 Push cart to the right 39 | 40 | Note: The amount the velocity that is reduced or increased is not 41 | fixed; it depends on the angle the pole is pointing. This is because 42 | the center of gravity of the pole increases the amount of energy needed 43 | to move the cart underneath it 44 | 45 | Reward: 46 | Reward is 1 for every step taken, including the termination step 47 | 48 | Starting State: 49 | All observations are assigned a uniform random value in [-0.05..0.05] 50 | 51 | Episode Termination: 52 | Pole Angle is more than 12 degrees. 53 | Cart Position is more than 2.4 (center of the cart reaches the edge of 54 | the display). 55 | Episode length is greater than 200. 56 | Solved Requirements: 57 | Considered solved when the average reward is greater than or equal to 58 | 195.0 over 100 consecutive trials. 59 | """ 60 | 61 | metadata = { 62 | 'render.modes': ['human', 'rgb_array'], 63 | 'video.frames_per_second': 50 64 | } 65 | 66 | def __init__(self, physical_noise_std_ratio=0): 67 | self.gravity = 9.8 68 | self.masscart = 1.0 69 | self.masspole = 0.1 70 | self.total_mass = (self.masspole + self.masscart) 71 | self.length = 0.5 # actually half the pole's length 72 | self.polemass_length = (self.masspole * self.length) 73 | self.force_mag = 10.0 74 | self.tau = 0.02 # seconds between state updates 75 | self.kinematics_integrator = 'euler' 76 | 77 | # external edit: add normal noise to the parameters, with std proportional to the original values 78 | self.physical_noise_std_ratio = physical_noise_std_ratio 79 | 80 | # Angle at which to fail the episode 81 | self.theta_threshold_radians = 12 * 2 * math.pi / 360 82 | self.x_threshold = 2.4 83 | 84 | # Angle limit set to 2 * theta_threshold_radians so failing observation 85 | # is still within bounds. 
86 | high = np.array([self.x_threshold * 2, 87 | np.finfo(np.float32).max, 88 | self.theta_threshold_radians * 2, 89 | np.finfo(np.float32).max], 90 | dtype=np.float32) 91 | 92 | self.action_space = spaces.Discrete(2) 93 | self.observation_space = spaces.Box(-high, high, dtype=np.float32) 94 | 95 | self.seed() 96 | self.viewer = None 97 | self.state = None 98 | 99 | self.steps_beyond_done = None 100 | 101 | def seed(self, seed=None): 102 | self.np_random, seed = seeding.np_random(seed) 103 | return [seed] 104 | 105 | def _perturb_param(self, param): 106 | return param + np.random.normal(0, param * self.physical_noise_std_ratio) 107 | 108 | def step(self, action): 109 | masspole = self._perturb_param(self.masspole) 110 | masscart = self._perturb_param(self.masscart) 111 | total_mass = (masspole + masscart) 112 | length = self._perturb_param(self.length) 113 | 114 | err_msg = "%r (%s) invalid" % (action, type(action)) 115 | assert self.action_space.contains(action), err_msg 116 | 117 | x, x_dot, theta, theta_dot = self.state 118 | force = self.force_mag if action == 1 else -self.force_mag 119 | costheta = math.cos(theta) 120 | sintheta = math.sin(theta) 121 | 122 | # For the interested reader: 123 | # https://coneural.org/florian/papers/05_cart_pole.pdf 124 | temp = (force + self.polemass_length * theta_dot ** 2 * sintheta) / total_mass 125 | thetaacc = (self.gravity * sintheta - costheta * temp) / ( 126 | length * (4.0 / 3.0 - masspole * costheta ** 2 / total_mass)) 127 | xacc = temp - self.polemass_length * thetaacc * costheta / total_mass 128 | 129 | if self.kinematics_integrator == 'euler': 130 | x = x + self.tau * x_dot 131 | x_dot = x_dot + self.tau * xacc 132 | theta = theta + self.tau * theta_dot 133 | theta_dot = theta_dot + self.tau * thetaacc 134 | else: # semi-implicit euler 135 | x_dot = x_dot + self.tau * xacc 136 | x = x + self.tau * x_dot 137 | theta_dot = theta_dot + self.tau * thetaacc 138 | theta = theta + self.tau * theta_dot 139 | 140 | self.state = (x, x_dot, theta, theta_dot) 141 | 142 | done = bool( 143 | x < -self.x_threshold 144 | or x > self.x_threshold 145 | or theta < -self.theta_threshold_radians 146 | or theta > self.theta_threshold_radians 147 | ) 148 | 149 | if not done: 150 | reward = 1.0 151 | elif self.steps_beyond_done is None: 152 | # Pole just fell! 153 | self.steps_beyond_done = 0 154 | reward = 1.0 155 | else: 156 | if self.steps_beyond_done == 0: 157 | logger.warn( 158 | "You are calling 'step()' even though this " 159 | "environment has already returned done = True. You " 160 | "should always call 'reset()' once you receive 'done = " 161 | "True' -- any further steps are undefined behavior." 
162 | ) 163 | self.steps_beyond_done += 1 164 | reward = 0.0 165 | 166 | return np.array(self.state), reward, done, {} 167 | 168 | def reset(self): 169 | self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4,)) 170 | self.steps_beyond_done = None 171 | return np.array(self.state) 172 | 173 | def render(self, mode='human'): 174 | screen_width = 600 175 | screen_height = 400 176 | 177 | world_width = self.x_threshold * 2 178 | scale = screen_width / world_width 179 | carty = 100 # TOP OF CART 180 | polewidth = 10.0 181 | polelen = scale * (2 * self.length) 182 | cartwidth = 50.0 183 | cartheight = 30.0 184 | 185 | if self.viewer is None: 186 | from gym.envs.classic_control import rendering 187 | self.viewer = rendering.Viewer(screen_width, screen_height) 188 | l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2 189 | axleoffset = cartheight / 4.0 190 | cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 191 | self.carttrans = rendering.Transform() 192 | cart.add_attr(self.carttrans) 193 | self.viewer.add_geom(cart) 194 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2 195 | pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) 196 | pole.set_color(.8, .6, .4) 197 | self.poletrans = rendering.Transform(translation=(0, axleoffset)) 198 | pole.add_attr(self.poletrans) 199 | pole.add_attr(self.carttrans) 200 | self.viewer.add_geom(pole) 201 | self.axle = rendering.make_circle(polewidth / 2) 202 | self.axle.add_attr(self.poletrans) 203 | self.axle.add_attr(self.carttrans) 204 | self.axle.set_color(.5, .5, .8) 205 | self.viewer.add_geom(self.axle) 206 | self.track = rendering.Line((0, carty), (screen_width, carty)) 207 | self.track.set_color(0, 0, 0) 208 | self.viewer.add_geom(self.track) 209 | 210 | self._pole_geom = pole 211 | 212 | if self.state is None: 213 | return None 214 | 215 | # Edit the pole polygon vertex 216 | pole = self._pole_geom 217 | l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2 218 | pole.v = [(l, b), (l, t), (r, t), (r, b)] 219 | 220 | x = self.state 221 | cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART 222 | self.carttrans.set_translation(cartx, carty) 223 | self.poletrans.set_rotation(-x[2]) 224 | 225 | return self.viewer.render(return_rgb_array=mode == 'rgb_array') 226 | 227 | def close(self): 228 | if self.viewer: 229 | self.viewer.close() 230 | self.viewer = None 231 | -------------------------------------------------------------------------------- /init_main.py: -------------------------------------------------------------------------------- 1 | # SPDX-License-Identifier: Apache-2.0 2 | import gym 3 | from delayed_env import DelayedEnv 4 | import wandb 5 | import warnings 6 | 7 | # from keras import backend as K 8 | from tensorflow.python.keras import backend as K 9 | 10 | import tensorflow as tf 11 | 12 | 13 | config = tf.compat.v1.ConfigProto(allow_soft_placement=True) 14 | config.gpu_options.per_process_gpu_memory_fraction = 0.9 15 | config.gpu_options.allow_growth = True 16 | sess = tf.compat.v1.Session(config=config) 17 | K.set_session(sess) 18 | 19 | 20 | def init_main(): 21 | hyperparameter_defaults = dict( 22 | is_delayed_agent=False, 23 | double_q=True, 24 | delay_value=5, 25 | epsilon_decay=0.999, # Cartpole: 0.999, Acrobot: 0.9999, MountainCar: 0.99999 26 | epsilon_min=0.001, #0.001 27 | learning_rate=0.005, # Cartpole & Acrobot: 0.005, #mountainCar: 0.0001 28 | seed=1, 29 | epsilon=1.0, 30 | use_m_step_reward=False, 
31 | use_latest_reward=False, 32 | use_reward_shaping=True, 33 | physical_noise_std_ratio=0.1, # default: 0.1 34 | env_name='CartPole-v1', #'CartPole-v1', 'Acrobot-v1', 'MountainCar-v0' 35 | train_freq=1, 36 | target_network_update_freq=300, 37 | use_learned_forward_model=True, 38 | agent_type='delayed', #'delayed', 'augmented', 'oblivious' 39 | # total_steps=3000, replaced with a delay-dependent function 40 | ) 41 | # Pass your defaults to wandb.init 42 | wandb.init(config=hyperparameter_defaults) 43 | config = wandb.config 44 | if 'CartPole' in config.env_name or 'Acrobot' in config.env_name: 45 | try: 46 | orig_env = gym.make(config.env_name, physical_noise_std_ratio=config.physical_noise_std_ratio) 47 | except TypeError as e: 48 | warnings.warn('{} gym env has not been modified as needed to support added noise. See README.md for ' 49 | 'instructions.\nRunning original noiseless version instead.'.format(config.env_name)) 50 | orig_env = gym.make(config.env_name) 51 | else: 52 | orig_env = gym.make(config.env_name) 53 | # orig_env = DiscretizeActions(orig_env) # for mujoco envs 54 | delayed_env = DelayedEnv(orig_env, config.delay_value) 55 | state_size = orig_env.observation_space.shape#[0] 56 | if not delayed_env.is_atari_env: 57 | state_size = state_size[0] 58 | action_size = orig_env.action_space.n 59 | done = False 60 | batch_size = 32 61 | return config, delayed_env, state_size, action_size, done, batch_size 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /pretrained_agents/2xcbo7mg_Acrobot-v1_ddqn_delay.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galdl/rl_delay_basic/2f88729d2339f468326cc38cd04e792617f544e7/pretrained_agents/2xcbo7mg_Acrobot-v1_ddqn_delay.h5 -------------------------------------------------------------------------------- /pretrained_agents/i06rfoxy_cartpole_ddqn_no_delay.h5: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/galdl/rl_delay_basic/2f88729d2339f468326cc38cd04e792617f544e7/pretrained_agents/i06rfoxy_cartpole_ddqn_no_delay.h5 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.18.5 2 | wandb==0.10.32 3 | tensorflow==2.2.0 4 | keras==2.4.3 5 | gym==0.18.3 6 | tqdm==4.61.1 -------------------------------------------------------------------------------- /third_party/gym.patch: -------------------------------------------------------------------------------- 1 | diff --git a/gym/envs/classic_control/acrobot.py b/gym/envs/classic_control/acrobot.py 2 | index 91321f3..afa2522 100644 3 | --- a/gym/envs/classic_control/acrobot.py 4 | +++ b/gym/envs/classic_control/acrobot.py 5 | @@ -14,6 +14,7 @@ __author__ = "Christoph Dann " 6 | # SOURCE: 7 | # https://github.com/rlpy/rlpy/blob/master/rlpy/Domains/Acrobot.py 8 | 9 | + 10 | class AcrobotEnv(core.Env): 11 | 12 | """ 13 | @@ -83,7 +84,7 @@ class AcrobotEnv(core.Env): 14 | domain_fig = None 15 | actions_num = 3 16 | 17 | - def __init__(self): 18 | + def __init__(self, physical_noise_std_ratio=0): 19 | self.viewer = None 20 | high = np.array([1.0, 1.0, 1.0, 1.0, self.MAX_VEL_1, self.MAX_VEL_2], dtype=np.float32) 21 | low = -high 22 | @@ -91,6 +92,8 @@ class AcrobotEnv(core.Env): 23 | self.action_space = spaces.Discrete(3) 24 | self.state = None 25 | self.seed() 26 | + #external 
edit: add normal noise to the parameters, with std proportional to the original values 27 | + self.physical_noise_std_ratio = physical_noise_std_ratio 28 | 29 | def seed(self, seed=None): 30 | self.np_random, seed = seeding.np_random(seed) 31 | @@ -138,14 +141,22 @@ class AcrobotEnv(core.Env): 32 | s = self.state 33 | return bool(-cos(s[0]) - cos(s[1] + s[0]) > 1.) 34 | 35 | + def _perturb_param(self, param): 36 | + return param + np.random.normal(0, param * self.physical_noise_std_ratio) 37 | + 38 | def _dsdt(self, s_augmented, t): 39 | - m1 = self.LINK_MASS_1 40 | - m2 = self.LINK_MASS_2 41 | - l1 = self.LINK_LENGTH_1 42 | + # m1 = self.LINK_MASS_1 43 | + m1 = self._perturb_param(self.LINK_MASS_1) 44 | + # m2 = self.LINK_MASS_2 45 | + m2 = self._perturb_param(self.LINK_MASS_2) 46 | + # l1 = self.LINK_LENGTH_1 47 | + l1 = self._perturb_param(self.LINK_LENGTH_1) 48 | + 49 | lc1 = self.LINK_COM_POS_1 50 | lc2 = self.LINK_COM_POS_2 51 | I1 = self.LINK_MOI 52 | I2 = self.LINK_MOI 53 | + 54 | g = 9.8 55 | a = s_augmented[-1] 56 | s = s_augmented[:-1] 57 | diff --git a/gym/envs/classic_control/cartpole.py b/gym/envs/classic_control/cartpole.py 58 | index 78d806b..d3c6ba2 100644 59 | --- a/gym/envs/classic_control/cartpole.py 60 | +++ b/gym/envs/classic_control/cartpole.py 61 | @@ -25,17 +25,17 @@ class CartPoleEnv(gym.Env): 62 | 63 | Observation: 64 | Type: Box(4) 65 | - Num Observation Min Max 66 | - 0 Cart Position -4.8 4.8 67 | - 1 Cart Velocity -Inf Inf 68 | - 2 Pole Angle -0.418 rad (-24 deg) 0.418 rad (24 deg) 69 | - 3 Pole Angular Velocity -Inf Inf 70 | + Num Observation Min Max 71 | + 0 Cart Position -4.8 4.8 72 | + 1 Cart Velocity -Inf Inf 73 | + 2 Pole Angle -24 deg 24 deg 74 | + 3 Pole Velocity At Tip -Inf Inf 75 | 76 | Actions: 77 | Type: Discrete(2) 78 | - Num Action 79 | - 0 Push cart to the left 80 | - 1 Push cart to the right 81 | + Num Action 82 | + 0 Push cart to the left 83 | + 1 Push cart to the right 84 | 85 | Note: The amount the velocity that is reduced or increased is not 86 | fixed; it depends on the angle the pole is pointing. This is because 87 | @@ -54,7 +54,7 @@ class CartPoleEnv(gym.Env): 88 | the display). 89 | Episode length is greater than 200. 90 | Solved Requirements: 91 | - Considered solved when the average return is greater than or equal to 92 | + Considered solved when the average reward is greater than or equal to 93 | 195.0 over 100 consecutive trials. 
94 | """ 95 | 96 | @@ -63,7 +63,7 @@ class CartPoleEnv(gym.Env): 97 | 'video.frames_per_second': 50 98 | } 99 | 100 | - def __init__(self): 101 | + def __init__(self, physical_noise_std_ratio=0): 102 | self.gravity = 9.8 103 | self.masscart = 1.0 104 | self.masspole = 0.1 105 | @@ -74,6 +74,9 @@ class CartPoleEnv(gym.Env): 106 | self.tau = 0.02 # seconds between state updates 107 | self.kinematics_integrator = 'euler' 108 | 109 | + # external edit: add normal noise to the parameters, with std proportional to the original values 110 | + self.physical_noise_std_ratio = physical_noise_std_ratio 111 | + 112 | # Angle at which to fail the episode 113 | self.theta_threshold_radians = 12 * 2 * math.pi / 360 114 | self.x_threshold = 2.4 115 | @@ -99,7 +102,15 @@ class CartPoleEnv(gym.Env): 116 | self.np_random, seed = seeding.np_random(seed) 117 | return [seed] 118 | 119 | + def _perturb_param(self, param): 120 | + return param + np.random.normal(0, param * self.physical_noise_std_ratio) 121 | + 122 | def step(self, action): 123 | + masspole = self._perturb_param(self.masspole) 124 | + masscart = self._perturb_param(self.masscart) 125 | + total_mass = (masspole + masscart) 126 | + length = self._perturb_param(self.length) 127 | + 128 | err_msg = "%r (%s) invalid" % (action, type(action)) 129 | assert self.action_space.contains(action), err_msg 130 | 131 | @@ -110,9 +121,10 @@ class CartPoleEnv(gym.Env): 132 | 133 | # For the interested reader: 134 | # https://coneural.org/florian/papers/05_cart_pole.pdf 135 | - temp = (force + self.polemass_length * theta_dot ** 2 * sintheta) / self.total_mass 136 | - thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta ** 2 / self.total_mass)) 137 | - xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass 138 | + temp = (force + self.polemass_length * theta_dot ** 2 * sintheta) / total_mass 139 | + thetaacc = (self.gravity * sintheta - costheta * temp) / ( 140 | + length * (4.0 / 3.0 - masspole * costheta ** 2 / total_mass)) 141 | + xacc = temp - self.polemass_length * thetaacc * costheta / total_mass 142 | 143 | if self.kinematics_integrator == 'euler': 144 | x = x + self.tau * x_dot 145 | @@ -163,7 +175,7 @@ class CartPoleEnv(gym.Env): 146 | screen_height = 400 147 | 148 | world_width = self.x_threshold * 2 149 | - scale = screen_width/world_width 150 | + scale = screen_width / world_width 151 | carty = 100 # TOP OF CART 152 | polewidth = 10.0 153 | polelen = scale * (2 * self.length) 154 | @@ -186,7 +198,7 @@ class CartPoleEnv(gym.Env): 155 | pole.add_attr(self.poletrans) 156 | pole.add_attr(self.carttrans) 157 | self.viewer.add_geom(pole) 158 | - self.axle = rendering.make_circle(polewidth/2) 159 | + self.axle = rendering.make_circle(polewidth / 2) 160 | self.axle.add_attr(self.poletrans) 161 | self.axle.add_attr(self.carttrans) 162 | self.axle.set_color(.5, .5, .8) 163 | --------------------------------------------------------------------------------