├── LICENSE ├── README.md ├── core ├── __init__.py ├── buffer.py ├── env_wrapper.py ├── genealogy.py ├── learner.py ├── mod_utils.py ├── models.py ├── neuroevolution.py ├── off_policy_algo.py ├── portfolio.py ├── runner.py └── ucb.py └── main.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DISCONTINUATION OF PROJECT # 2 | This project will no longer be maintained by Intel. 3 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 4 | Intel no longer accepts patches to this project. 
5 | ![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg) 6 | 7 | Codebase for [Collaborative Evolutionary Reinforcement Learning](https://arxiv.org/pdf/1905.00976.pdf), published in the Proceedings of the 36th International Conference on Machine Learning, Long Beach, California, PMLR 97, 2019. Copyright 2019 by the author(s). 8 | 9 | ## Guide to set up and run CERL Experiments 10 | 11 | 12 | 1. Setup Conda 13 | - Install Anaconda3 14 | - conda create -n $ENV_NAME$ python=3.6.1 15 | - source activate $ENV_NAME$ 16 | 17 | 2. Install PyTorch version 1.0 18 | - Refer to https://pytorch.org/ for instructions 19 | - conda install pytorch torchvision -c pytorch [GPU-version] 20 | 21 | 3. Install Numpy, Cython and Scipy 22 | - pip install numpy==1.15.4 23 | - pip install cython==0.29.2 24 | - pip install scipy==1.1.0 25 | 26 | 4. Install Mujoco and OpenAI_Gym 27 | - Download mjpro150 from https://www.roboti.us/index.html 28 | - Unzip mjpro150 and place it + mjkey.txt (license file) in ~/.mujoco/ (create the .mujoco dir in your home folder) 29 | - pip install -U 'mujoco-py<1.50.2,>=1.50.1' 30 | - pip install 'gym[all]' 31 | 32 | ## Code labels 33 | 34 | main.py: Main script that runs everything 35 | 36 | core/runner.py: Rollout worker 37 | 38 | core/ucb.py: Upper Confidence Bound implemented for learner selection by the resource-manager 39 | 40 | core/portfolio.py: Portfolio of learners which can vary in their hyperparameters 41 | 42 | core/learner.py: Learner agent encapsulating the algo and summary statistics 43 | 44 | core/buffer.py: Cyclic Replay buffer 45 | 46 | core/env_wrapper.py: Wrapper around the Mujoco env 47 | 48 | core/models.py: Actor/Critic model 49 | 50 | core/neuroevolution.py: Implements Neuroevolution 51 | 52 | core/off_policy_algo.py: Implements the off_policy_gradient learner TD3 53 | 54 | core/mod_utils.py: Helper functions 55 | 56 | ## Reproduce Results 57 | 58 | python main.py -env HalfCheetah-v2 -portfolio {10,14} -total_steps 2 -seed {2018,2022} 59 | 60 | python main.py -env Hopper-v2 -portfolio {10,14} -total_steps 1.5 -seed {2018,2022} 61 | 62 | python main.py -env Humanoid-v2 -portfolio {10,14} -total_steps 1 -seed {2018,2022} 63 | 64 | python main.py -env Walker2d-v2 -portfolio {10,14} -total_steps 2 -seed {2018,2022} 65 | 66 | python main.py -env Swimmer-v2 -portfolio {10,14} -total_steps 2 -seed {2018,2022} 67 | 68 | python main.py -env Hopper-v2 -portfolio {100,102} -total_steps 5 -seed {2018,2022} 69 | 70 | where {} represents an inclusive discrete range: {10, 14} --> {10, 11, 12, 13, 14} 71 | 72 | 73 | ## Note 74 | All roll-outs (evaluation of actors in the evolutionary population and the explorative roll-outs 75 | conducted by the learners) run in parallel. They are farmed out to different CPU cores, 76 | and write asynchronously to the collective replay buffer. Thus, slight variations in results 77 | are observed even with the same seed. 78 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ****************************************************************************** 16 | -------------------------------------------------------------------------------- /core/buffer.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ****************************************************************************** 16 | 17 | import numpy as np 18 | import random 19 | import torch 20 | from torch.multiprocessing import Manager 21 | 22 | 23 | class Buffer(): 24 | """Cyclic Buffer stores experience tuples from the rollouts 25 | Parameters: 26 | capacity (int): Maximum number of experiences to hold in cyclic buffer 27 | """ 28 | 29 | def __init__(self, capacity, buffer_gpu): 30 | self.capacity = capacity; self.buffer_gpu = buffer_gpu; self.counter = 0 31 | self.manager = Manager() 32 | self.tuples = self.manager.list() #Temporary shared buffer to get experiences from processes 33 | self.s = []; self.ns = []; self.a = []; self.r = []; self.done = [] 34 | 35 | # Temporary tensors that can be loaded onto the GPU for fast sampling during gradient updates (updated each gen) --> Faster sampling - no need to cycle experiences in and out of gpu 1000 times 36 | self.sT = None; self.nsT = None; self.aT = None; self.rT = None; self.doneT = None 37 | 38 | 39 | def referesh(self): 40 | """Housekeeping: move experiences from the shared list into the buffer and trim it to capacity 41 | Parameters: 42 | None 43 | Returns: 44 | None 45 | """ 46 | 47 | # Add ALL EXPERIENCE COLLECTED TO MEMORY concurrently 48 | for _ in range(len(self.tuples)): 49 | exp = self.tuples.pop() 50 | self.s.append(exp[0]) 51 | self.ns.append(exp[1]) 52 | self.a.append(exp[2]) 53 | self.r.append(exp[3]) 54 | self.done.append(exp[4]) 55 | 56 | 57 | #Trim to make the buffer size < capacity 58 | while self.__len__() > self.capacity: 59 | self.s.pop(0); self.ns.pop(0); self.a.pop(0); self.r.pop(0); self.done.pop(0) 60 | 61 | 62 | def __len__(self): 63 | return len(self.s) 64 | 65 | def sample(self, batch_size): 66 | """Sample a batch of experiences from memory with uniform probability 67 | Parameters: 68 | batch_size (int): Size of the batch to sample 69 | Returns: 70 | Experience (tuple): A tuple of (state, next_state, action, shaped_reward, done) each as a tensor with shape (batch_size, :) 71 | """ 72 | ind = random.sample(range(len(self.s)), batch_size) 73 | 74 | return self.sT[ind], self.nsT[ind], self.aT[ind], self.rT[ind],
self.doneT[ind] 75 | #return np.vstack([self.s[i] for i in ind]), np.vstack([self.ns[i] for i in ind]), np.vstack([self.a[i] for i in ind]), np.vstack([self.r[i] for i in ind]), np.vstack([self.done[i] for i in ind]) 76 | 77 | 78 | def tensorify(self): 79 | """Method to convert the stored experiences into tensors (and move them to the GPU if buffer_gpu is set) 80 | Parameters: 81 | None 82 | Returns: 83 | None 84 | """ 85 | self.referesh() #Refresh first 86 | 87 | self.sT = torch.tensor(np.vstack(self.s)) 88 | self.nsT = torch.tensor(np.vstack(self.ns)) 89 | self.aT = torch.tensor(np.vstack(self.a)) 90 | self.rT = torch.tensor(np.vstack(self.r)) 91 | self.doneT = torch.tensor(np.vstack(self.done)) 92 | if self.buffer_gpu: 93 | self.sT = self.sT.cuda() 94 | self.nsT = self.nsT.cuda() 95 | self.aT = self.aT.cuda() 96 | self.rT = self.rT.cuda() 97 | self.doneT = self.doneT.cuda() 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /core/env_wrapper.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ****************************************************************************** 16 | 17 | import gym 18 | 19 | 20 | class EnvironmentWrapper: 21 | """Wrapper around the Environment to expose a cleaner interface for RL 22 | 23 | Parameters: 24 | env_name (str): Env name 25 | 26 | 27 | """ 28 | def __init__(self, env_name, ALGO): 29 | """ 30 | A base template for all environment wrappers. 31 | """ 32 | self.env = gym.make(env_name) 33 | self.action_low = float(self.env.action_space.low[0]) 34 | self.action_high = float(self.env.action_space.high[0]) 35 | self.ALGO = ALGO 36 | 37 | 38 | 39 | 40 | def reset(self): 41 | """Method overloads reset 42 | Parameters: 43 | None 44 | 45 | Returns: 46 | next_obs (list): Next state 47 | """ 48 | return self.env.reset() 49 | 50 | 51 | def step(self, action: object): #Expects a numpy action 52 | """Take an action to forward the simulation 53 | 54 | Parameters: 55 | action (ndarray): action to take in the env 56 | 57 | Returns: 58 | next_obs (list): Next state 59 | reward (float): Reward for this step 60 | done (bool): Simulation done?
61 | info (None): Template from OpenAI gym (doesn't have anything) 62 | """ 63 | 64 | action = self.action_low + action * (self.action_high - self.action_low) 65 | return self.env.step(action) 66 | 67 | def render(self): 68 | self.env.render() 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /core/genealogy.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ****************************************************************************** 16 | 17 | from copy import deepcopy 18 | 19 | 20 | class Info(): 21 | def __init__(self, origin): 22 | self.origin = origin 23 | self.history = [origin] 24 | self.crossover = [] 25 | self.num_mut = 0.0 26 | 27 | def reset(self): 28 | self.history = [] 29 | self.crossover = [] 30 | self.num_mut = 0.0 31 | 32 | 33 | 34 | 35 | 36 | class Genealogy(): 37 | def __init__(self): 38 | self.wwid_counter = 0 39 | self.tree = {} 40 | 41 | def new_id(self, origin): 42 | wwid = self.wwid_counter + 1 43 | self.wwid_counter += 1 44 | self.tree[wwid] = Info(origin) 45 | return wwid 46 | 47 | 48 | def mutation(self, wwid, gen): 49 | self.tree[wwid].history.append('mut_'+str(gen)) 50 | 51 | def elite(self, wwid, gen): 52 | self.tree[wwid].history.append('elite_' + str(gen)) 53 | 54 | ######### INHERITANCE OPS ########### 55 | def crossover(self, parent1, parent2, gen): 56 | origin = 'crossover_' + str(gen) 57 | wwid = self.wwid_counter + 1 58 | self.wwid_counter += 1 59 | self.tree[wwid] = Info(origin) 60 | return wwid 61 | 62 | def asexual(self, parent): 63 | wwid = self.wwid_counter + 1 64 | self.wwid_counter += 1 65 | self.tree[wwid] = deepcopy(self.tree[parent]) 66 | return wwid 67 | 68 | -------------------------------------------------------------------------------- /core/learner.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | # ****************************************************************************** 16 | 17 | from core.off_policy_algo import Off_Policy_Algo 18 | 19 | 20 | 21 | 22 | class Learner: 23 | """Learner object encapsulating a local learner 24 | 25 | Parameters: 26 | algo_name (str): Algorithm Identifier 27 | state_dim (int): State size 28 | action_dim (int): Action size 29 | actor_lr (float): Actor learning rate 30 | critic_lr (float): Critic learning rate 31 | gamma (float): Discount rate 32 | tau (float): Target network sync rate 33 | init_w (bool): Use kaiming initialization for the weights? 34 | **td3args (**kwargs): arguments for TD3 algo 35 | 36 | 37 | """ 38 | 39 | def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w = True, **td3args): 40 | self.td3args = td3args; self.id = wwid 41 | self.algo = Off_Policy_Algo(wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w) 42 | 43 | 44 | #LEARNER STATISTICS 45 | self.fitnesses = [] 46 | self.ep_lens = [] 47 | self.value = None 48 | self.visit_count = 0 49 | 50 | 51 | def update_parameters(self, replay_buffer, buffer_gpu, batch_size, iterations): 52 | for _ in range(iterations): 53 | s, ns, a, r, done = replay_buffer.sample(batch_size) 54 | if not buffer_gpu: 55 | s = s.cuda(); ns = ns.cuda(); a = a.cuda(); r = r.cuda(); done = done.cuda() 56 | self.algo.update_parameters(s, ns, a, r, done, 1, **self.td3args) 57 | 58 | 59 | def update_stats(self, fitness, ep_len, gamma=0.2): 60 | self.visit_count += 1 61 | self.fitnesses.append(fitness) 62 | self.ep_lens.append(ep_len) 63 | 64 | if self.value == None: self.value = fitness 65 | else: self.value = gamma * fitness + (1-gamma) * self.value 66 | -------------------------------------------------------------------------------- /core/mod_utils.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | # ****************************************************************************** 16 | 17 | from torch import nn 18 | from torch.autograd import Variable 19 | import random, pickle, copy, argparse 20 | import numpy as np, torch, os 21 | 22 | class Tracker(): #Tracker 23 | """Tracker class to log progress and save metrics periodically 24 | 25 | Parameters: 26 | save_folder (str): Folder name for saving progress 27 | vars_string (list): List of metric names to log 28 | project_string (str): String decorator for metric filenames 29 | 30 | Returns: 31 | None 32 | """ 33 | 34 | def __init__(self, save_folder, vars_string, project_string): 35 | self.vars_string = vars_string; self.project_string = project_string 36 | self.foldername = save_folder 37 | self.all_tracker = [[[],0.0,[]] for _ in vars_string] #[Id of var tracked][fitnesses, avg_fitness, csv_fitnesses] 38 | self.counter = 0 39 | self.conv_size = 1 40 | if not os.path.exists(self.foldername): 41 | os.makedirs(self.foldername) 42 | 43 | 44 | def update(self, updates, generation): 45 | """Add a metric observed 46 | 47 | Parameters: 48 | updates (list): List of new scores for each tracked metric 49 | generation (int): Current gen 50 | 51 | Returns: 52 | None 53 | """ 54 | 55 | self.counter += 1 56 | for update, var in zip(updates, self.all_tracker): 57 | if update == None: continue 58 | var[0].append(update) 59 | 60 | #Constrain size of convolution 61 | for var in self.all_tracker: 62 | if len(var[0]) > self.conv_size: var[0].pop(0) 63 | 64 | #Update new average 65 | for var in self.all_tracker: 66 | if len(var[0]) == 0: continue 67 | var[1] = sum(var[0])/float(len(var[0])) 68 | 69 | if self.counter % 1 == 0: # Save to csv file 70 | for i, var in enumerate(self.all_tracker): 71 | if len(var[0]) == 0: continue 72 | var[2].append(np.array([generation, var[1]])) 73 | filename = self.foldername + self.vars_string[i] + self.project_string 74 | np.savetxt(filename, np.array(var[2]), fmt='%.3f', delimiter=',') 75 | 76 | 77 | def str2bool(v): 78 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 79 | return True 80 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 81 | return False 82 | else: 83 | raise argparse.ArgumentTypeError('Boolean value expected.') 84 | 85 | 86 | def hard_update(target, source): 87 | """Hard update (clone) parameters from the source network to the target 88 | 89 | Parameters: 90 | target (object): A pytorch model 91 | source (object): A pytorch model 92 | 93 | Returns: 94 | None 95 | """ 96 | 97 | for target_param, param in zip(target.parameters(), source.parameters()): 98 | target_param.data.copy_(param.data) 99 | 100 | #Signature transfer if applicable 101 | try: 102 | target.wwid[0] = source.wwid[0] 103 | except: 104 | None 105 | 106 | 107 | def soft_update(target, source, tau): 108 | """Soft update (Polyak averaging) of parameters from the source network to the target 109 | 110 | Parameters: 111 | target (object): A pytorch model 112 | source (object): A pytorch model 113 | tau (float): Tau parameter 114 | 115 | Returns: 116 | None 117 | 118 | """ 119 | 120 | for target_param, param in zip(target.parameters(), source.parameters()): 121 | target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau) 122 | 123 | 124 | def to_numpy(var): 125 | """Tensor --> numpy 126 | 127 | Parameters: 128 | var (tensor): tensor 129 | 130 | Returns: 131 | var (ndarray): ndarray 132 | """ 133 | return var.data.numpy() 134 | 135 | def to_tensor(ndarray, volatile=False, requires_grad=False): 136 | """numpy --> Variable 137 | 138 | Parameters: 139 | ndarray (ndarray): ndarray 140
| volatile (bool): create a volatile tensor? 141 | requires_grad (bool): tensor requires gradients? 142 | 143 | Returns: 144 | var (variable): variable 145 | """ 146 | 147 | if isinstance(ndarray, list): ndarray = np.array(ndarray) 148 | return Variable(torch.from_numpy(ndarray).float(), volatile=volatile, requires_grad=requires_grad) 149 | 150 | def pickle_obj(filename, object): 151 | """Pickle object 152 | 153 | Parameters: 154 | filename (str): file to dump the pickled object to 155 | object (object): object to pickle 156 | 157 | Returns: 158 | None 159 | """ 160 | 161 | handle = open(filename, "wb") 162 | pickle.dump(object, handle) 163 | 164 | def unpickle_obj(filename): 165 | """Unpickle object from disk 166 | 167 | Parameters: 168 | filename (str): file from which to load and unpickle object 169 | 170 | Returns: 171 | obj (object): unpickled object 172 | """ 173 | with open(filename, 'rb') as f: 174 | return pickle.load(f) 175 | 176 | def init_weights(m): 177 | """Initialize weights using kaiming uniform initialization in place 178 | 179 | Parameters: 180 | m (nn.module): Linear module from torch.nn 181 | 182 | Returns: 183 | None 184 | """ 185 | if type(m) == nn.Linear: 186 | nn.init.kaiming_uniform_(m.weight) 187 | m.bias.data.fill_(0.01) 188 | 189 | 190 | def list_mean(l): 191 | """compute the average of a list 192 | 193 | Parameters: 194 | l (list): list 195 | 196 | Returns: 197 | mean (float): mean 198 | """ 199 | if len(l) == 0: return None 200 | else: return sum(l)/len(l) 201 | 202 | def pprint(l): 203 | """Pretty print 204 | 205 | Parameters: 206 | l (list/float/None): object to print 207 | 208 | Returns: 209 | pretty print str 210 | """ 211 | 212 | if isinstance(l, list): 213 | if len(l) == 0: return None 214 | else: 215 | if l == None: return None 216 | else: return '%.2f'%l 217 | 218 | 219 | 220 | 221 | def flatten(d): 222 | """Recursive method to flatten a dict -->list 223 | 224 | Parameters: 225 | d (dict): dict 226 | 227 | Returns: 228 | l (list) 229 | """ 230 | 231 | res = [] # Result list 232 | if isinstance(d, dict): 233 | for key, val in sorted(d.items()): 234 | res.extend(flatten(val)) 235 | elif isinstance(d, list): 236 | res = d 237 | else: 238 | res = [d] 239 | return res 240 | 241 | def reverse_flatten(d, l): 242 | """Recursive method to unflatten a list -->dict [Reverse of flatten] in place 243 | 244 | Parameters: 245 | d (dict): dict 246 | l (list): l 247 | 248 | Returns: 249 | None 250 | """ 251 | 252 | if isinstance(d, dict): 253 | for key, _ in sorted(d.items()): 254 | 255 | #Float is immutable, so 256 | if isinstance(d[key], float): 257 | d[key] = l[0] 258 | l[:] = l[1:] 259 | continue 260 | 261 | reverse_flatten(d[key], l) 262 | elif isinstance(d, list): 263 | d[:] = l[0:len(d)] 264 | l[:] = l[len(d):] 265 | 266 | 267 | def load_all_models_dir(dir, model_template): 268 | """Load all models from a given directory onto a template 269 | 270 | Parameters: 271 | dir (str): directory 272 | model_template (object): Class template to load the objects onto 273 | 274 | Returns: 275 | models (list): list of loaded objects 276 | """ 277 | 278 | list_files = os.listdir(dir) 279 | print(list_files) 280 | models = [] 281 | for i, fname in enumerate(list_files): 282 | try: 283 | model_template.load_state_dict(torch.load(dir + fname)) 284 | model_template.eval() 285 | models.append(copy.deepcopy(model_template)) 286 | except: 287 | print(fname, 'failed to load') 288 | return models 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 |
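A minimal usage sketch of the two update helpers above (not part of the repository; the `online` and `target` names are illustrative): hard_update clones the online weights into a target copy once at construction, and soft_update then Polyak-averages the online weights into the target after every gradient step, mirroring how core/off_policy_algo.py maintains its actor and critic targets.

import torch.nn as nn
from core.mod_utils import hard_update, soft_update

online = nn.Linear(4, 2)    # stand-in for the online (learned) network
target = nn.Linear(4, 2)    # its target copy
hard_update(target, online)            # clone the online weights into the target once at construction
# ... then after each gradient step on `online`:
soft_update(target, online, tau=5e-3)  # target = (1 - tau) * target + tau * online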
-------------------------------------------------------------------------------- /core/models.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ****************************************************************************** 16 | 17 | import torch 18 | import torch.nn as nn 19 | import torch.nn.functional as F 20 | from torch.distributions import Normal 21 | 22 | 23 | class Actor(nn.Module): 24 | """Actor model 25 | 26 | Parameters: 27 | state_dim (int): State size; action_dim (int): Action size; wwid (int): Genealogy id tag for this actor 28 | """ 29 | 30 | def __init__(self, state_dim, action_dim, wwid): 31 | super(Actor, self).__init__() 32 | 33 | self.wwid = torch.Tensor([wwid]) 34 | l1 = 400; l2 = 300 35 | 36 | # Construct Hidden Layer 1 37 | self.f1 = nn.Linear(state_dim, l1) 38 | self.ln1 = nn.LayerNorm(l1) 39 | 40 | #Hidden Layer 2 41 | self.f2 = nn.Linear(l1, l2) 42 | self.ln2 = nn.LayerNorm(l2) 43 | 44 | #Out 45 | self.w_out = nn.Linear(l2, action_dim) 46 | 47 | def forward(self, input): 48 | """Method to forward propagate through the actor's graph 49 | 50 | Parameters: 51 | input (tensor): states 52 | 53 | Returns: 54 | action (tensor): actions 55 | 56 | 57 | """ 58 | #Hidden Layer 1 59 | out = F.elu(self.f1(input)) 60 | out = self.ln1(out) 61 | 62 | #Hidden Layer 2 63 | out = F.elu(self.f2(out)) 64 | out = self.ln2(out) 65 | 66 | #Out 67 | return torch.sigmoid(self.w_out(out)) 68 | 69 | 70 | class Critic(nn.Module): 71 | 72 | """Critic model 73 | 74 | Parameters: 75 | state_dim (int): State size; action_dim (int): Action size 76 | 77 | """ 78 | 79 | def __init__(self, state_dim, action_dim): 80 | super(Critic, self).__init__() 81 | l1 = 400; l2 = 300 82 | 83 | ######################## Q1 Head ################## 84 | # Construct Hidden Layer 1 with state 85 | self.q1f1 = nn.Linear(state_dim + action_dim, l1) 86 | self.q1ln1 = nn.LayerNorm(l1) 87 | 88 | #Hidden Layer 2 89 | self.q1f2 = nn.Linear(l1, l2) 90 | self.q1ln2 = nn.LayerNorm(l2) 91 | 92 | #Out 93 | self.q1out = nn.Linear(l2, 1) 94 | 95 | 96 | ######################## Q2 Head ################## 97 | # Construct Hidden Layer 1 with state 98 | self.q2f1 = nn.Linear(state_dim + action_dim, l1) 99 | self.q2ln1 = nn.LayerNorm(l1) 100 | 101 | #Hidden Layer 2 102 | self.q2f2 = nn.Linear(l1, l2) 103 | self.q2ln2 = nn.LayerNorm(l2) 104 | 105 | #Out 106 | self.q2out = nn.Linear(l2, 1) 107 | 108 | ######################## Value Head ################## [NOT USED IN CERL] 109 | # Construct Hidden Layer 1 with state 110 | self.vf1 = nn.Linear(state_dim, l1) 111 | self.vln1 = nn.LayerNorm(l1) 112 | 113 | # Hidden Layer 2 114 | self.vf2 = nn.Linear(l1, l2) 115 | self.vln2 = nn.LayerNorm(l2) 116 | 117 | # Out 118 | self.vout = nn.Linear(l2, 1) 119 | 120 | 121 | 122 | 123 | 124 | def forward(self, obs, action): 125 | """Method to forward propagate through the critic's graph 126 | 127 | Parameters:
128 | obs (tensor): states 129 | action (tensor): actions 130 | 131 | Returns: 132 | Q1 (tensor): Qval 1 133 | Q2 (tensor): Qval 2 134 | V (tensor): Value 135 | 136 | 137 | 138 | """ 139 | 140 | #Concatenate observation+action as critic state 141 | state = torch.cat([obs, action], 1) 142 | 143 | ###### Q1 HEAD #### 144 | q1 = F.elu(self.q1f1(state)) 145 | q1 = self.q1ln1(q1) 146 | q1 = F.elu(self.q1f2(q1)) 147 | q1 = self.q1ln2(q1) 148 | q1 = self.q1out(q1) 149 | 150 | ###### Q2 HEAD #### 151 | q2 = F.elu(self.q2f1(state)) 152 | q2 = self.q2ln1(q2) 153 | q2 = F.elu(self.q2f2(q2)) 154 | q2 = self.q2ln2(q2) 155 | q2 = self.q2out(q2) 156 | 157 | ###### Value HEAD #### 158 | v = F.elu(self.vf1(obs)) 159 | v = self.vln1(v) 160 | v = F.elu(self.vf2(v)) 161 | v = self.vln2(v) 162 | v = self.vout(v) 163 | 164 | 165 | return q1, q2, v 166 | 167 | 168 | 169 | # Initialize weights 170 | def weights_init(m): 171 | classname = m.__class__.__name__ 172 | if classname.find('Linear') != -1: 173 | torch.nn.init.xavier_uniform_(m.weight) 174 | 175 | -------------------------------------------------------------------------------- /core/neuroevolution.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License.
15 | # ****************************************************************************** 16 | 17 | import random 18 | import numpy as np 19 | import math 20 | import core.mod_utils as utils 21 | 22 | 23 | 24 | class SSNE: 25 | """Neuroevolution object that contains all the methods to run Sub-structure based Neuroevolution (SSNE) 26 | 27 | Parameters: 28 | args (object): parameter class 29 | 30 | 31 | """ 32 | 33 | def __init__(self, args): 34 | self.gen = 0 35 | self.args = args; 36 | self.population_size = self.args.pop_size; 37 | #RL TRACKERS 38 | self.rl_sync_pool = []; self.all_offs = []; self.rl_res = {"elites":0.0, 'selects': 0.0, 'discarded':0.0}; self.num_rl_syncs = 0.0001 39 | self.lineage = [0.0 for _ in range(self.population_size)]; self.lineage_depth = 10 40 | 41 | def selection_tournament(self, index_rank, num_offsprings, tournament_size): 42 | """Conduct tournament selection 43 | 44 | Parameters: 45 | index_rank (list): Ranking encoded as net_indexes 46 | num_offsprings (int): Number of offsprings to generate 47 | tournament_size (int): Size of tournament 48 | 49 | Returns: 50 | offsprings (list): List of offsprings returned as a list of net indices 51 | 52 | """ 53 | 54 | 55 | total_choices = len(index_rank) 56 | offsprings = [] 57 | for i in range(num_offsprings): 58 | winner = np.min(np.random.randint(total_choices, size=tournament_size)) 59 | offsprings.append(index_rank[winner]) 60 | 61 | offsprings = list(set(offsprings)) # Find unique offsprings 62 | if len(offsprings) % 2 != 0: # Number of offsprings should be even 63 | offsprings.append(index_rank[winner]) 64 | return offsprings 65 | 66 | def list_argsort(self, seq): 67 | """Sort the list 68 | 69 | Parameters: 70 | seq (list): list 71 | 72 | Returns: 73 | sorted list 74 | 75 | """ 76 | return sorted(range(len(seq)), key=seq.__getitem__) 77 | 78 | def regularize_weight(self, weight, mag): 79 | """Clamps the weight magnitude (regularizer) 80 | 81 | Parameters: 82 | weight (float): weight 83 | mag (float): max/min value for weight 84 | 85 | Returns: 86 | weight (float): clamped weight 87 | 88 | """ 89 | if weight > mag: weight = mag 90 | if weight < -mag: weight = -mag 91 | return weight 92 | 93 | def crossover_inplace(self, gene1, gene2): 94 | """Conduct one point crossover in place 95 | 96 | Parameters: 97 | gene1 (object): A pytorch model 98 | gene2 (object): A pytorch model 99 | 100 | Returns: 101 | None 102 | 103 | """ 104 | 105 | 106 | keys1 = list(gene1.state_dict()) 107 | keys2 = list(gene2.state_dict()) 108 | 109 | for key in keys1: 110 | if key not in keys2: continue 111 | 112 | # References to the variable tensors 113 | W1 = gene1.state_dict()[key] 114 | W2 = gene2.state_dict()[key] 115 | 116 | if len(W1.shape) == 2: #Weights no bias 117 | num_variables = W1.shape[0] 118 | # Crossover operation [Indexed by row] 119 | try: num_cross_overs = random.randint(0, int(num_variables * 0.3)) # Number of Cross overs 120 | except: num_cross_overs = 1 121 | for i in range(num_cross_overs): 122 | receiver_choice = random.random() # Choose which gene to receive the perturbation 123 | if receiver_choice < 0.5: 124 | ind_cr = random.randint(0, W1.shape[0]-1) # 125 | W1[ind_cr, :] = W2[ind_cr, :] 126 | else: 127 | ind_cr = random.randint(0, W1.shape[0]-1) # 128 | W2[ind_cr, :] = W1[ind_cr, :] 129 | 130 | elif len(W1.shape) == 1: #Bias or LayerNorm 131 | if random.random() <0.8: continue #Crossover here with low frequency 132 | num_variables = W1.shape[0] 133 | # Crossover operation [Indexed by row] 134 | #num_cross_overs =
random.randint(0, int(num_variables * 0.05)) # Crossover number 135 | for i in range(1): 136 | receiver_choice = random.random() # Choose which gene to receive the perturbation 137 | if receiver_choice < 0.5: 138 | ind_cr = random.randint(0, W1.shape[0]-1) # 139 | W1[ind_cr] = W2[ind_cr] 140 | else: 141 | ind_cr = random.randint(0, W1.shape[0]-1) # 142 | W2[ind_cr] = W1[ind_cr] 143 | 144 | def mutate_inplace(self, gene): 145 | """Conduct mutation in place 146 | 147 | Parameters: 148 | gene (object): A pytorch model 149 | 150 | Returns: 151 | None 152 | 153 | """ 154 | mut_strength = 0.02 155 | num_mutation_frac = 0.03 156 | super_mut_strength = 1.0 157 | super_mut_prob = 0.1 158 | reset_prob = super_mut_prob + 0.1 159 | 160 | num_params = len(list(gene.parameters())) 161 | ssne_probabilities = np.random.uniform(0, 1, num_params) * 2 162 | 163 | for i, param in enumerate(gene.parameters()): # Mutate each param 164 | 165 | # References to the variable keys 166 | W = param.data 167 | if len(W.shape) == 2: # Weights, no bias 168 | 169 | num_weights = W.shape[0] * W.shape[1] 170 | ssne_prob = ssne_probabilities[i] 171 | 172 | if random.random() < ssne_prob: 173 | num_mutations = random.randint(0, 174 | int(math.ceil(num_mutation_frac * num_weights))) # Number of mutation instances 175 | for _ in range(num_mutations): 176 | ind_dim1 = random.randint(0, W.shape[0]-1) 177 | ind_dim2 = random.randint(0, W.shape[-1]-1) 178 | random_num = random.random() 179 | 180 | if random_num < super_mut_prob: # Super Mutation probability 181 | W[ind_dim1, ind_dim2] += random.gauss(0, super_mut_strength * W[ind_dim1, ind_dim2]) 182 | elif random_num < reset_prob: # Reset probability 183 | W[ind_dim1, ind_dim2] = random.gauss(0, 0.1) 184 | else: # normal mutation 185 | W[ind_dim1, ind_dim2] += random.gauss(0, mut_strength * W[ind_dim1, ind_dim2]) 186 | 187 | # Regularization hard limit 188 | W[ind_dim1, ind_dim2] = self.regularize_weight(W[ind_dim1, ind_dim2], 189 | self.args.weight_magnitude_limit) 190 | 191 | elif len(W.shape) == 1: # Bias or layernorm 192 | num_weights = W.shape[0] 193 | ssne_prob = ssne_probabilities[i]*0.04 #Low probability of mutation here 194 | 195 | if random.random() < ssne_prob: 196 | num_mutations = random.randint(0, 197 | int(math.ceil(num_mutation_frac * num_weights))) # Number of mutation instances 198 | for _ in range(num_mutations): 199 | ind_dim = random.randint(0, W.shape[0]-1) 200 | random_num = random.random() 201 | 202 | if random_num < super_mut_prob: # Super Mutation probability 203 | W[ind_dim] += random.gauss(0, super_mut_strength * W[ind_dim]) 204 | elif random_num < reset_prob: # Reset probability 205 | W[ind_dim] = random.gauss(0, 1) 206 | else: # normal mutation 207 | W[ind_dim] += random.gauss(0, mut_strength * W[ind_dim]) 208 | 209 | # Regularization hard limit 210 | W[ind_dim] = self.regularize_weight(W[ind_dim], self.args.weight_magnitude_limit) 211 | 212 | 213 | 214 | def reset_genome(self, gene): 215 | """Reset a model's weights in place 216 | 217 | Parameters: 218 | gene (object): A pytorch model 219 | 220 | Returns: 221 | None 222 | 223 | """ 224 | for param in (gene.parameters()): 225 | param.data.copy_(param.data) 226 | 227 | def epoch(self, gen, genealogy, pop, net_inds, fitness_evals, migration): 228 | """Method to implement a round of selection and mutation operation 229 | 230 | Parameters: 231 | pop (shared_list): Population of models 232 | net_inds (list): Indices of individuals evaluated this generation 233 | fitness_evals (list): Fitness
values for evaluated individuals 234 | migration (list): Policies from learners to be synced into the population 235 | 236 | Returns: 237 | None 238 | 239 | """ 240 | 241 | self.gen += 1; num_elitists = int(self.args.elite_fraction * len(fitness_evals)) 242 | if num_elitists < 2: num_elitists = 2 243 | 244 | 245 | # Entire epoch is handled with indices; Index rank nets by fitness evaluation (0 is the best after reversing) 246 | index_rank = self.list_argsort(fitness_evals); index_rank.reverse() 247 | elitist_index = index_rank[:num_elitists] # Elitist indexes safeguard 248 | 249 | # Selection step 250 | offsprings = self.selection_tournament(index_rank, num_offsprings=len(index_rank) - len(elitist_index) - len(migration), tournament_size=3) 251 | 252 | #Transcribe ranked indexes from now on to refer to net indexes 253 | elitist_index = [net_inds[i] for i in elitist_index] 254 | offsprings = [net_inds[i] for i in offsprings] 255 | 256 | #Figure out unselected candidates 257 | unselects = []; new_elitists = [] 258 | for net_i in net_inds: 259 | if net_i in offsprings or net_i in elitist_index: 260 | continue 261 | else: 262 | unselects.append(net_i) 263 | random.shuffle(unselects) 264 | 265 | #Inheritance step (sync learners to population) 266 | for policy in migration: 267 | replacee = unselects.pop(0) 268 | utils.hard_update(target=pop[replacee], source=policy) 269 | wwid = genealogy.asexual(int(policy.wwid.item())) 270 | pop[replacee].wwid[0] = wwid 271 | 272 | # Elitism step, assigning elite candidates to some unselects 273 | for i in elitist_index: 274 | try: replacee = unselects.pop(0) 275 | except: replacee = offsprings.pop(0) 276 | new_elitists.append(replacee) 277 | utils.hard_update(target=pop[replacee], source=pop[i]) 278 | wwid = genealogy.asexual(int(pop[i].wwid.item())) 279 | pop[replacee].wwid[0] = wwid 280 | genealogy.elite(wwid, gen) 281 | 282 | #self.lineage[replacee] = self.lineage[i] 283 | 284 | # Crossover for unselected genes with 100 percent probability 285 | if len(unselects) % 2 != 0: # Number of unselects left should be even 286 | unselects.append(unselects[random.randint(0, len(unselects)-1)]) 287 | for i, j in zip(unselects[0::2], unselects[1::2]): 288 | off_i = random.choice(new_elitists); 289 | off_j = random.choice(offsprings) 290 | utils.hard_update(target=pop[i], source=pop[off_i]) 291 | utils.hard_update(target=pop[j], source=pop[off_j]) 292 | self.crossover_inplace(pop[i], pop[j]) 293 | wwid1 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen) 294 | wwid2 = genealogy.crossover(int(pop[off_i].wwid.item()), int(pop[off_j].wwid.item()), gen) 295 | pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2 296 | 297 | #self.lineage[i] = (self.lineage[off_i]+self.lineage[off_j])/2 298 | #self.lineage[j] = (self.lineage[off_i] + self.lineage[off_j]) / 2 299 | 300 | # Crossover for selected offsprings 301 | for i, j in zip(offsprings[0::2], offsprings[1::2]): 302 | if random.random() < self.args.crossover_prob: 303 | self.crossover_inplace(pop[i], pop[j]) 304 | wwid1 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen) 305 | wwid2 = genealogy.crossover(int(pop[i].wwid.item()), int(pop[j].wwid.item()), gen) 306 | pop[i].wwid[0] = wwid1; pop[j].wwid[0] = wwid2 307 | 308 | 309 | # Mutate all genes in the population except the new elitists 310 | for net_i in net_inds: 311 | if net_i not in new_elitists: # Spare the new elitists 312 | if random.random() < self.args.mutation_prob: 313 | self.mutate_inplace(pop[net_i]) 314 |
genealogy.mutation(int(pop[net_i].wwid.item()), gen) 315 | 316 | 317 | self.all_offs[:] = offsprings[:] 318 | 319 | 320 | 321 | 322 | 323 | 324 | 325 | 326 | -------------------------------------------------------------------------------- /core/off_policy_algo.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ****************************************************************************** 16 | 17 | import torch 18 | import torch.nn as nn 19 | from torch.optim import Adam 20 | import torch.nn.functional as F 21 | import numpy as np 22 | from core import mod_utils as utils 23 | from core.models import Actor, Critic 24 | 25 | 26 | class Off_Policy_Algo(object): 27 | """Class implementing TD3 and DDPG off-policy learners 28 | 29 | Parameters: 30 | args (object): Parameter class 31 | 32 | 33 | """ 34 | def __init__(self, wwid, algo_name, state_dim, action_dim, actor_lr, critic_lr, gamma, tau, init_w = True): 35 | 36 | self.algo_name = algo_name; self.gamma = gamma; self.tau = tau 37 | 38 | #Initialize actors 39 | self.actor = Actor(state_dim, action_dim, wwid) 40 | if init_w: self.actor.apply(utils.init_weights) 41 | self.actor_target = Actor(state_dim, action_dim, wwid) 42 | utils.hard_update(self.actor_target, self.actor) 43 | self.actor_optim = Adam(self.actor.parameters(), actor_lr) 44 | 45 | 46 | self.critic = Critic(state_dim, action_dim) 47 | if init_w: self.critic.apply(utils.init_weights) 48 | self.critic_target = Critic(state_dim, action_dim) 49 | utils.hard_update(self.critic_target, self.critic) 50 | self.critic_optim = Adam(self.critic.parameters(), critic_lr) 51 | 52 | self.loss = nn.MSELoss() 53 | 54 | self.actor_target.cuda(); self.critic_target.cuda(); self.actor.cuda(); self.critic.cuda() 55 | self.num_critic_updates = 0 56 | 57 | #Statistics Tracker 58 | self.action_loss = {'min':[], 'max': [], 'mean':[], 'std':[]} 59 | self.policy_loss = {'min':[], 'max': [], 'mean':[], 'std':[]} 60 | self.critic_loss = {'mean':[]} 61 | self.q = {'min':[], 'max': [], 'mean':[], 'std':[]} 62 | self.val = {'min':[], 'max': [], 'mean':[], 'std':[]} 63 | 64 | def compute_stats(self, tensor, tracker): 65 | """Computes stats from intermediate tensors 66 | 67 | Parameters: 68 | tensor (tensor): tensor 69 | tracker (object): logger 70 | 71 | Returns: 72 | None 73 | 74 | 75 | """ 76 | tracker['min'].append(torch.min(tensor).item()) 77 | tracker['max'].append(torch.max(tensor).item()) 78 | tracker['mean'].append(torch.mean(tensor).item()) 79 | tracker['std'].append(torch.std(tensor).item()) 80 | 81 | def update_parameters(self, state_batch, next_state_batch, action_batch, reward_batch, done_batch, num_epoch=1, **kwargs): 82 | """Runs a step of Bellman update and policy gradient using a batch of experiences 83 | 84 | Parameters: 85 | state_batch
(tensor): Current States 86 | next_state_batch (tensor): Next States 87 | action_batch (tensor): Actions 88 | reward_batch (tensor): Rewards 89 | done_batch (tensor): Done batch 90 | num_epoch (int): Number of learning iterations to run with the same data 91 | 92 | Returns: 93 | None 94 | 95 | """ 96 | 97 | if isinstance(state_batch, list): state_batch = torch.cat(state_batch); next_state_batch = torch.cat(next_state_batch); action_batch = torch.cat(action_batch); reward_batch = torch.cat(reward_batch); done_batch = torch.cat(done_batch) 98 | 99 | for _ in range(num_epoch): 100 | ########### CRITIC UPDATE #################### 101 | 102 | #Compute next q-val, next_v and target 103 | with torch.no_grad(): 104 | #Policy Noise 105 | policy_noise = np.random.normal(0, kwargs['policy_noise'], (action_batch.size()[0], action_batch.size()[1])) 106 | policy_noise = torch.clamp(torch.Tensor(policy_noise), -kwargs['policy_noise_clip'], kwargs['policy_noise_clip']) 107 | 108 | #Compute next action_batch 109 | next_action_batch = self.actor_target.forward(next_state_batch) + policy_noise.cuda() 110 | next_action_batch = torch.clamp(next_action_batch, 0,1) 111 | 112 | #Compute Q-val and value of next state masking by done 113 | q1, q2, _ = self.critic_target.forward(next_state_batch, next_action_batch) 114 | q1 = (1 - done_batch) * q1 115 | q2 = (1 - done_batch) * q2 116 | 117 | #Select which q to use as next-q (depends on algo) 118 | if self.algo_name == 'TD3' or self.algo_name == 'TD3_actor_min': next_q = torch.min(q1, q2) 119 | elif self.algo_name == 'DDPG': next_q = q1 120 | elif self.algo_name == 'TD3_max': next_q = torch.max(q1, q2) 121 | 122 | #Compute target q and target val 123 | target_q = reward_batch + (self.gamma * next_q) 124 | 125 | 126 | self.critic_optim.zero_grad() 127 | current_q1, current_q2, current_val = self.critic.forward((state_batch), (action_batch)) 128 | self.compute_stats(current_q1, self.q) 129 | 130 | dt = self.loss(current_q1, target_q) 131 | 132 | if self.algo_name == 'TD3' or self.algo_name == 'TD3_max': dt = dt + self.loss(current_q2, target_q) 133 | self.critic_loss['mean'].append(dt.item()) 134 | 135 | dt.backward() 136 | 137 | self.critic_optim.step() 138 | self.num_critic_updates += 1 139 | 140 | 141 | #Delayed Actor Update 142 | if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: 143 | 144 | actor_actions = self.actor.forward(state_batch) 145 | Q1, Q2, val = self.critic.forward(state_batch, actor_actions) 146 | 147 | # if self.args.use_advantage: policy_loss = -(Q1 - val) 148 | policy_loss = -Q1 149 | 150 | self.compute_stats(policy_loss,self.policy_loss) 151 | policy_loss = policy_loss.mean() 152 | 153 | 154 | self.actor_optim.zero_grad() 155 | 156 | 157 | 158 | policy_loss.backward(retain_graph=True) 159 | self.actor_optim.step() 160 | 161 | 162 | if self.num_critic_updates % kwargs['policy_ups_freq'] == 0: utils.soft_update(self.actor_target, self.actor, self.tau) 163 | utils.soft_update(self.critic_target, self.critic, self.tau) 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /core/portfolio.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ****************************************************************************** 16 | 17 | from core.learner import Learner 18 | 19 | 20 | def initialize_portfolio(portfolio, args, genealogy, portfolio_id): 21 | """Portfolio of learners 22 | 23 | Parameters: 24 | portfolio (list): Incoming list 25 | args (object): param class 26 | 27 | Returns: 28 | portfolio (list): Portfolio of learners 29 | """ 30 | 31 | 32 | if portfolio_id == 10: 33 | td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': args.action_low, 'action_high': args.action_high} 34 | 35 | # Learner 1 36 | wwid = genealogy.new_id('learner_1') 37 | portfolio.append( 38 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=0.9, tau=5e-3, 39 | init_w=True, **td3args)) 40 | 41 | # Learner 3 42 | wwid = genealogy.new_id('learner_3') 43 | portfolio.append( 44 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=0.99, tau=5e-3, 45 | init_w=True, **td3args)) 46 | 47 | # Learner 4 48 | wwid = genealogy.new_id('learner_4') 49 | portfolio.append( 50 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=0.997, tau=5e-3, 51 | init_w=True, **td3args)) 52 | 53 | # Learner 4 54 | wwid = genealogy.new_id('learner_4') 55 | portfolio.append( 56 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=0.9995, tau=5e-3, 57 | init_w=True, **td3args)) 58 | 59 | if portfolio_id == 11: 60 | td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': args.action_low, 'action_high': args.action_high} 61 | 62 | # Learner 1 63 | wwid = genealogy.new_id('learner_1') 64 | portfolio.append( 65 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=0.9, tau=5e-3, 66 | init_w=True, **td3args)) 67 | 68 | if portfolio_id == 12: 69 | td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': args.action_low, 'action_high': args.action_high} 70 | 71 | # Learner 1 72 | wwid = genealogy.new_id('learner_1') 73 | portfolio.append( 74 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=0.99, tau=5e-3, 75 | init_w=True, **td3args)) 76 | 77 | if portfolio_id == 13: 78 | td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': args.action_low, 'action_high': args.action_high} 79 | 80 | # Learner 1 81 | wwid = genealogy.new_id('learner_1') 82 | portfolio.append( 83 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=0.997, tau=5e-3, 84 | init_w=True, **td3args)) 85 | 86 | if portfolio_id == 14: 87 | td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': args.action_low, 'action_high': args.action_high} 88 | 89 | # Learner 1 90 | wwid = genealogy.new_id('learner_1') 91 | portfolio.append( 92 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, 
gamma=0.9995, tau=5e-3, 93 | init_w=True, **td3args)) 94 | 95 | 96 | 97 | ##############MOTIVATING EXAMPLE ####### 98 | if portfolio_id == 100: 99 | 100 | td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': args.action_low, 'action_high': args.action_high} 101 | 102 | 103 | 104 | # Learner 1 105 | wwid = genealogy.new_id('learner_1') 106 | portfolio.append( 107 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=0.0, tau=5e-3, 108 | init_w=True, **td3args)) 109 | 110 | # Learner 2 111 | wwid = genealogy.new_id('learner_2') 112 | portfolio.append( 113 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=1.0, tau=5e-3, init_w=True, 114 | **td3args)) 115 | 116 | if portfolio_id == 101: 117 | td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': args.action_low, 'action_high': args.action_high} 118 | 119 | 120 | # Learner 3 121 | wwid = genealogy.new_id('learner_3') 122 | portfolio.append( 123 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=0.0, tau=5e-3, 124 | init_w=True, **td3args)) 125 | 126 | if portfolio_id == 102: 127 | td3args = {'policy_noise': 0.2, 'policy_noise_clip': 0.5, 'policy_ups_freq': 2, 'action_low': args.action_low, 'action_high': args.action_high} 128 | 129 | 130 | # Learner 1 131 | wwid = genealogy.new_id('learner_1') 132 | portfolio.append( 133 | Learner(wwid, 'TD3', args.state_dim, args.action_dim, actor_lr=1e-3, critic_lr=1e-3, gamma=1.0, tau=5e-3, 134 | init_w=True, **td3args)) 135 | 136 | 137 | 138 | return portfolio 139 | -------------------------------------------------------------------------------- /core/runner.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # ****************************************************************************** 16 | 17 | from core.env_wrapper import EnvironmentWrapper 18 | from core import mod_utils as utils 19 | import numpy as np 20 | import torch 21 | 22 | 23 | # Rollout evaluate an agent in a complete game 24 | def rollout_worker(id, task_pipe, result_pipe, is_noise, data_bucket, model_bucket, env_name, noise_std, ALGO): 25 | """Rollout Worker runs a simulation in the environment to generate experiences and fitness values 26 | 27 | Parameters: 28 | task_pipe (pipe): Receiver end of the task pipe used to receive signal to start on a task 29 | result_pipe (pipe): Sender end of the pipe used to report back results 30 | is_noise (bool): Use noise? 
31 | data_bucket (list of shared object): A list of shared object reference to s,ns,a,r,done (replay buffer) managed by a manager that is used to store experience tuples 32 | model_bucket (shared list object): A shared list object managed by a manager used to store all the models (actors) 33 | env_name (str): Environment name? 34 | noise_std (float): Standard deviation of Gaussian for sampling noise 35 | 36 | Returns: 37 | None 38 | """ 39 | env = EnvironmentWrapper(env_name, ALGO) 40 | np.random.seed(id) ###make sure the random seeds across learners are different 41 | 42 | ###LOOP### 43 | while True: 44 | identifier = task_pipe.recv() # Wait until a signal is received to start rollout 45 | if identifier == 'TERMINATE': exit(0) #Kill yourself 46 | 47 | # Get the requisite network 48 | net = model_bucket[identifier] 49 | 50 | 51 | fitness = 0.0; 52 | total_frame = 0 53 | state = env.reset(); 54 | rollout_trajectory = [] 55 | state = utils.to_tensor(np.array(state)).unsqueeze(0) 56 | while True: # unless done 57 | 58 | action = net.forward(state) 59 | action = utils.to_numpy(action) 60 | if is_noise: 61 | action = (action + np.random.normal(0, noise_std, size=env.env.action_space.shape[0])).clip(env.env.action_space.low, env.env.action_space.high) 62 | 63 | next_state, reward, done, info = env.step(action.flatten()) # Simulate one step in environment 64 | 65 | 66 | next_state = utils.to_tensor(np.array(next_state)).unsqueeze(0) 67 | fitness += reward 68 | 69 | # If storing transitions 70 | if data_bucket != None: #Skip for test set 71 | rollout_trajectory.append([utils.to_numpy(state), utils.to_numpy(next_state), 72 | np.float32(action), np.reshape(np.float32(np.array([reward])), (1, 1)), 73 | np.reshape(np.float32(np.array([float(done)])), (1, 1))]) 74 | state = next_state 75 | total_frame += 1 76 | 77 | # DONE FLAG IS Received 78 | if done: 79 | 80 | # Push experiences to main 81 | for entry in rollout_trajectory: 82 | data_bucket.append(entry) 83 | 84 | 85 | break 86 | 87 | # Send back id, fitness, total length and shaped fitness using the result pipe 88 | result_pipe.send([identifier, fitness, total_frame]) 89 | -------------------------------------------------------------------------------- /core/ucb.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ****************************************************************************** 16 | 17 | import math, random 18 | 19 | 20 | def ucb(allocation_size, portfolio, c): 21 | """Upper Confidence Bound implementation to pick learners 22 | 23 | Parameters: 24 | allocation_size (int): Size of allocation (num of resources) 25 | portfolio (list): List of learners 26 | c (float): Exploration coefficient in UCB 27 | 28 | Returns: 29 | allocation (list): List of learner ids formulating the resource allocation 30 | """ 31 | 32 | 33 | values = [learner.value for learner in portfolio] 34 | #Normalize values 35 | values = [val - min(values) for val in values] 36 | values = [val/(sum(values)+0.1) for val in values] 37 | 38 | visit_counts = [learner.visit_count for learner in portfolio] 39 | total_visit = sum(visit_counts) 40 | 41 | ######## Implement UCB ######## 42 | ucb_scores = [(values[i]) + c * math.sqrt( math.log(total_visit)/visit_counts[i]) for i in range(len(portfolio))] 43 | 44 | 45 | ########## Use UCB scores to perform probabilistic resource allocation (different from making one choice) ########## 46 | allocation = roulette_wheel(ucb_scores, allocation_size) 47 | 48 | 49 | 50 | 51 | return allocation 52 | 53 | 54 | 55 | def roulette_wheel(probs, num_samples): 56 | """Roulette_wheel selection from a prob. distribution 57 | 58 | Parameters: 59 | probs (list): Probability distribution 60 | num_samples (int): Num of iterations to sample from distribution 61 | 62 | Returns: 63 | out (list): List of samples based on incoming distribution 64 | """ 65 | 66 | #Normalize 67 | probs = [prob - min(probs) + abs(min(probs)) for prob in probs] #Biased translation (to positive axis) to ensure the lowest does not end up with a probability of zero 68 | 69 | ####### HACK FOR ROLLOUT_SIZE = 1 ##### 70 | if sum(probs) != 0: 71 | probs = [prob / sum(probs) for prob in probs] 72 | else: 73 | probs = [1.0 for _ in probs] 74 | ####### END HACK ##### 75 | 76 | 77 | #Selection 78 | out = [] 79 | for _ in range(num_samples): 80 | rand = random.random() 81 | 82 | for i in range(len(probs)): 83 | if rand < sum(probs[0:i+1]): 84 | out.append(i) 85 | break 86 | 87 | print('UCB_prob_mass', ["%.2f" %i for i in probs]) 88 | print('Allocation', out) 89 | print() 90 | 91 | return out 92 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # ****************************************************************************** 2 | # Copyright 2019 Intel Corporation 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | # ****************************************************************************** 16 | 17 | import numpy as np, os, time, random, torch, sys 18 | from core.neuroevolution import SSNE 19 | from core.models import Actor 20 | from core import mod_utils as utils 21 | from core.mod_utils import str2bool 22 | from core.ucb import ucb 23 | from core.runner import rollout_worker 24 | from core.portfolio import initialize_portfolio 25 | from torch.multiprocessing import Process, Pipe, Manager 26 | import threading 27 | from core.buffer import Buffer 28 | from core.genealogy import Genealogy 29 | import gym 30 | import argparse 31 | 32 | 33 | 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('-pop_size', type=int, help='#Policies in the population', default=10) 36 | parser.add_argument('-seed', type=int, help='Seed', default=2018) 37 | parser.add_argument('-rollout_size', type=int, help='#Rollout workers for the learners', default=10) 38 | parser.add_argument('-env', type=str, help='#Environment name', default='Humanoid-v2') 39 | parser.add_argument('-gradperstep', type=float, help='#Gradient steps per env step', default=1.0) 40 | parser.add_argument('-savetag', type=str, help='#Tag to append to savefile', default='') 41 | parser.add_argument('-gpu_id', type=int, help='#GPU ID ', default=0) 42 | parser.add_argument('-buffer_gpu', type=str2bool, help='#Store buffer in GPU?', default=0) 43 | parser.add_argument('-portfolio', type=int, help='Portfolio ID', default=10) 44 | parser.add_argument('-total_steps', type=float, help='#Total steps in the env in millions ', default=2) 45 | parser.add_argument('-batchsize', type=int, help='Batch size', default=256) 46 | parser.add_argument('-noise', type=float, help='Noise STD', default=0.1) 47 | 48 | 49 | POP_SIZE = vars(parser.parse_args())['pop_size'] 50 | BATCHSIZE = vars(parser.parse_args())['batchsize'] 51 | ROLLOUT_SIZE = vars(parser.parse_args())['rollout_size'] 52 | ENV_NAME = vars(parser.parse_args())['env'] 53 | GRADPERSTEP = vars(parser.parse_args())['gradperstep'] 54 | SAVETAG = vars(parser.parse_args())['savetag'] 55 | BUFFER_GPU = vars(parser.parse_args())['buffer_gpu'] 56 | SEED = vars(parser.parse_args())['seed'] 57 | GPU_DEVICE = vars(parser.parse_args())['gpu_id'] 58 | PORTFOLIO_ID = vars(parser.parse_args())['portfolio'] 59 | TOTAL_STEPS = int(vars(parser.parse_args())['total_steps'] * 1000000) 60 | NOISE_STD = vars(parser.parse_args())['noise'] 61 | os.environ["CUDA_VISIBLE_DEVICES"]=str(GPU_DEVICE) 62 | 63 | #ICML EXPERIMENT 64 | if PORTFOLIO_ID == 11 or PORTFOLIO_ID == 12 or PORTFOLIO_ID == 13 or PORTFOLIO_ID == 14 or PORTFOLIO_ID == 101 or PORTFOLIO_ID == 102: ISOLATE_PG = True 65 | else: 66 | ISOLATE_PG = False 67 | ALGO = "TD3" 68 | SAVE = True 69 | TEST_SIZE=10 70 | 71 | 72 | class Parameters: 73 | def __init__(self): 74 | """Parameter class stores all parameters for policy gradient 75 | 76 | Parameters: 77 | None 78 | 79 | Returns: 80 | None 81 | """ 82 | self.seed = SEED 83 | self.asynch_frac = 1.0 #Asynchronicity of NeuroEvolution 84 | self.algo = ALGO 85 | 86 | self.batch_size = BATCHSIZE #Batch size 87 | self.noise_std = NOISE_STD #Gaussian noise exploration std 88 | self.ucb_coefficient = 0.9 #Exploration coefficient in UCB 89 | self.gradperstep = GRADPERSTEP 90 | self.buffer_gpu = BUFFER_GPU 91 | self.rollout_size = ROLLOUT_SIZE #Size of learner rollouts 92 | 93 | #NeuroEvolution stuff 94 | self.pop_size = POP_SIZE 95 | self.elite_fraction = 0.2 96 | self.crossover_prob = 0.01 97 | self.mutation_prob = 0.2 98 | 99 |
#######unused######## 100 | self.extinction_prob = 0.005 # Probability of extinction event 101 | self.extinction_magnituide = 0.5 # Probability of extinction for each genome, given an extinction event 102 | self.weight_magnitude_limit = 10000000 103 | self.mut_distribution = 1 # 1-Gaussian, 2-Laplace, 3-Uniform 104 | 105 | 106 | #Env dimensions and save folders 107 | dummy_env = gym.make(ENV_NAME) 108 | self.state_dim = dummy_env.observation_space.shape[0]; self.action_dim = dummy_env.action_space.shape[0] 109 | self.action_low = float(dummy_env.action_space.low[0]); self.action_high = float(dummy_env.action_space.high[0]) 110 | self.savefolder = 'Results/' 111 | if not os.path.exists('Results/'): os.makedirs('Results/') 112 | self.aux_folder = self.savefolder + 'Auxiliary/' 113 | if not os.path.exists(self.aux_folder): os.makedirs(self.aux_folder) 114 | 115 | 116 | class CERL_Agent: 117 | """Main CERL class containing all methods for CERL 118 | 119 | Parameters: 120 | args (object): Parameter class with all the parameters 121 | 122 | """ 123 | 124 | def __init__(self, args): 125 | self.args = args 126 | self.evolver = SSNE(self.args) 127 | 128 | #MP TOOLS 129 | self.manager = Manager() 130 | 131 | #Genealogy tool 132 | self.genealogy = Genealogy() 133 | 134 | #Initialize population 135 | self.pop = self.manager.list() 136 | for _ in range(args.pop_size): 137 | wwid = self.genealogy.new_id('evo') 138 | if ALGO == 'SAC': self.pop.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, wwid)) 139 | else: self.pop.append(Actor(args.state_dim, args.action_dim, wwid)) 140 | 141 | if ALGO == "SAC": self.best_policy = GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1) 142 | else: 143 | self.best_policy = Actor(args.state_dim, args.action_dim, -1) 144 | 145 | 146 | #Turn off gradients and put in eval mode 147 | for actor in self.pop: 148 | actor = actor.cpu() 149 | actor.eval() 150 | 151 | #Init BUFFER 152 | self.replay_buffer = Buffer(1000000, self.args.buffer_gpu) 153 | 154 | #Initialize portfolio of learners 155 | self.portfolio = [] 156 | self.portfolio = initialize_portfolio(self.portfolio, self.args, self.genealogy, PORTFOLIO_ID) 157 | self.rollout_bucket = self.manager.list() 158 | for _ in range(len(self.portfolio)): 159 | if ALGO == 'SAC': self.rollout_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)) 160 | else: self.rollout_bucket.append(Actor(args.state_dim, args.action_dim, -1)) 161 | 162 | 163 | 164 | # Initialize shared data bucket 165 | self.data_bucket = self.replay_buffer.tuples 166 | 167 | ############## MULTIPROCESSING TOOLS ################### 168 | 169 | 170 | #Evolutionary population Rollout workers 171 | self.evo_task_pipes = [Pipe() for _ in range(args.pop_size)] 172 | self.evo_result_pipes = [Pipe() for _ in range(args.pop_size)] 173 | self.evo_workers = [Process(target=rollout_worker, args=(id, self.evo_task_pipes[id][1], self.evo_result_pipes[id][0], False, self.data_bucket, self.pop, ENV_NAME, None, ALGO)) for id in range(args.pop_size)] 174 | for worker in self.evo_workers: worker.start() 175 | self.evo_flag = [True for _ in range(args.pop_size)] 176 | 177 | #Learner rollout workers 178 | self.task_pipes = [Pipe() for _ in range(args.rollout_size)] 179 | self.result_pipes = [Pipe() for _ in range(args.rollout_size)] 180 | self.workers = [Process(target=rollout_worker, args=(id, self.task_pipes[id][1], self.result_pipes[id][0], True, self.data_bucket, self.rollout_bucket, ENV_NAME, args.noise_std, ALGO)) for id in
range(args.rollout_size)] 181 | for worker in self.workers: worker.start() 182 | self.roll_flag = [True for _ in range(args.rollout_size)] 183 | 184 | #Test bucket 185 | self.test_bucket = self.manager.list() 186 | if ALGO == 'SAC': 187 | self.test_bucket.append(GaussianPolicy(args.state_dim, args.action_dim, args.hidden_size, -1)) 188 | else: 189 | self.test_bucket.append(Actor(args.state_dim, args.action_dim, -1)) 190 | 191 | #5 Test workers 192 | self.test_task_pipes = [Pipe() for _ in range(TEST_SIZE)] 193 | self.test_result_pipes = [Pipe() for _ in range(TEST_SIZE)] 194 | self.test_workers = [Process(target=rollout_worker, args=(id, self.test_task_pipes[id][1], self.test_result_pipes[id][0], False, None, self.test_bucket, ENV_NAME, None, ALGO)) for id in range(TEST_SIZE)] 195 | for worker in self.test_workers: worker.start() 196 | self.test_flag = False 197 | 198 | #Meta-learning controller (Resource Distribution) 199 | self.allocation = [] #Allocation controls the resource allocation across learners 200 | for i in range(args.rollout_size): self.allocation.append(i % len(self.portfolio)) #Start uniformly (equal resources) 201 | #self.learner_stats = [{'fitnesses': [], 'ep_lens': [], 'value': 0.0, 'visit_count':0} for _ in range(len(self.portfolio))] #Track node statistsitic (each node is a learner), to compute UCB scores 202 | 203 | #Trackers 204 | self.best_score = 0.0; self.gen_frames = 0; self.total_frames = 0; self.best_shaped_score = None; self.test_score = None; self.test_std = None 205 | 206 | 207 | 208 | def train(self, gen, frame_tracker): 209 | """Main training loop to do rollouts, neureoevolution, and policy gradients 210 | 211 | Parameters: 212 | gen (int): Current epoch of training 213 | 214 | Returns: 215 | None 216 | """ 217 | ################ START ROLLOUTS ############## 218 | 219 | #Start Evolution rollouts 220 | if not ISOLATE_PG: 221 | for id, actor in enumerate(self.pop): 222 | if self.evo_flag[id]: 223 | self.evo_task_pipes[id][0].send(id) 224 | self.evo_flag[id] = False 225 | 226 | #Sync all learners actor to cpu (rollout) actor 227 | for i, learner in enumerate(self.portfolio): 228 | learner.algo.actor.cpu() 229 | utils.hard_update(self.rollout_bucket[i], learner.algo.actor) 230 | learner.algo.actor.cuda() 231 | 232 | # Start Learner rollouts 233 | for rollout_id, learner_id in enumerate(self.allocation): 234 | if self.roll_flag[rollout_id]: 235 | self.task_pipes[rollout_id][0].send(learner_id) 236 | self.roll_flag[rollout_id] = False 237 | 238 | #Start Test rollouts 239 | if gen % 5 == 0: 240 | self.test_flag = True 241 | for pipe in self.test_task_pipes: pipe[0].send(0) 242 | 243 | 244 | ############# UPDATE PARAMS USING GRADIENT DESCENT ########## 245 | if self.replay_buffer.__len__() > self.args.batch_size * 10: ###BURN IN PERIOD 246 | self.replay_buffer.tensorify() # Tensorify the buffer for fast sampling 247 | 248 | #Spin up threads for each learner 249 | threads = [threading.Thread(target=learner.update_parameters, args=(self.replay_buffer, self.args.buffer_gpu, self.args.batch_size, int(self.gen_frames * self.args.gradperstep))) for learner in 250 | self.portfolio] 251 | 252 | # Start threads 253 | for thread in threads: thread.start() 254 | 255 | #Join threads 256 | for thread in threads: thread.join() 257 | self.gen_frames = 0 258 | 259 | 260 | ########## SOFT -JOIN ROLLOUTS FOR EVO POPULATION ############ 261 | if not ISOLATE_PG: 262 | all_fitness = []; all_net_ids = []; all_eplens = [] 263 | while True: 264 | for i in range(self.args.pop_size): 265 
| if self.evo_result_pipes[i][1].poll(): 266 | entry = self.evo_result_pipes[i][1].recv() 267 | all_fitness.append(entry[1]); all_net_ids.append(entry[0]); all_eplens.append(entry[2]); self.gen_frames+= entry[2]; self.total_frames += entry[2] 268 | self.evo_flag[i] = True 269 | 270 | # Soft-join (wait for asynch_frac of the population to report) 271 | if len(all_fitness) / self.args.pop_size >= self.args.asynch_frac: break 272 | 273 | ########## HARD-JOIN ROLLOUTS FOR LEARNER ROLLOUTS ############ 274 | for i in range(self.args.rollout_size): 275 | entry = self.result_pipes[i][1].recv() 276 | learner_id = entry[0]; fitness = entry[1]; num_frames = entry[2] 277 | self.portfolio[learner_id].update_stats(fitness, num_frames) 278 | 279 | self.gen_frames += num_frames; self.total_frames += num_frames 280 | if fitness > self.best_score: self.best_score = fitness 281 | 282 | self.roll_flag[i] = True 283 | 284 | #Refresh buffer (housekeeping tasks - pruning to keep under capacity) 285 | self.replay_buffer.referesh() 286 | ######################### END OF PARALLEL ROLLOUTS ################ 287 | 288 | ############ PROCESS MAX FITNESS ############# 289 | if not ISOLATE_PG: 290 | champ_index = all_net_ids[all_fitness.index(max(all_fitness))] 291 | utils.hard_update(self.test_bucket[0], self.pop[champ_index]) 292 | if max(all_fitness) > self.best_score: 293 | self.best_score = max(all_fitness) 294 | utils.hard_update(self.best_policy, self.pop[champ_index]) 295 | if SAVE: 296 | torch.save(self.pop[champ_index].state_dict(), self.args.aux_folder + ENV_NAME+'_best'+SAVETAG) 297 | print("Best policy saved with score", '%.2f'%max(all_fitness)) 298 | 299 | else: #Run PG in isolation 300 | utils.hard_update(self.test_bucket[0], self.rollout_bucket[0]) 301 | 302 | ###### TEST SCORE ###### 303 | if self.test_flag: 304 | self.test_flag = False 305 | test_scores = [] 306 | for pipe in self.test_result_pipes: #Collect all results 307 | entry = pipe[1].recv() 308 | test_scores.append(entry[1]) 309 | test_scores = np.array(test_scores) 310 | test_mean = np.mean(test_scores); test_std = (np.std(test_scores)) 311 | 312 | # Update score to trackers 313 | frame_tracker.update([test_mean], self.total_frames) 314 | else: 315 | test_mean, test_std = None, None 316 | 317 | 318 | #NeuroEvolution's probabilistic selection and recombination step 319 | if not ISOLATE_PG: 320 | if gen % 5 == 0: 321 | self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, self.rollout_bucket) 322 | else: 323 | self.evolver.epoch(gen, self.genealogy, self.pop, all_net_ids, all_fitness, []) 324 | 325 | #META LEARNING - RESET ALLOCATION USING UCB 326 | if gen % 1 == 0: 327 | self.allocation = ucb(len(self.allocation), self.portfolio, self.args.ucb_coefficient) 328 | 329 | 330 | #Metrics 331 | if not ISOLATE_PG: 332 | champ_len = all_eplens[all_fitness.index(max(all_fitness))] 333 | champ_wwid = int(self.pop[champ_index].wwid.item()) 334 | max_fit = max(all_fitness) 335 | else: 336 | champ_len = num_frames; champ_wwid = int(self.rollout_bucket[0].wwid.item()) 337 | all_fitness = [fitness]; max_fit = fitness; all_eplens = [num_frames] 338 | 339 | return max_fit, champ_len, all_fitness, all_eplens, test_mean, test_std, champ_wwid 340 | 341 | if __name__ == "__main__": 342 | args = Parameters() # Create the Parameters class 343 | SAVETAG = SAVETAG + '_p' + str(PORTFOLIO_ID) 344 | SAVETAG = SAVETAG + '_s' + str(SEED) 345 | SAVETAG = SAVETAG + 'noise' + str(NOISE_STD) 346 | 347 | frame_tracker = utils.Tracker(args.savefolder, ['score_'+ENV_NAME+SAVETAG], '.csv') #Tracker
class to log progress 348 | max_tracker = utils.Tracker(args.aux_folder, ['pop_max_score_'+ENV_NAME+SAVETAG], '.csv') #Tracker class to log progress FOR MAX (NOT REPORTED) 349 | 350 | #Set seeds 351 | torch.manual_seed(args.seed); np.random.seed(args.seed); random.seed(args.seed) 352 | 353 | #INITIALIZE THE MAIN AGENT CLASS 354 | agent = CERL_Agent(args) #Initialize the agent 355 | print('Running CERL for', ENV_NAME, 'State_dim:', args.state_dim, ' Action_dim:', args.action_dim) 356 | 357 | time_start = time.time() 358 | for gen in range(1, 1000000000): #Infinite generations 359 | 360 | #Train one iteration 361 | best_score, test_len, all_fitness, all_eplen, test_mean, test_std, champ_wwid = agent.train(gen, frame_tracker) 362 | 363 | #PRINT PROGRESS 364 | print('Env', ENV_NAME, 'Gen', gen, 'Frames', agent.total_frames, ' Pop_max/max_ever:','%.2f'%best_score, '/','%.2f'%agent.best_score, ' Avg:','%.2f'%frame_tracker.all_tracker[0][1], 365 | ' Frames/sec:','%.2f'%(agent.total_frames/(time.time()-time_start)), 366 | ' Champ_len', '%.2f'%test_len, ' Test_score u/std', utils.pprint(test_mean), utils.pprint(test_std), 'savetag', SAVETAG, 'noise', args.noise_std) 367 | 368 | # # PRINT MORE DETAILED STATS PERIODICALLY 369 | if gen % 5 == 0: 370 | print('Learner Fitness', [utils.pprint(learner.value) for learner in agent.portfolio], 'Sum_stats_resource_allocation', [learner.visit_count for learner in agent.portfolio]) 371 | print('Pop/rollout size', args.pop_size,'/',args.rollout_size, 'gradperstep', args.gradperstep, 'Seed', SEED, 'Portfolio_id', PORTFOLIO_ID) 372 | try: 373 | print('Best Policy ever genealogy:', agent.genealogy.tree[int(agent.best_policy.wwid.item())].history) 374 | print('Champ genealogy:', agent.genealogy.tree[champ_wwid].history) 375 | except: None 376 | print() 377 | 378 | max_tracker.update([best_score], agent.total_frames) 379 | if agent.total_frames > TOTAL_STEPS: 380 | break 381 | 382 | #Save sum stats 383 | if PORTFOLIO_ID == 10 or PORTFOLIO_ID == 100: 384 | visit_counts = np.array([learner.visit_count for learner in agent.portfolio]) 385 | np.savetxt(args.aux_folder + 'allocation_' + ENV_NAME + SAVETAG, visit_counts, fmt='%.3f', delimiter=',') 386 | 387 | ###Kill all processes 388 | try: 389 | for p in agent.task_pipes: p[0].send('TERMINATE') 390 | for p in agent.test_task_pipes: p[0].send('TERMINATE') 391 | for p in agent.evo_task_pipes: p[0].send('TERMINATE') 392 | 393 | except: None 394 | 395 | 396 | --------------------------------------------------------------------------------
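
A minimal usage sketch of the UCB-based resource allocation driven from the main loop above: core/ucb.py only reads each learner's value and visit_count attributes, so a hypothetical stand-in object is enough to exercise ucb() the way CERL_Agent.train does every generation. The MockLearner class and the specific numbers below are illustrative assumptions, not part of the repository.

# Sketch (not part of the repo): calling core.ucb.ucb with a hypothetical
# stand-in for core.learner.Learner. Only the .value and .visit_count
# attributes read by ucb() are modeled here.
from core.ucb import ucb

class MockLearner:
    """Hypothetical stand-in for Learner; ucb() only reads .value and .visit_count."""
    def __init__(self, value, visit_count):
        self.value = value              # running estimate of the learner's quality
        self.visit_count = visit_count  # number of rollouts allocated so far

portfolio = [MockLearner(100.0, 5), MockLearner(250.0, 5), MockLearner(50.0, 5)]

# Ten rollout workers to hand out, exploration coefficient 0.9 (args.ucb_coefficient in main.py)
allocation = ucb(10, portfolio, 0.9)
print(allocation)  # e.g. [1, 0, 1, 1, 2, 1, 1, 0, 1, 2] -- one learner id per rollout worker (stochastic)

Because roulette_wheel samples from the normalized UCB scores instead of taking an argmax, higher-value learners tend to receive more rollout workers each generation without ever fully starving the others.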