├── .gitignore ├── LICENSE ├── README.md ├── configs ├── dqn_config.yml ├── searl_dqn_config.yml ├── searl_td3_config.yml └── td3_config.yml ├── images └── searl.png ├── requirements.txt ├── scripts ├── __init__.py ├── run_dqn.py ├── run_searl_dqn.py ├── run_searl_td3.py └── run_td3.py ├── searl ├── __init__.py ├── neuroevolution │ ├── __init__.py │ ├── components │ │ ├── __init__.py │ │ ├── envolvable_cnn.py │ │ ├── envolvable_mlp.py │ │ ├── individual_dqn.py │ │ ├── individual_td3.py │ │ ├── replay_memory.py │ │ └── utils.py │ ├── evaluation_dqn.py │ ├── evaluation_td3.py │ ├── mutation_cnn.py │ ├── mutation_mlp.py │ ├── searl_dqn.py │ ├── searl_td3.py │ ├── tournament_selection.py │ ├── training_dqn.py │ └── training_td3.py ├── rl_algorithms │ ├── __init__.py │ ├── components │ │ ├── __init__.py │ │ ├── replay_memory.py │ │ └── wrappers.py │ ├── dqn.py │ └── td3.py └── utils │ ├── __init__.py │ ├── handler │ ├── __init__.py │ ├── base_handler.py │ ├── checkpoint.py │ ├── config.py │ └── folder.py │ ├── log │ ├── __init__.py │ ├── csv.py │ ├── json.py │ ├── logger.py │ ├── pkl.py │ └── txt.py │ └── supporter.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Experiment Folders 2 | experiments/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | venv* 93 | .venv* 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # default data folder 109 | data_babi/ 110 | data_cnn/ 111 | data_tmp/ 112 | 113 | # pycharm 114 | .idea/ 115 | 116 | # folder 117 | .experiments/* 118 | .experiment/* 119 | .tmp/ 120 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 |
180 |    To apply the Apache License to your work, attach the following
181 |    boilerplate notice, with the fields enclosed by brackets "[]"
182 |    replaced with your own identifying information. (Don't include
183 |    the brackets!) The text should be enclosed in the appropriate
184 |    comment syntax for the file format. We also recommend that a
185 |    file or class name and description of purpose be included on the
186 |    same "printed page" as the copyright notice for easier
187 |    identification within third-party archives.
188 |
189 |    Copyright [2021] [Jörg Franke]
190 |
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 |
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 |
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sample-Efficient Automated Deep Reinforcement Learning
2 |
3 | [![Python](https://img.shields.io/badge/python-3.6-yellow.svg)](https://www.python.org/downloads/release/python-3611/)
4 | [![PyTorch](https://img.shields.io/badge/PyTorch-1.6-yellow.svg)](https://pytorch.org/)
5 |
6 | This repository contains source code accompanying the ICLR 2021 publication:
7 |
8 | > [Sample-Efficient Automated Deep Reinforcement Learning](https://openreview.net/forum?id=hSjxQ3B7GWq) \
9 | > Jörg K.H. Franke, Gregor Koehler, André Biedenkapp, Frank Hutter \
10 | > In *Proceedings of the International Conference on Learning Representations (ICLR 2021)*, 2021.
11 |
12 | Sample-Efficient Automated Deep Reinforcement Learning (SEARL) jointly trains an off-policy RL agent and optimizes its
13 | hyperparameters, including the neural network architecture. SEARL uses a population of agents with different
14 | hyperparameters and an evolutionary outer loop (Figure below) similar to PBT. During evaluation, each
15 | agent/hyperparameter combination gets a fitness score, and the environment interactions are stored as transitions in a
16 | shared replay memory. Based on the fitness score, the best agent/hyperparameter combinations are selected, their
17 | hyperparameters are mutated, the agents are trained with samples from the shared replay memory, and they are evaluated
18 | again. The population in SEARL benefits from the diverse experience collected in the shared replay memory. SEARL trains
19 | an RL agent and jointly finds optimal hyperparameters and neural architectures with up to ten times fewer environment
20 | interactions than random search or PBT.
21 |
22 | ![searl algorithm](images/searl.png)
23 |
24 | SEARL allows training an agent while simultaneously and automatically tuning its hyperparameters with nearly the same
25 | number of environment steps. For a fair comparison, we introduce a new evaluation protocol that accounts for all
26 | environment interactions during an algorithm's HPO.
27 |
28 |
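To make the evolutionary outer loop described above concrete, here is a small, self-contained toy sketch. It is an editorial illustration, not code from this repository: the agents, the stubbed fitness, and all names are placeholders.

```python
import random

POPULATION_SIZE, GENERATIONS, TOURNAMENT_SIZE = 10, 5, 3

def random_agent():
    # An "agent" here is just a bag of hyperparameters plus a fitness slot.
    return {"lr": 10 ** random.uniform(-5, -3), "hidden": random.choice([64, 128, 256]), "fitness": None}

def evaluate(agent, replay_memory):
    # Stand-in for evaluation episodes: store the (fake) environment interactions
    # in the shared replay memory and return a fitness score.
    replay_memory.extend([("state", "action", "reward", "next_state", "done")] * 10)
    return random.random()

def mutate(agent):
    # Stand-in for SEARL's mutation operators (RL hyperparameters, architecture, weights).
    child = dict(agent)
    child["lr"] *= random.choice([0.5, 1.0, 2.0])
    return child

replay_memory = []                                      # shared by the whole population
population = [random_agent() for _ in range(POPULATION_SIZE)]

for generation in range(GENERATIONS):
    for agent in population:                            # evaluation fills the shared memory
        agent["fitness"] = evaluate(agent, replay_memory)
    parents = [max(random.sample(population, TOURNAMENT_SIZE), key=lambda a: a["fitness"])
               for _ in range(POPULATION_SIZE)]         # tournament selection
    population = [mutate(parent) for parent in parents]
    # RL training on samples from replay_memory would follow here before the next evaluation.
```

In the actual implementation, these steps live in `searl/neuroevolution/` (`evaluation_*.py`, `mutation_*.py`, `training_*.py`, `tournament_selection.py`), and the shared memory is the multiprocessing replay memory in `searl/neuroevolution/components/replay_memory.py`.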
29 | ## Usage
30 |
31 |
32 | ### Requirements
33 |
34 | The source code is tested on Linux with Python 3.6. Please install the `requirements.txt` and make sure you have
35 | MuJoCo version 2.0 binaries in your home folder as well as a valid licence.
36 | You can obtain a 30-day free trial licence on the [MuJoCo website](https://www.roboti.us/license.html).
37 | The license key will arrive in an email with your username and password. You can download the MuJoCo version 2.0
38 | binaries for [Linux](https://www.roboti.us/download/mujoco200_linux.zip). Please unzip the downloaded `mujoco200`
39 | directory into `~/.mujoco/mujoco200`, and place your license key (the `mjkey.txt` file from your email)
40 | at `~/.mujoco/mjkey.txt`.
41 |
42 | If you run into installation issues, please see [openai/mujoco-py](https://github.com/openai/mujoco-py), the
43 | [full documentation](https://openai.github.io/mujoco-py/build/html/index.html) or the
44 | [mujoco-py issues section](https://github.com/openai/mujoco-py/issues).
45 |
46 | ### Run experiments
47 |
48 | The `scripts` folder contains run scripts for the TD3 and DQN baselines as well as for the SEARL experiments on
49 | TD3 and DQN. By default, each script loads its experiment configuration from the `configs` folder. You can also
50 | use a custom config file or experiment directory via the `--config_file` and `--expt_dir` arguments.
51 |
52 | ## Cite
53 |
54 | If you use this code in your own work, please cite SEARL using the following bibtex entry:
55 |
56 | ```
57 | @inproceedings{franke2020searl,
58 |   title={Sample-Efficient Automated Deep Reinforcement Learning},
59 |   author={Franke, J{\"o}rg KH and K{\"o}hler, Gregor and Biedenkapp, Andr{\'e} and Hutter, Frank},
60 |   booktitle={International Conference on Learning Representations},
61 |   year={2021},
62 | }
63 | ```
64 |
--------------------------------------------------------------------------------
/configs/dqn_config.yml:
--------------------------------------------------------------------------------
1 | dqn:
2 |   optimizer: 'adam'
3 |   lr_actor: 0.0001
4 |   rm_capacity: 1000000
5 |   batch_size: 128
6 |   gamma: 0.99
7 |   soft_update: True
8 |
9 |   num_frames: 1000000
10 |   replay_initial: 10000
11 |   start_timesteps: 5000
12 |   eval_episodes: 10
13 |
14 |   eval_freq: 10000
15 |
16 |   reset_target: False
17 |   recreate_optim: False
18 |   min_eval_steps: 250
19 |
20 | seed:
21 |   numpy: 123
22 |   torch: 123
23 |   env: 123
24 |
25 | env:
26 |   name: 'PongNoFrameskip-v4' # 'FreewayNoFrameskip-v4', 'EnduroNoFrameskip-v4', 'BoxingNoFrameskip-v4', 'RoadRunnerNoFrameskip-v4',
27 |
28 | expt:
29 |   project_name: 'searl'
30 |   session_name: 'baseline'
31 |   experiment_name: 'default_dqn'
32 |
33 | support:
34 |   save_models: False
35 |
36 | actor:
37 |   channel_size: [32, 64, 64]
38 |   kernal_size: [8, 4, 3]
39 |   stride_size: [4, 2, 1]
40 |   hidden_size: [128]
41 |   num_atoms: 51
42 |   Vmin: -10
43 |   Vmax: 10
44 |   mlp_activation: "relu"
45 |   cnn_activation: "relu"
46 |   layer_norm: False
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/configs/searl_dqn_config.yml:
--------------------------------------------------------------------------------
1 | #######################################
2 | ### Logging Configuration ###
3 | #######################################
4 | expt:
5 |   project_name: "searl"
6 |   session_name: "neuroevolution"
7 |   experiment_name: "default_searl_dqn"
8 |
9 |
10 | #######################################
11 | ### NEVO Configuration ###
12 | #######################################
13 | nevo:
14 |   population_size: 10
15 |
tournament_size: 3 16 | selection: True 17 | mutation: True 18 | training: True 19 | elitism: True 20 | min_train_time: 250 21 | worker: 2 22 | reuse_batch: 1 23 | ind_memory: False 24 | init_random: False 25 | 26 | 27 | mutation: 28 | no_mutation: 0.2 29 | parameters: 0.2 30 | architecture: 0.2 31 | activation: 0.2 32 | rl_hyperparam: 0.2 33 | rl_hp_selection: ['lr_actor'] 34 | new_layer_prob: 0.2 35 | mutation_sd: 0.1 36 | 37 | 38 | train: 39 | replay_memory_size: 2000000 40 | num_frames: 2000000 41 | td3_double_q: False 42 | evo_warm_up: 1 43 | min_train_steps: 1000 44 | max_train_steps: 50000 45 | 46 | 47 | rl: 48 | train_frames_fraction: 0.5 # 5000 train_iternations 49 | gamma: 0.99 50 | soft_update: True 51 | tau: 0.005 52 | batch_size: 128 53 | lr_actor: 0.0001 54 | optimizer: "adam" ## ["adam", "adamax", "rmsprop", "sdg"] 55 | start_timesteps: 10000 56 | 57 | rm_capacity: 2000000 58 | 59 | num_frames: 50000000 60 | replay_initial: 10000 61 | eval_episodes: 10 62 | 63 | eval_freq: 10000 64 | 65 | reset_target: False 66 | recreate_optim: False 67 | min_eval_steps: 200 68 | 69 | num_atoms: 51 70 | Vmin: -10 71 | Vmax: 10 72 | 73 | 74 | seed: 75 | replay_memory: 123 76 | evaluation: 123 77 | mutation: 123 78 | training: 123 79 | torch: 123 80 | numpy: 123 81 | 82 | 83 | ####################################### 84 | ### Environment Configuration ### 85 | ####################################### 86 | env: 87 | name: 'PongNoFrameskip-v4' 88 | 89 | 90 | eval: 91 | eval_episodes: 1 92 | min_eval_steps: 250 93 | exploration_noise: 0.1 # Default 0.1 94 | test_episodes: 10 95 | test_seed: 123 96 | 97 | 98 | ####################################### 99 | ### Actor Starting Configuration ### 100 | ####################################### 101 | actor: 102 | channel_size: [32, 32] 103 | kernal_size: [8, 4] 104 | stride_size: [4, 2] 105 | hidden_size: [128] 106 | num_atoms: 51 107 | Vmin: -10 108 | Vmax: 10 109 | mlp_activation: "relu" 110 | cnn_activation: "relu" 111 | layer_norm: False 112 | 113 | -------------------------------------------------------------------------------- /configs/searl_td3_config.yml: -------------------------------------------------------------------------------- 1 | ####################################### 2 | ### Logging Configuration ### 3 | ####################################### 4 | expt: 5 | project_name: "searl" 6 | session_name: "neuroevolution" 7 | experiment_name: "default_searl_td3" 8 | 9 | 10 | ####################################### 11 | ### NEVO Configuration ### 12 | ####################################### 13 | nevo: 14 | population_size: 10 15 | tournament_size: 3 16 | selection: True 17 | mutation: True 18 | training: True 19 | elitism: True 20 | min_train_time: 200 21 | worker: 5 22 | reuse_batch: 1 23 | ind_memory: False 24 | init_random: False 25 | 26 | 27 | mutation: 28 | no_mutation: 0.2 29 | parameters: 0.2 30 | architecture: 0.2 31 | activation: 0.2 32 | rl_hyperparam: 0.2 33 | rl_hp_selection: ['lr_actor','lr_critic'] # 'train_frames_fraction','batch_size',,'td3_policy_noise','td3_update_freq', 'optimizer'] 34 | new_layer_prob: 0.2 35 | mutation_sd: 0.1 36 | 37 | 38 | train: 39 | replay_memory_size: 1000000 40 | num_frames: 2000000 41 | td3_double_q: True 42 | evo_warm_up: 1 43 | min_train_steps: 250 44 | 45 | 46 | rl: 47 | train_frames_fraction: 0.5 # 5000 train_iternations 48 | gamma: 0.99 49 | tau: 0.005 50 | batch_size: 100 51 | lr_actor: 0.001 52 | lr_critic: 0.001 53 | clip_grad_norm: 100 54 | td3_policy_noise: 0.2 # False or TD3 default: 0.2 55 | 
td3_noise_clip: 0.5 # default 0.5 56 | td3_update_freq: 2 # 1 or TD3 default: 2 57 | optimizer: "adam" ## ["adam", "adamax", "rmsprop", "sdg"] 58 | start_timesteps: 1 59 | 60 | 61 | seed: 62 | replay_memory: 123 63 | evaluation: 123 64 | mutation: 123 65 | training: 123 66 | torch: 123 67 | numpy: 123 68 | 69 | 70 | ####################################### 71 | ### Environment Configuration ### 72 | ####################################### 73 | env: 74 | name: 'Walker2d-v2' #'Walker2d-v2' #'HalfCheetah-v2' # HalfCheetah-v2' 75 | 76 | 77 | eval: 78 | eval_episodes: 1 79 | min_eval_steps: 250 80 | exploration_noise: 0.1 # Default 0.1 81 | test_episodes: 10 82 | test_seed: 123 83 | 84 | 85 | ####################################### 86 | ### Actor Starting Configuration ### 87 | ####################################### 88 | actor: 89 | hidden_size: [128] 90 | activation: 'relu' #'relu' , 'sigmoid' 'softplus', 91 | output_activation: 'tanh' 92 | layer_norm: True 93 | output_vanish: False 94 | 95 | ####################################### 96 | ### Critic Starting Configuration ### 97 | ####################################### 98 | critic: 99 | hidden_size: [128] 100 | activation: 'relu' #'relu' , 'sigmoid' 'softplus', 101 | output_activation: 'linear' 102 | layer_norm: True 103 | output_vanish: True 104 | -------------------------------------------------------------------------------- /configs/td3_config.yml: -------------------------------------------------------------------------------- 1 | td3: 2 | gamma: 0.99 3 | tau: 0.005 4 | lr_actor: 0.001 5 | lr_critic: 0.001 6 | batch_size: 100 7 | double_q: True 8 | clip_grad_norm: 100 9 | td3_policy_noise: 0.2 # False or TD3 default: 0.2 10 | td3_noise_clip: 0.5 # default 0.5 11 | td3_update_freq: 2 # 1 or TD3 default: 2 12 | optimizer: 'adam' 13 | rm_capacity: 1000000 14 | eval_freq: 5000 15 | start_timesteps: 10000 16 | exploration_noise: 0.1 17 | eval_episodes: 10 18 | max_timesteps: 2000000 19 | reset_target: False 20 | recreate_optim: False 21 | 22 | seed: 23 | numpy: 123 24 | torch: 123 25 | env: 123 26 | 27 | env: 28 | name: 'HalfCheetah-v2' 29 | 30 | expt: 31 | project_name: 'searl' 32 | session_name: 'baseline' 33 | experiment_name: 'default_td3' 34 | 35 | support: 36 | save_models: False 37 | 38 | actor: 39 | hidden_size: [128] 40 | activation: 'relu' # 'sigmoid' 'softplus', 41 | output_activation: 'tanh' 42 | layer_norm: True 43 | output_vanish: False 44 | 45 | critic: 46 | hidden_size: [128] 47 | activation: 'relu' # 'sigmoid' 'softplus' 48 | output_activation: 'linear' 49 | layer_norm: True 50 | output_vanish: True 51 | -------------------------------------------------------------------------------- /images/searl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/images/searl.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cffi==1.14.5 2 | cloudpickle==1.3.0 3 | Cython==0.29.22 4 | fasteners==0.16 5 | fastrand==1.3.0 6 | future==0.18.2 7 | glfw==2.1.0 8 | gym==0.17.1 9 | imageio==2.9.0 10 | mujoco-py==2.0.2.9 11 | numpy==1.20.1 12 | opencv-python==4.5.1.48 13 | Pillow==8.1.2 14 | pyaml==20.4.0 15 | pycparser==2.20 16 | pyglet==1.5.0 17 | PyYAML==5.4.1 18 | scipy==1.6.1 19 | six==1.15.0 20 | torch==1.6.0 21 | 
-------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/run_dqn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | from pathlib import Path 5 | 6 | from searl.rl_algorithms.dqn import start_DQN_training 7 | 8 | parser = argparse.ArgumentParser(description='define cluster setup') 9 | 10 | parser.add_argument('--expt_dir', type=str, default=False, help='expt_dir') 11 | parser.add_argument('--config_file', type=str, default=False, help='config_dir') 12 | args = parser.parse_args() 13 | 14 | if args.config_file == False: 15 | print("no config file") 16 | config_file = Path(os.getcwd()).parents[0] / "configs/dqn_config.yml" 17 | else: 18 | config_file = args.config_file 19 | 20 | if args.expt_dir == False: 21 | print("no experiment dir") 22 | expt_dir = Path(os.getcwd()).parents[0] / "experiments" 23 | else: 24 | expt_dir = args.expt_dir 25 | 26 | 27 | with open(config_file, 'r') as f: 28 | config_dict = yaml.load(f, Loader=yaml.Loader) 29 | 30 | start_DQN_training(config_dict, expt_dir=expt_dir) 31 | -------------------------------------------------------------------------------- /scripts/run_searl_dqn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | from pathlib import Path 5 | 6 | from searl.neuroevolution.searl_dqn import start_searl_dqn_run 7 | 8 | parser = argparse.ArgumentParser(description='define cluster setup') 9 | 10 | parser.add_argument('--expt_dir', type=str, default=False, help='expt_dir') 11 | parser.add_argument('--config_file', type=str, default=False, help='config_dir') 12 | args = parser.parse_args() 13 | 14 | if args.config_file == False: 15 | print("no config file") 16 | config_file = Path(os.getcwd()).parents[0] / "configs/searl_dqn_config.yml" 17 | else: 18 | config_file = args.config_file 19 | 20 | if args.expt_dir == False: 21 | print("no experiment dir") 22 | expt_dir = Path(os.getcwd()).parents[0] / "experiments" 23 | else: 24 | expt_dir = args.expt_dir 25 | 26 | with open(config_file, 'r') as f: 27 | config_dict = yaml.load(f, Loader=yaml.Loader) 28 | 29 | start_searl_dqn_run(config_dict, expt_dir=expt_dir) 30 | -------------------------------------------------------------------------------- /scripts/run_searl_td3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | from pathlib import Path 5 | 6 | from searl.neuroevolution.searl_td3 import start_searl_td3_run 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser(description='define cluster setup') 10 | 11 | parser.add_argument('--expt_dir', type=str, default=False, help='expt_dir') 12 | parser.add_argument('--config_file', type=str, default=False, help='config_dir') 13 | args = parser.parse_args() 14 | 15 | if args.config_file == False: 16 | print("no config file") 17 | config_file = Path(os.getcwd()).parents[0] / "configs/searl_td3_config.yml" 18 | else: 19 | config_file = args.config_file 20 | 21 | if args.expt_dir == False: 22 | print("no experiment dir") 23 | expt_dir = Path(os.getcwd()).parents[0] / "experiments" 
24 | else: 25 | expt_dir = args.expt_dir 26 | 27 | os.environ["LD_LIBRARY_PATH"] = f"$LD_LIBRARY_PATH:{str(Path.home())}/.mujoco/mujoco200/bin:/usr/lib/nvidia-384" 28 | 29 | with open(config_file, 'r') as f: 30 | config_dict = yaml.load(f, Loader=yaml.Loader) 31 | 32 | start_searl_td3_run(config_dict, expt_dir=expt_dir) 33 | -------------------------------------------------------------------------------- /scripts/run_td3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | from pathlib import Path 5 | 6 | from searl.rl_algorithms.td3 import start_TD3_training 7 | 8 | parser = argparse.ArgumentParser(description='define cluster setup') 9 | 10 | parser.add_argument('--expt_dir', type=str, default=False, help='expt_dir') 11 | parser.add_argument('--config_file', type=str, default=False, help='config_dir') 12 | args = parser.parse_args() 13 | 14 | if args.config_file == False: 15 | print("no config file") 16 | config_file = Path(os.getcwd()).parents[0] / "configs/td3_config.yml" 17 | else: 18 | config_file = args.config_file 19 | 20 | if args.expt_dir == False: 21 | print("no experiment dir") 22 | expt_dir = Path(os.getcwd()).parents[0] / "experiments" 23 | else: 24 | expt_dir = args.expt_dir 25 | 26 | os.environ["LD_LIBRARY_PATH"] = f"$LD_LIBRARY_PATH:{str(Path.home())}/.mujoco/mujoco200/bin:/usr/lib/nvidia-384" 27 | 28 | with open(config_file, 'r') as f: 29 | config_dict = yaml.load(f, Loader=yaml.Loader) 30 | 31 | start_TD3_training(config_dict, expt_dir=expt_dir) 32 | -------------------------------------------------------------------------------- /searl/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils.handler.config import ConfigHandler -------------------------------------------------------------------------------- /searl/neuroevolution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/neuroevolution/__init__.py -------------------------------------------------------------------------------- /searl/neuroevolution/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/neuroevolution/components/__init__.py -------------------------------------------------------------------------------- /searl/neuroevolution/components/envolvable_cnn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | from collections import OrderedDict 4 | from typing import List 5 | 6 | import numpy as np 7 | import torch 8 | import torch.autograd as autograd 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class NoisyLinear(nn.Module): 14 | def __init__(self, in_features, out_features, std_init=0.4): 15 | super(NoisyLinear, self).__init__() 16 | 17 | self.in_features = in_features 18 | self.out_features = out_features 19 | self.std_init = std_init 20 | 21 | self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features)) 22 | self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features)) 23 | self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features)) 24 | 25 | self.bias_mu = nn.Parameter(torch.FloatTensor(out_features)) 26 | self.bias_sigma = 
nn.Parameter(torch.FloatTensor(out_features)) 27 | self.register_buffer('bias_epsilon', torch.FloatTensor(out_features)) 28 | 29 | self.reset_parameters() 30 | self.reset_noise() 31 | 32 | def forward(self, x): 33 | 34 | weight_epsilon = self.weight_epsilon.to(x.device) 35 | bias_epsilon = self.bias_epsilon.to(x.device) 36 | 37 | if self.training: 38 | weight = self.weight_mu + self.weight_sigma.mul(weight_epsilon) 39 | bias = self.bias_mu + self.bias_sigma.mul(bias_epsilon) 40 | else: 41 | weight = self.weight_mu 42 | bias = self.bias_mu 43 | 44 | return F.linear(x, weight, bias) 45 | 46 | def reset_parameters(self): 47 | mu_range = 1 / math.sqrt(self.weight_mu.size(1)) 48 | 49 | self.weight_mu.data.uniform_(-mu_range, mu_range) 50 | self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1))) 51 | 52 | self.bias_mu.data.uniform_(-mu_range, mu_range) 53 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0))) 54 | 55 | def reset_noise(self): 56 | epsilon_in = self._scale_noise(self.in_features) 57 | epsilon_out = self._scale_noise(self.out_features) 58 | 59 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 60 | self.bias_epsilon.copy_(epsilon_out) 61 | 62 | def _scale_noise(self, size): 63 | x = torch.randn(size) 64 | x = x.sign().mul(x.abs().sqrt()) 65 | return x 66 | 67 | 68 | class EvolvableCnnDQN(nn.Module): 69 | 70 | def __init__(self, input_shape: List[int], 71 | channel_size: List[int], 72 | kernal_size: List[int], 73 | stride_size: List[int], 74 | hidden_size: List[int], 75 | num_actions: int, 76 | num_atoms: int, 77 | Vmin: int, 78 | Vmax: int, 79 | mlp_activation='relu', 80 | cnn_activation='relu', 81 | layer_norm=False, stored_values=None, device="cpu"): 82 | 83 | super(EvolvableCnnDQN, self).__init__() 84 | 85 | self.input_shape = input_shape 86 | self.channel_size = channel_size 87 | self.kernal_size = kernal_size 88 | self.stride_size = stride_size 89 | self.hidden_size = hidden_size 90 | self.num_actions = num_actions 91 | self.num_atoms = num_atoms 92 | self.Vmin = Vmin 93 | self.Vmax = Vmax 94 | self.mlp_activation = mlp_activation 95 | self.cnn_activation = cnn_activation 96 | self.layer_norm = layer_norm 97 | self.device = device 98 | 99 | self.net = self.create_nets() 100 | self.feature_net, self.value_net, self.advantage_net = self.create_nets() 101 | 102 | if stored_values is not None: 103 | self.inject_parameters(pvec=stored_values, without_layer_norm=False) 104 | 105 | def get_activation(self, activation_names): 106 | 107 | activation_functions = {'tanh': nn.Tanh, 'gelu': nn.GELU, 'relu': nn.ReLU, 'elu': nn.ELU, 108 | 'softsign': nn.Softsign, 'sigmoid': nn.Sigmoid, 'softplus': nn.Softplus, 109 | 'lrelu': nn.LeakyReLU, 'prelu': nn.PReLU, } 110 | return activation_functions[activation_names]() 111 | 112 | def create_mlp(self, input_size, output_size, hidden_size, name): 113 | 114 | net_dict = OrderedDict() 115 | 116 | net_dict[f"{name}_linear_layer_0"] = NoisyLinear(input_size, hidden_size[0]) 117 | if self.layer_norm: 118 | net_dict[f"{name}_layer_norm_0"] = nn.LayerNorm(hidden_size[0]) 119 | net_dict[f"{name}_activation_0"] = self.get_activation(self.mlp_activation) 120 | 121 | if len(hidden_size) > 1: 122 | for l_no in range(1, len(hidden_size)): 123 | net_dict[f"{name}_linear_layer_{str(l_no)}"] = NoisyLinear(hidden_size[l_no - 1], hidden_size[l_no]) 124 | if self.layer_norm: 125 | net_dict[f"{name}_layer_norm_{str(l_no)}"] = nn.LayerNorm(hidden_size[l_no]) 126 | net_dict[f"{name}_activation_{str(l_no)}"] = 
self.get_activation(self.mlp_activation) 127 | net_dict[f"{name}_linear_layer_output"] = NoisyLinear(hidden_size[-1], output_size) 128 | return nn.Sequential(net_dict) 129 | 130 | def create_cnn(self, input_size, channel_size, kernal_size, stride_size, name): 131 | 132 | net_dict = OrderedDict() 133 | 134 | net_dict[f"{name}_conv_layer_0"] = nn.Conv2d(in_channels=input_size, out_channels=channel_size[0], 135 | kernel_size=kernal_size[0], 136 | stride=stride_size[0]) 137 | if self.layer_norm: 138 | net_dict[f"{name}_layer_norm_0"] = nn.BatchNorm2d(channel_size[0]) 139 | net_dict[f"{name}_activation_0"] = self.get_activation(self.cnn_activation) 140 | 141 | if len(channel_size) > 1: 142 | for l_no in range(1, len(channel_size)): 143 | net_dict[f"{name}_conv_layer_{str(l_no)}"] = nn.Conv2d(in_channels=channel_size[l_no - 1], 144 | out_channels=channel_size[l_no], 145 | kernel_size=kernal_size[l_no], 146 | stride=stride_size[l_no]) 147 | if self.layer_norm: 148 | net_dict[f"{name}_layer_norm_{str(l_no)}"] = nn.BatchNorm2d(channel_size[l_no]) 149 | net_dict[f"{name}_activation_{str(l_no)}"] = self.get_activation(self.cnn_activation) 150 | 151 | return nn.Sequential(net_dict) 152 | 153 | def create_nets(self): 154 | 155 | feature_net = self.create_cnn(self.input_shape[0], self.channel_size, self.kernal_size, self.stride_size, 156 | name="feature") 157 | 158 | input_size = feature_net(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1) 159 | 160 | value_net = self.create_mlp(input_size, output_size=self.num_atoms, hidden_size=self.hidden_size, name="value") 161 | advantage_net = self.create_mlp(input_size, output_size=self.num_atoms * self.num_actions, 162 | hidden_size=self.hidden_size, 163 | name="adcantage") 164 | 165 | feature_net.to(self.device) 166 | value_net.to(self.device) 167 | advantage_net.to(self.device) 168 | 169 | return feature_net, value_net, advantage_net 170 | 171 | def reset_noise(self): 172 | for l in self.value_net: 173 | if isinstance(l, NoisyLinear): 174 | l.reset_noise() 175 | for l in self.advantage_net: 176 | if isinstance(l, NoisyLinear): 177 | l.reset_noise() 178 | 179 | def forward(self, x): 180 | if not isinstance(x, torch.Tensor): 181 | x = torch.FloatTensor(x) 182 | 183 | batch_size = x.size(0) 184 | x = x / 255. 
185 | 186 | x = self.feature_net(x) 187 | x = x.view(batch_size, -1) 188 | 189 | value = self.value_net(x) 190 | advantage = self.advantage_net(x) 191 | 192 | value = value.view(batch_size, 1, self.num_atoms) 193 | advantage = advantage.view(batch_size, self.num_actions, self.num_atoms) 194 | 195 | x = value + advantage - advantage.mean(1, keepdim=True) 196 | x = F.softmax(x.view(-1, self.num_atoms), dim=-1).view(-1, self.num_actions, self.num_atoms) 197 | 198 | return x 199 | 200 | def act(self, state): 201 | 202 | if not isinstance(state, torch.Tensor): 203 | state = torch.FloatTensor(np.float32(state)).unsqueeze(0) 204 | 205 | state = state.to(self.device) 206 | 207 | dist = self.forward(state).data.cpu() 208 | dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms) 209 | action = dist.sum(2).max(1)[1].numpy()[0] 210 | return action 211 | 212 | @property 213 | def short_dict(self): 214 | short_dict = {"channel_size": self.channel_size, "kernal_size": self.kernal_size, 215 | "stride_size": self.stride_size, "hidden_size": self.hidden_size, 216 | "num_atoms": self.num_atoms, 217 | "Vmin": self.Vmin, "Vmax": self.Vmax, 218 | "mlp_activation": self.mlp_activation, "cnn_activation": self.cnn_activation, 219 | "layer_norm": self.layer_norm} 220 | return short_dict 221 | 222 | @property 223 | def init_dict(self): 224 | initdict = {"input_shape": self.input_shape, "channel_size": self.channel_size, "kernal_size": self.kernal_size, 225 | "stride_size": self.stride_size, "hidden_size": self.hidden_size, 226 | "num_actions": self.num_actions, "num_atoms": self.num_atoms, 227 | "Vmin": self.Vmin, "Vmax": self.Vmax, 228 | "mlp_activation": self.mlp_activation, "cnn_activation": self.cnn_activation, 229 | "layer_norm": self.layer_norm, "device": self.device} 230 | return initdict 231 | 232 | def get_model_dict(self): 233 | 234 | model_dict = self.init_dict 235 | model_dict.update({'stored_values': self.extract_parameters(without_layer_norm=False)}) 236 | return model_dict 237 | 238 | def count_parameters(self, without_layer_norm=False): 239 | count = 0 240 | for name, param in self.named_parameters(): 241 | if not without_layer_norm or not 'layer_norm' in name: 242 | count += param.data.cpu().numpy().flatten().shape[0] 243 | return count 244 | 245 | def extract_grad(self, without_layer_norm=False): 246 | tot_size = self.count_parameters(without_layer_norm) 247 | pvec = np.zeros(tot_size, np.float32) 248 | count = 0 249 | for name, param in self.named_parameters(): 250 | if not without_layer_norm or not 'layer_norm' in name: 251 | sz = param.grad.data.cpu().numpy().flatten().shape[0] 252 | pvec[count:count + sz] = param.grad.data.cpu().numpy().flatten() 253 | count += sz 254 | return pvec.copy() 255 | 256 | def extract_parameters(self, without_layer_norm=False): 257 | tot_size = self.count_parameters(without_layer_norm) 258 | pvec = np.zeros(tot_size, np.float32) 259 | count = 0 260 | for name, param in self.named_parameters(): 261 | if not without_layer_norm or not 'layer_norm' in name: 262 | sz = param.data.cpu().detach().numpy().flatten().shape[0] 263 | pvec[count:count + sz] = param.data.cpu().detach().numpy().flatten() 264 | count += sz 265 | return copy.deepcopy(pvec) 266 | 267 | def inject_parameters(self, pvec, without_layer_norm=False): 268 | count = 0 269 | 270 | for name, param in self.named_parameters(): 271 | if not without_layer_norm or not 'layer_norm' in name: 272 | sz = param.data.cpu().numpy().flatten().shape[0] 273 | raw = pvec[count:count + sz] 274 | reshaped = 
raw.reshape(param.data.cpu().numpy().shape) 275 | param.data = torch.from_numpy(copy.deepcopy(reshaped)).type(torch.FloatTensor) 276 | count += sz 277 | return pvec 278 | 279 | def add_mlp_layer(self): 280 | if len(self.hidden_size) < 3: # HARD LIMIT 281 | self.hidden_size += [self.hidden_size[-1]] 282 | 283 | self.recreate_nets() 284 | else: 285 | self.add_mlp_node() 286 | 287 | def add_mlp_node(self, hidden_layer=None, numb_new_nodes=None): 288 | if hidden_layer is None: 289 | hidden_layer = np.random.randint(0, len(self.hidden_size), 1)[0] 290 | else: 291 | hidden_layer = min(hidden_layer, len(self.hidden_size) - 1) 292 | if numb_new_nodes is None: 293 | numb_new_nodes = np.random.choice([32, 64, 128], 1)[0] 294 | 295 | if self.hidden_size[hidden_layer] + numb_new_nodes <= 1024: # HARD LIMIT 296 | 297 | self.hidden_size[hidden_layer] += numb_new_nodes 298 | 299 | self.recreate_nets() 300 | return {"hidden_layer": hidden_layer, "numb_new_nodes": numb_new_nodes} 301 | 302 | def add_cnn_layer(self): 303 | if len(self.channel_size) < 6: # HARD LIMIT 304 | self.channel_size += [self.channel_size[-1]] 305 | self.kernal_size += [3] 306 | 307 | stride_size_list = [[4], [4, 2], [4, 2, 1], [2, 2, 2, 1], [2, 1, 2, 1, 2], [2, 1, 2, 1, 2, 1]] 308 | self.stride_size = stride_size_list[len(self.channel_size) - 1] 309 | 310 | self.recreate_nets() 311 | else: 312 | self.add_cnn_channel() 313 | 314 | def change_cnn_kernal(self): 315 | if len(self.channel_size) > 1: 316 | hidden_layer = np.random.randint(1, min(4, len(self.channel_size)), 1)[0] 317 | self.kernal_size[hidden_layer] = np.random.choice([3, 4, 5, 7]) 318 | 319 | self.recreate_nets() 320 | else: 321 | self.add_cnn_layer() 322 | 323 | def add_cnn_channel(self, hidden_layer=None, numb_new_channels=None): 324 | 325 | if hidden_layer is None: 326 | hidden_layer = np.random.randint(0, len(self.channel_size), 1)[0] 327 | else: 328 | hidden_layer = min(hidden_layer, len(self.channel_size) - 1) 329 | if numb_new_channels is None: 330 | numb_new_nodes = np.random.choice([8, 16, 32], 1)[0] 331 | 332 | if self.channel_size[hidden_layer] + numb_new_nodes <= 256: # HARD LIMIT 333 | 334 | self.channel_size[hidden_layer] += numb_new_nodes 335 | 336 | self.recreate_nets() 337 | 338 | return {"hidden_layer": hidden_layer, "numb_new_channels": numb_new_channels} 339 | 340 | def recreate_nets(self): 341 | new_feature_net, new_value_net, new_advantage_net = self.create_nets() 342 | new_feature_net = self.preserve_parameters(old_net=self.feature_net, new_net=new_feature_net) 343 | new_value_net = self.preserve_parameters(old_net=self.value_net, new_net=new_value_net) 344 | new_advantage_net = self.preserve_parameters(old_net=self.advantage_net, new_net=new_advantage_net) 345 | self.feature_net, self.value_net, self.advantage_net = new_feature_net, new_value_net, new_advantage_net 346 | 347 | def clone(self): 348 | clone = EvolvableCnnDQN(**copy.deepcopy(self.init_dict)) 349 | clone.load_state_dict(self.state_dict()) 350 | return clone 351 | 352 | def preserve_parameters(self, old_net, new_net): 353 | 354 | old_net_dict = dict(old_net.named_parameters()) 355 | 356 | for key, param in new_net.named_parameters(): 357 | if key in old_net_dict.keys(): 358 | if old_net_dict[key].data.size() == param.data.size(): 359 | param.data = old_net_dict[key].data 360 | else: 361 | if not "norm" in key: 362 | old_size = old_net_dict[key].data.size() 363 | new_size = param.data.size() 364 | if len(param.data.size()) == 1: 365 | param.data[:min(old_size[0], new_size[0])] = 
old_net_dict[key].data[ 366 | :min(old_size[0], new_size[0])] 367 | elif len(param.data.size()) == 2: 368 | param.data[:min(old_size[0], new_size[0]), :min(old_size[1], new_size[1])] = old_net_dict[ 369 | key].data[ 370 | :min(old_size[ 371 | 0], 372 | new_size[ 373 | 0]), 374 | :min(old_size[ 375 | 1], 376 | new_size[ 377 | 1])] 378 | else: 379 | param.data[:min(old_size[0], new_size[0]), :min(old_size[1], new_size[1]), 380 | :min(old_size[2], new_size[2]), 381 | :min(old_size[3], new_size[3])] = old_net_dict[key].data[ 382 | :min(old_size[0], new_size[0]), 383 | :min(old_size[1], new_size[1]), 384 | :min(old_size[2], new_size[2]), 385 | :min(old_size[3], new_size[3]), 386 | ] 387 | 388 | return new_net 389 | 390 | def shrink_preserve_parameters(self, old_net, new_net): 391 | 392 | old_net_dict = dict(old_net.named_parameters()) 393 | 394 | for key, param in new_net.named_parameters(): 395 | if key in old_net_dict.keys(): 396 | if old_net_dict[key].data.size() == param.data.size(): 397 | param.data = old_net_dict[key].data 398 | else: 399 | if not "norm" in key: 400 | old_size = old_net_dict[key].data.size() 401 | new_size = param.data.size() 402 | min_0 = min(old_size[0], new_size[0]) 403 | if len(param.data.size()) == 1: 404 | param.data[:min_0] = old_net_dict[key].data[:min_0] 405 | else: 406 | min_1 = min(old_size[1], new_size[1]) 407 | param.data[:min_0, :min_1] = old_net_dict[key].data[:min_0, :min_1] 408 | return new_net 409 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/envolvable_mlp.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from collections import OrderedDict 3 | from typing import List 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class EvolvableMLP(nn.Module): 11 | def __init__(self, num_inputs: int, num_outputs: int, hidden_size: List[int], activation='relu', 12 | output_activation=None, layer_norm=False, output_vanish=True, stored_values=None): 13 | super(EvolvableMLP, self).__init__() 14 | 15 | self.num_inputs = num_inputs 16 | self.num_outputs = num_outputs 17 | self.activation = activation 18 | self.output_activation = output_activation 19 | self.layer_norm = layer_norm 20 | self.output_vanish = output_vanish 21 | 22 | self.hidden_size = hidden_size 23 | 24 | self.net = self.create_net() 25 | 26 | if stored_values is not None: 27 | self.inject_parameters(pvec=stored_values, without_layer_norm=False) 28 | 29 | def get_activation(self, activation_names): 30 | activation_functions = {'tanh': nn.Tanh, 'linear': nn.Identity, 'relu': nn.ReLU, 'elu': nn.ELU, 31 | 'softsign': nn.Softsign, 'sigmoid': nn.Sigmoid, 'softplus': nn.Softplus, 32 | 'lrelu': nn.LeakyReLU, 'prelu': nn.PReLU, } 33 | 34 | return activation_functions[activation_names]() 35 | 36 | def create_net(self): 37 | net_dict = OrderedDict() 38 | 39 | net_dict["linear_layer_0"] = nn.Linear(self.num_inputs, self.hidden_size[0]) 40 | if self.layer_norm: 41 | net_dict["layer_norm_0"] = nn.LayerNorm(self.hidden_size[0]) 42 | net_dict["activation_0"] = self.get_activation(self.activation) 43 | 44 | if len(self.hidden_size) > 1: 45 | for l_no in range(1, len(self.hidden_size)): 46 | net_dict[f"linear_layer_{str(l_no)}"] = nn.Linear(self.hidden_size[l_no - 1], self.hidden_size[l_no]) 47 | if self.layer_norm: 48 | net_dict[f"layer_norm_{str(l_no)}"] = nn.LayerNorm(self.hidden_size[l_no]) 49 | net_dict[f"activation_{str(l_no)}"] = 
self.get_activation(self.activation) 50 | 51 | output_layer = nn.Linear(self.hidden_size[-1], self.num_outputs) 52 | 53 | if self.output_vanish: 54 | output_layer.weight.data.mul_(0.1) 55 | output_layer.bias.data.mul_(0.1) 56 | 57 | net_dict[f"linear_layer_output"] = output_layer 58 | if self.output_activation is not None: 59 | net_dict[f"activation_output"] = self.get_activation(self.output_activation) 60 | 61 | return nn.Sequential(net_dict) 62 | 63 | def forward(self, x): 64 | if not isinstance(x, torch.Tensor): 65 | x = torch.FloatTensor(x) 66 | 67 | for value in self.net: 68 | x = value(x) 69 | return x 70 | 71 | def get_model_dict(self): 72 | 73 | model_dict = self.init_dict 74 | model_dict.update({'stored_values': self.extract_parameters(without_layer_norm=False)}) 75 | return model_dict 76 | 77 | def count_parameters(self, without_layer_norm=False): 78 | count = 0 79 | for name, param in self.named_parameters(): 80 | if not without_layer_norm or not 'layer_norm' in name: 81 | count += param.data.cpu().numpy().flatten().shape[0] 82 | return count 83 | 84 | # function to return current pytorch gradient in same order as genome's flattened parameter vector 85 | def extract_grad(self, without_layer_norm=False): 86 | tot_size = self.count_parameters(without_layer_norm) 87 | pvec = np.zeros(tot_size, np.float32) 88 | count = 0 89 | for name, param in self.named_parameters(): 90 | if not without_layer_norm or not 'layer_norm' in name: 91 | sz = param.grad.data.cpu().numpy().flatten().shape[0] 92 | pvec[count:count + sz] = param.grad.data.cpu().numpy().flatten() 93 | count += sz 94 | return pvec.copy() 95 | 96 | # function to grab current flattened neural network weights 97 | def extract_parameters(self, without_layer_norm=False): 98 | tot_size = self.count_parameters(without_layer_norm) 99 | pvec = np.zeros(tot_size, np.float32) 100 | count = 0 101 | for name, param in self.named_parameters(): 102 | if not without_layer_norm or not 'layer_norm' in name: 103 | sz = param.data.cpu().detach().numpy().flatten().shape[0] 104 | pvec[count:count + sz] = param.data.cpu().detach().numpy().flatten() 105 | count += sz 106 | return copy.deepcopy(pvec) 107 | 108 | # function to inject a flat vector of ANN parameters into the model's current neural network weights 109 | def inject_parameters(self, pvec, without_layer_norm=False): 110 | count = 0 111 | 112 | for name, param in self.named_parameters(): 113 | if not without_layer_norm or not 'layer_norm' in name: 114 | sz = param.data.cpu().numpy().flatten().shape[0] 115 | raw = pvec[count:count + sz] 116 | reshaped = raw.reshape(param.data.cpu().numpy().shape) 117 | param.data = torch.from_numpy(copy.deepcopy(reshaped)).type(torch.FloatTensor) 118 | count += sz 119 | return pvec 120 | 121 | @property 122 | def init_dict(self): 123 | 124 | init_dict = {"num_inputs": self.num_inputs, "num_outputs": self.num_outputs, "hidden_size": self.hidden_size, 125 | "activation": self.activation, "output_activation": self.output_activation, 126 | "layer_norm": self.layer_norm} 127 | return init_dict 128 | 129 | @property 130 | def short_dict(self): 131 | 132 | short_dict = {"hidden_size": self.hidden_size, 133 | "activation": self.activation, "output_activation": self.output_activation, 134 | "layer_norm": self.layer_norm} 135 | return short_dict 136 | 137 | def add_layer(self): 138 | 139 | # add layer to hyper params 140 | if len(self.hidden_size) < 3: # HARD LIMIT 141 | self.hidden_size += [self.hidden_size[-1]] 142 | 143 | # copy old params to new net 144 | new_net 
= self.create_net() 145 | new_net = self.preserve_parameters(old_net=self.net, new_net=new_net) 146 | self.net = new_net 147 | else: 148 | self.add_node() 149 | 150 | def remove_layer(self): 151 | if len(self.hidden_size) > 1: # HARD LIMIT 152 | self.hidden_size = self.hidden_size[:1] 153 | new_net = self.create_net() 154 | new_net = self.shrink_preserve_parameters(old_net=self.net, new_net=new_net) 155 | self.net = new_net 156 | else: 157 | self.add_node() 158 | 159 | def add_node(self, hidden_layer=None, numb_new_nodes=None): 160 | 161 | if hidden_layer is None: 162 | hidden_layer = np.random.randint(0, len(self.hidden_size), 1)[0] 163 | else: 164 | hidden_layer = min(hidden_layer, len(self.hidden_size) - 1) 165 | if numb_new_nodes is None: 166 | numb_new_nodes = np.random.choice([16, 32, 64], 1)[0] 167 | 168 | if self.hidden_size[hidden_layer] + numb_new_nodes <= 500: # HARD LIMIT 169 | self.hidden_size[hidden_layer] += numb_new_nodes 170 | new_net = self.create_net() 171 | new_net = self.preserve_parameters(old_net=self.net, new_net=new_net) 172 | 173 | self.net = new_net 174 | 175 | return {"hidden_layer": hidden_layer, "numb_new_nodes": numb_new_nodes} 176 | 177 | def remove_node(self, hidden_layer=None, numb_new_nodes=None): 178 | 179 | if hidden_layer is None: 180 | hidden_layer = np.random.randint(0, len(self.hidden_size), 1)[0] 181 | else: 182 | hidden_layer = min(hidden_layer, len(self.hidden_size) - 1) 183 | if numb_new_nodes is None: 184 | numb_new_nodes = np.random.choice([16, 32, 64], 1)[0] 185 | 186 | if self.hidden_size[hidden_layer] - numb_new_nodes > 64: # HARD LIMIT 187 | self.hidden_size[hidden_layer] = self.hidden_size[hidden_layer] - numb_new_nodes 188 | new_net = self.create_net() 189 | new_net = self.shrink_preserve_parameters(old_net=self.net, new_net=new_net) 190 | 191 | self.net = new_net 192 | 193 | return {"hidden_layer": hidden_layer, "numb_new_nodes": numb_new_nodes} 194 | 195 | def clone(self): 196 | clone = EvolvableMLP(**copy.deepcopy(self.init_dict)) 197 | clone.load_state_dict(self.state_dict()) 198 | return clone 199 | 200 | def preserve_parameters(self, old_net, new_net): 201 | 202 | old_net_dict = dict(old_net.named_parameters()) 203 | 204 | for key, param in new_net.named_parameters(): 205 | if key in old_net_dict.keys(): 206 | if old_net_dict[key].data.size() == param.data.size(): 207 | param.data = old_net_dict[key].data 208 | else: 209 | if not "norm" in key: 210 | old_size = old_net_dict[key].data.size() 211 | new_size = param.data.size() 212 | if len(param.data.size()) == 1: 213 | param.data[:min(old_size[0], new_size[0])] = old_net_dict[key].data[ 214 | :min(old_size[0], new_size[0])] 215 | else: 216 | param.data[:min(old_size[0], new_size[0]), :min(old_size[1], new_size[1])] = old_net_dict[ 217 | key].data[ 218 | :min(old_size[ 219 | 0], 220 | new_size[ 221 | 0]), 222 | :min(old_size[ 223 | 1], 224 | new_size[ 225 | 1])] 226 | 227 | return new_net 228 | 229 | def shrink_preserve_parameters(self, old_net, new_net): 230 | 231 | old_net_dict = dict(old_net.named_parameters()) 232 | 233 | for key, param in new_net.named_parameters(): 234 | if key in old_net_dict.keys(): 235 | if old_net_dict[key].data.size() == param.data.size(): 236 | param.data = old_net_dict[key].data 237 | else: 238 | if not "norm" in key: 239 | old_size = old_net_dict[key].data.size() 240 | new_size = param.data.size() 241 | min_0 = min(old_size[0], new_size[0]) 242 | if len(param.data.size()) == 1: 243 | param.data[:min_0] = old_net_dict[key].data[:min_0] 244 | else: 245 | 
min_1 = min(old_size[1], new_size[1]) 246 | param.data[:min_0, :min_1] = old_net_dict[key].data[:min_0, :min_1] 247 | return new_net 248 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/individual_dqn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from searl.neuroevolution.components.envolvable_cnn import EvolvableCnnDQN 4 | 5 | 6 | class DQNIndividual(): 7 | 8 | def __init__(self, state_dim, action_dim, actor_config, rl_config, index, device='cpu', replay_memory=None): 9 | self.state_dim = state_dim 10 | self.action_dim = action_dim 11 | self.actor_config = actor_config 12 | self.rl_config = rl_config 13 | self.index = index 14 | self.device = device 15 | 16 | self.actor = EvolvableCnnDQN(input_shape=state_dim, num_actions=action_dim, device=device, 17 | **actor_config).to(device) 18 | 19 | self.fitness = [] 20 | self.improvement = 0 21 | self.train_log = {"pre_fitness": None, "pre_rank": None, "post_fitness": None, "post_rank": None, "eval_eps": 0, 22 | "index": None, "parent_index": None, "mutation": None} 23 | 24 | self.replay_memory = replay_memory 25 | 26 | def clone(self, index=None): 27 | if index is None: 28 | index = self.index 29 | 30 | clone = type(self)(state_dim=self.state_dim, 31 | action_dim=self.action_dim, 32 | actor_config=copy.deepcopy(self.actor.short_dict), 33 | rl_config=copy.deepcopy(self.rl_config), 34 | index=index, 35 | replay_memory=self.replay_memory, 36 | device=self.device 37 | ) 38 | 39 | clone.fitness = copy.deepcopy(self.fitness) 40 | clone.train_log = copy.deepcopy(self.train_log) 41 | clone.actor = self.actor.clone() 42 | 43 | if self.replay_memory: 44 | self.replay_memory = copy.deepcopy(self.replay_memory) 45 | 46 | return clone 47 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/individual_td3.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from searl.neuroevolution.components.envolvable_mlp import EvolvableMLP 4 | 5 | 6 | class Individual(): 7 | 8 | def __init__(self, state_dim, action_dim, actor_config, critic_config, rl_config, index, td3_double_q, 9 | critic_2_config=None, replay_memory=None): 10 | 11 | self.state_dim = state_dim 12 | self.action_dim = action_dim 13 | self.actor_config = actor_config 14 | self.critic_config = critic_config 15 | self.rl_config = rl_config 16 | self.index = index 17 | self.td3_double_q = td3_double_q 18 | 19 | if critic_2_config is None: 20 | critic_2_config = copy.deepcopy(critic_config) 21 | 22 | self.actor = EvolvableMLP(num_inputs=state_dim, num_outputs=action_dim, **actor_config) 23 | self.critic_1 = EvolvableMLP(num_inputs=state_dim + action_dim, num_outputs=1, **critic_config) 24 | if td3_double_q: 25 | self.critic_2 = EvolvableMLP(num_inputs=state_dim + action_dim, num_outputs=1, **critic_2_config) 26 | 27 | self.fitness = [] 28 | self.improvement = 0 29 | self.train_log = {"pre_fitness": None, "pre_rank": None, "post_fitness": None, "post_rank": None, "eval_eps": 0, 30 | "index": None, "parent_index": None, "mutation": None} 31 | 32 | self.replay_memory = replay_memory 33 | 34 | def clone(self, index=None): 35 | if index is None: 36 | index = self.index 37 | 38 | if self.td3_double_q: 39 | critic_2_config = copy.deepcopy(self.critic_2.short_dict) 40 | else: 41 | critic_2_config = None 42 | 43 | clone = type(self)(state_dim=self.state_dim, 44 | 
action_dim=self.action_dim, 45 | actor_config=copy.deepcopy(self.actor.short_dict), 46 | critic_config=copy.deepcopy(self.critic_1.short_dict), 47 | rl_config=copy.deepcopy(self.rl_config), 48 | index=index, 49 | td3_double_q=self.td3_double_q, 50 | critic_2_config=critic_2_config, 51 | replay_memory=self.replay_memory) 52 | 53 | clone.fitness = copy.deepcopy(self.fitness) 54 | clone.train_log = copy.deepcopy(self.train_log) 55 | clone.actor = self.actor.clone() 56 | clone.critic_1 = self.critic_1.clone() 57 | if self.td3_double_q: 58 | clone.critic_2 = self.critic_2.clone() 59 | 60 | if self.replay_memory: 61 | self.replay_memory = copy.deepcopy(self.replay_memory) 62 | 63 | return clone 64 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/replay_memory.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import queue 3 | import time 4 | from typing import List, Dict 5 | 6 | import fastrand 7 | import numpy as np 8 | 9 | 10 | class MPReplayMemory(object): 11 | 12 | def __init__(self, seed, capacity, batch_size, reuse_batch): 13 | 14 | ctx = mp.get_context('spawn') 15 | mp_manager = ctx.Manager() 16 | self.push_queue = mp_manager.Queue() 17 | self.sample_queue = mp_manager.Queue() 18 | self.save_queue = mp_manager.Queue() 19 | self.batch_size = batch_size 20 | 21 | np.random.seed(seed) 22 | self.memory_manager = ctx.Process(target=self._memory_manager, 23 | args=(capacity, batch_size, reuse_batch, self.push_queue, self.sample_queue, 24 | self.save_queue)) 25 | self.memory_manager.daemon = True 26 | self.memory_manager.start() 27 | 28 | def load(self, replay_memory_dict): 29 | self.push_queue.put(replay_memory_dict) 30 | 31 | def save(self): 32 | self.push_queue.put("SAVE") 33 | try: 34 | save_dict = self.save_queue.get(timeout=10) 35 | return save_dict 36 | except queue.Empty: 37 | print("save failed") 38 | return "no_save" 39 | 40 | @staticmethod 41 | def _memory_manager(capacity: int, batch_size: int, reuse_batch: int, push_queue: mp.Queue, sample_queue: mp.Queue, 42 | save_queue: mp.Queue): 43 | memory = [] 44 | position = 0 45 | 46 | while True: 47 | if not push_queue.empty(): 48 | queue_output = push_queue.get() 49 | if queue_output == "QUIT": 50 | return 51 | 52 | elif queue_output == "SAVE": 53 | save_queue.put({"memory": memory, "position": position}) 54 | 55 | elif isinstance(queue_output, Dict): 56 | memory = queue_output["memory"] 57 | position = queue_output["position"] 58 | 59 | elif isinstance(queue_output, List): 60 | for transition in queue_output: 61 | if len(memory) < capacity: 62 | memory.append(transition) 63 | else: 64 | memory[position] = transition 65 | position = (position + 1) % capacity 66 | else: 67 | if len(memory) < capacity: 68 | memory.append(queue_output) 69 | else: 70 | memory[position] = queue_output 71 | position = (position + 1) % capacity 72 | 73 | if sample_queue.qsize() < 20 and len(memory) > batch_size: 74 | 75 | transistion_list = [] 76 | for _ in range(batch_size): 77 | idx = fastrand.pcg32bounded(len(memory)) 78 | transistion_list.append(memory[idx]) 79 | for _ in range(reuse_batch): 80 | sample_queue.put(transistion_list) 81 | 82 | def close(self): 83 | print("CLOSE REPLAY MEMORY") 84 | self.push_queue.put("QUIT") 85 | while not self.push_queue.empty(): 86 | time.sleep(1) 87 | 88 | def get_push_queue(self): 89 | return self.push_queue 90 | 91 | def get_sample_queue(self): 92 | return self.sample_queue 93 | 94 | 95 
| class ReplayMemory(object): 96 | 97 | def __init__(self, capacity: int, batch_size: int): 98 | self.storage = [] 99 | self.capacity = capacity 100 | self.batch_size = batch_size 101 | self.ptr = 0 102 | 103 | def add(self, transistions): 104 | if isinstance(transistions, List): 105 | for transition in transistions: 106 | self._add(transition) 107 | else: 108 | self._add(transistions) 109 | 110 | def put(self, transistions): 111 | self.add(transistions) 112 | 113 | def _add(self, transistion): 114 | if len(self.storage) == self.capacity: 115 | self.storage[int(self.ptr)] = transistion 116 | self.ptr = (self.ptr + 1) % self.capacity 117 | else: 118 | self.storage.append(transistion) 119 | 120 | def get(self): 121 | return self.sample() 122 | 123 | def sample(self): 124 | ind = np.random.randint(0, len(self.storage), size=self.batch_size) 125 | 126 | transition_list = [] 127 | for i in ind: 128 | transition_list.append(self.storage[i]) 129 | 130 | return transition_list 131 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/utils.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import numpy as np 4 | import torch 5 | 6 | fields = ('state', 'action', 'next_state', 'reward', 'done', 'weight', 'index') 7 | Transition = namedtuple('Transition', fields) 8 | Transition.__new__.__defaults__ = (None,) * len(Transition._fields) 9 | 10 | 11 | def to_tensor(ndarray, requires_grad=False): 12 | return torch.from_numpy(ndarray).float().requires_grad_(requires_grad) 13 | 14 | 15 | def feature_scaling(x): 16 | return (x - np.min(x)) / (np.max(x) - np.min(x)) 17 | 18 | 19 | def softmax(x): 20 | return np.exp(x) / np.sum(np.exp(x)) 21 | 22 | 23 | def soft_update(target, source, tau): 24 | for target_param, source_param in zip(target.parameters(), source.parameters()): 25 | target_param.data.copy_(target_param.data * (1.0 - tau) + source_param.data * tau) 26 | -------------------------------------------------------------------------------- /searl/neuroevolution/evaluation_dqn.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from searl.neuroevolution.components.utils import Transition 7 | from searl.rl_algorithms.components.wrappers import make_atari, wrap_deepmind, wrap_pytorch 8 | 9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 10 | print("train CUDA", device == torch.device("cuda"), device) 11 | 12 | 13 | class MPEvaluation(): 14 | def __init__(self, config, logger, replay_memory=None): 15 | 16 | self.rng = np.random.RandomState(config.seed.evaluation) 17 | self.cfg = config 18 | self.log = logger 19 | self.push_queue = replay_memory 20 | self.eval_episodes = config.eval.eval_episodes 21 | 22 | def test_individual(self, individual, epoch): 23 | return_dict = self._evaluate_individual(individual, self.cfg, self.cfg.eval.test_episodes, epoch, False) 24 | fitness = np.mean(return_dict[individual.index]["fitness_list"]) 25 | return fitness 26 | 27 | @staticmethod 28 | def _evaluate_individual(individual, config, num_episodes, seed, exploration_noise=False, start_phase=False): 29 | 30 | actor_net = individual.actor 31 | 32 | num_frames = 0 33 | fitness_list = [] 34 | transistions_list = [] 35 | episodes = 0 36 | 37 | env = make_atari(config.env.name) 38 | env = wrap_deepmind(env) 39 | env = wrap_pytorch(env) 40 | env.seed(seed) 
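        # The no_grad loop below rolls out full episodes with the current actor until
        # both `num_episodes` episodes and `config.eval.min_eval_steps` frames have been
        # collected; per-episode returns go into fitness_list, and the raw transitions are
        # returned so evaluate_population can push them into the shared or per-individual
        # replay memory.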
41 | 42 | actor_net.eval() 43 | actor_net.to(device) 44 | actor_net.device = device 45 | 46 | with torch.no_grad(): 47 | while episodes < num_episodes or num_frames < config.eval.min_eval_steps: 48 | episode_fitness = 0.0 49 | episode_transitions = [] 50 | state = env.reset() 51 | 52 | done = False 53 | while not done: 54 | action = actor_net.act(state) 55 | 56 | next_state, reward, done, info = env.step(action) 57 | episode_fitness += reward 58 | num_frames += 1 59 | 60 | transition = Transition(torch.FloatTensor(state), torch.LongTensor([action]), 61 | torch.FloatTensor(next_state), torch.FloatTensor(np.array([reward])), 62 | torch.FloatTensor(np.array([done]).astype('uint8')) 63 | ) 64 | 65 | episode_transitions.append(transition) 66 | state = next_state 67 | episodes += 1 68 | fitness_list.append(episode_fitness) 69 | transistions_list.append(episode_transitions) 70 | 71 | actor_net.to(torch.device("cpu")) 72 | 73 | return {individual.index: {"fitness_list": fitness_list, "num_episodes": num_episodes, "num_frames": num_frames, 74 | "id": individual.index, "transitions": transistions_list}} 75 | 76 | def evaluate_population(self, population: List, exploration_noise=False, total_frames=1): 77 | 78 | population_id_lookup = [ind.index for ind in population] 79 | new_population_mean_fitness = np.zeros(len(population)) 80 | new_population_var_fitness = np.zeros(len(population)) 81 | 82 | start_phase = total_frames <= self.cfg.rl.start_timesteps 83 | if start_phase: 84 | self.log("start phase", time_step=total_frames) 85 | 86 | args_list = [(ind, self.cfg, self.eval_episodes, self.rng.randint(0, 100000), exploration_noise, start_phase) 87 | for ind in population] 88 | 89 | result_dict = [] 90 | for args in args_list: 91 | result_dict.append(self._evaluate_individual(*args)) 92 | 93 | eval_frames = 0 94 | for list_element in result_dict: 95 | for ind_id, value_dict in list_element.items(): 96 | pop_idx = population_id_lookup.index(ind_id) 97 | new_population_mean_fitness[pop_idx] = np.mean(value_dict['fitness_list']) 98 | new_population_var_fitness[pop_idx] = np.var(value_dict['fitness_list']) 99 | eval_frames += value_dict['num_frames'] 100 | 101 | population[pop_idx].train_log["eval_eps"] = self.eval_episodes 102 | 103 | for transitions in value_dict['transitions']: 104 | if self.cfg.nevo.ind_memory: 105 | population[pop_idx].replay_memory.add(transitions) 106 | else: 107 | self.push_queue.put(transitions) 108 | 109 | for idx in range(len(population)): 110 | population[idx].train_log["post_fitness"] = new_population_mean_fitness[idx] 111 | population[idx].train_log["index"] = population[idx].index 112 | self.log.csv.log_csv(population[idx].train_log) 113 | population[idx].train_log.update( 114 | {"pre_fitness": new_population_mean_fitness[idx], 115 | "eval_eps": 0}) # , "pre_rank": population_rank[idx], "eval_eps":0} 116 | population[idx].fitness.append(new_population_mean_fitness[idx]) 117 | if len(population[idx].fitness) > 1: 118 | population[idx].improvement = population[idx].fitness[-1] - population[idx].fitness[-2] 119 | else: 120 | population[idx].improvement = population[idx].fitness[-1] 121 | 122 | return new_population_mean_fitness, new_population_var_fitness, eval_frames 123 | -------------------------------------------------------------------------------- /searl/neuroevolution/evaluation_td3.py: -------------------------------------------------------------------------------- 1 | from collections import ChainMap 2 | from typing import List 3 | 4 | import gym 5 | import numpy as np 
6 | import torch
7 | 
8 | from searl.neuroevolution.components.utils import to_tensor, Transition
9 | 
10 | 
11 | class MPEvaluation():
12 |     """
13 |     Evaluates a population and stores the collected transitions in a push_queue.
14 | 
15 |     """
16 | 
17 |     def __init__(self, config, logger, push_queue=None):
18 | 
19 |         self.rng = np.random.RandomState(config.seed.evaluation)
20 |         self.cfg = config
21 |         self.log = logger
22 |         self.push_queue = push_queue
23 |         self.eval_episodes = config.eval.eval_episodes
24 | 
25 |     def test_individual(self, individual, epoch):
26 |         return_dict = self._evaluate_individual(individual, self.cfg, self.cfg.eval.test_episodes, epoch, False)
27 |         fitness = np.mean(return_dict[individual.index]["fitness_list"])
28 |         return fitness
29 | 
30 |     @staticmethod
31 |     def _evaluate_individual(individual, config, num_episodes, seed, exploration_noise=False, start_phase=False):
32 | 
33 |         actor_net = individual.actor
34 | 
35 |         num_frames = 0
36 |         fitness_list = []
37 |         transistions_list = []
38 |         episodes = 0
39 | 
40 |         env = gym.make(config.env.name)
41 |         env.seed(seed)
42 |         actor_net.eval()
43 | 
44 |         with torch.no_grad():
45 |             while episodes < num_episodes or num_frames < config.eval.min_eval_steps:
46 |                 episode_fitness = 0.0
47 |                 episode_transitions = []
48 |                 state = env.reset()
49 |                 t_state = to_tensor(state).unsqueeze(0)
50 |                 done = False
51 |                 while not done:
52 |                     if start_phase:
53 |                         action = env.action_space.sample()
54 |                         action = to_tensor(action)
55 |                     else:
56 |                         action = actor_net(t_state)
57 |                     action = action.clamp(-1, 1)
58 |                     action = action.data.numpy()
59 |                     if exploration_noise is not False:
60 |                         action += config.eval.exploration_noise * np.random.randn(config.action_dim)
61 |                         action = np.clip(action, -1, 1)
62 |                     action = action.flatten()
63 | 
64 |                     step_action = (action + 1) / 2  # [-1, 1] => [0, 1]
65 |                     step_action *= (env.action_space.high - env.action_space.low)
66 |                     step_action += env.action_space.low
67 | 
68 |                     next_state, reward, done, info = env.step(step_action)  # Simulate one step in environment
69 | 
70 |                     done_bool = 0 if num_frames + 1 == env._max_episode_steps else float(done)
71 | 
72 |                     t_next_state = to_tensor(next_state).unsqueeze(0)
73 | 
74 |                     episode_fitness += reward
75 |                     num_frames += 1
76 | 
77 |                     transition = Transition(state, action, next_state, np.array([reward]),
78 |                                             np.array([done_bool]).astype('uint8'))
79 |                     episode_transitions.append(transition)
80 |                     t_state = t_next_state
81 |                     state = next_state
82 |                 episodes += 1
83 |                 fitness_list.append(episode_fitness)
84 |                 transistions_list.append(episode_transitions)
85 | 
86 |         return {individual.index: {"fitness_list": fitness_list, "num_episodes": num_episodes, "num_frames": num_frames,
87 |                                    "id": individual.index, "transitions": transistions_list}}
88 | 
89 |     def evaluate_population(self, population: List, exploration_noise=False, total_frames=1, pool=None):
90 |         population_id_lookup = [ind.index for ind in population]
91 |         new_population_mean_fitness = np.zeros(len(population))
92 |         new_population_var_fitness = np.zeros(len(population))
93 | 
94 |         start_phase = total_frames <= self.cfg.rl.start_timesteps
95 |         if start_phase:
96 |             self.log("start phase", time_step=total_frames)
97 | 
98 |         args_list = [(ind, self.cfg, self.eval_episodes, self.rng.randint(0, 100000), exploration_noise, start_phase)
99 |                      for ind in population]
100 |         result_dicts = [pool.apply(self._evaluate_individual, args) for args in args_list]
101 |         result_dict = dict(ChainMap(*result_dicts))
102 | 
103 |         eval_frames = 0
104 |         for ind_id, value_dict in result_dict.items():
105 |             pop_idx = population_id_lookup.index(ind_id)
106 |             new_population_mean_fitness[pop_idx] = np.mean(value_dict['fitness_list'])
107 |             new_population_var_fitness[pop_idx] = np.var(value_dict['fitness_list'])
108 |             eval_frames += value_dict['num_frames']
109 | 
110 |             population[pop_idx].train_log["eval_eps"] = self.eval_episodes
111 | 
112 |             for transitions in value_dict['transitions']:
113 |                 if self.cfg.nevo.ind_memory:
114 |                     population[pop_idx].replay_memory.add(transitions)
115 |                 else:
116 |                     self.push_queue.put(transitions)
117 | 
118 |         for idx in range(len(population)):
119 |             population[idx].train_log["post_fitness"] = new_population_mean_fitness[idx]
120 |             population[idx].train_log["index"] = population[idx].index
121 |             self.log.csv.log_csv(population[idx].train_log)
122 |             population[idx].train_log.update({"pre_fitness": new_population_mean_fitness[idx], "eval_eps": 0})
123 |             population[idx].fitness.append(new_population_mean_fitness[idx])
124 |             if len(population[idx].fitness) > 1:
125 |                 population[idx].improvement = population[idx].fitness[-1] - population[idx].fitness[-2]
126 |             else:
127 |                 population[idx].improvement = population[idx].fitness[-1]
128 | 
129 |         return new_population_mean_fitness, new_population_var_fitness, eval_frames
130 | 
--------------------------------------------------------------------------------
/searl/neuroevolution/mutation_cnn.py:
--------------------------------------------------------------------------------
1 | import fastrand
2 | import numpy as np
3 | 
4 | 
5 | class Mutations():
6 | 
7 |     def __init__(self, config):
8 |         self.cfg = config
9 |         self.rng = np.random.RandomState(self.cfg.seed.mutation)
10 | 
11 |     def no_mutation(self, individual):
12 |         individual.train_log["mutation"] = "no_mutation"
13 |         return individual
14 | 
15 |     def mutation(self, population):
16 | 
17 |         mutation_options = []
18 |         mutation_proba = []
19 |         if self.cfg.mutation.no_mutation:
20 |             mutation_options.append(self.no_mutation)
21 |             mutation_proba.append(float(self.cfg.mutation.no_mutation))
22 |         if self.cfg.mutation.architecture:
23 |             mutation_options.append(self.architecture_mutate)
24 |             mutation_proba.append(float(self.cfg.mutation.architecture))
25 |         if self.cfg.mutation.parameters:
26 |             mutation_options.append(self.parameter_mutation)
27 |             mutation_proba.append(float(self.cfg.mutation.parameters))
28 |         if self.cfg.mutation.activation:
29 |             mutation_options.append(self.activation_mutation)
30 |             mutation_proba.append(float(self.cfg.mutation.activation))
31 |         if self.cfg.mutation.rl_hyperparam:
32 |             mutation_options.append(self.rl_hyperparam_mutation)
33 |             mutation_proba.append(float(self.cfg.mutation.rl_hyperparam))
34 | 
35 |         if len(mutation_options) == 0:
36 |             return population
37 | 
38 |         mutation_proba = np.array(mutation_proba) / np.sum(mutation_proba)
39 | 
40 |         mutation_choice = self.rng.choice(mutation_options, len(population), p=mutation_proba)
41 | 
42 |         mutated_population = []
43 |         for mutation, individual in zip(mutation_choice, population):
44 |             mutated_population.append(mutation(individual))
45 | 
46 |         return mutated_population
47 | 
48 |     def rl_hyperparam_mutation(self, individual):
49 | 
50 |         rl_config = individual.rl_config
51 |         rl_params = self.cfg.mutation.rl_hp_selection
52 |         mutate_param = self.rng.choice(rl_params, 1)[0]
53 | 
54 |         random_num = self.rng.uniform(0, 1)
55 |         if mutate_param == 'train_frames_fraction':
56 |             if random_num > 0.5:
57 |                 setattr(rl_config, mutate_param, min(3.0, max(0.1, getattr(rl_config, mutate_param) * 1.2)))
58 |             else:
59 |                 setattr(rl_config, mutate_param, min(3.0, max(0.1, getattr(rl_config, mutate_param) * 0.8)))
60 |         elif mutate_param == 'batch_size':
61 |             if random_num > 0.5:
62 |                 setattr(rl_config, mutate_param, min(128, max(8, int(getattr(rl_config, mutate_param) * 1.2))))
63 |             else:
64 |                 setattr(rl_config, mutate_param, min(128, max(8, int(getattr(rl_config, mutate_param) * 0.8))))
65 |         elif mutate_param == 'lr_actor':
66 |             if random_num > 0.5:
67 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 1.2)))
68 |             else:
69 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 0.8)))
70 |         elif mutate_param == 'lr_critic':
71 |             if random_num > 0.5:
72 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 1.2)))
73 |             else:
74 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 0.8)))
75 |         elif mutate_param == 'td3_policy_noise':
76 |             if getattr(rl_config, mutate_param):
77 |                 setattr(rl_config, mutate_param, False)
78 |             else:
79 |                 setattr(rl_config, mutate_param, 0.1)
80 |         elif mutate_param == 'td3_update_freq':
81 |             if random_num > 0.5:
82 |                 setattr(rl_config, mutate_param, min(10, max(1, int(getattr(rl_config, mutate_param) + 1))))
83 |             else:
84 |                 setattr(rl_config, mutate_param, min(10, max(1, int(getattr(rl_config, mutate_param) - 1))))
85 |         elif mutate_param == 'optimizer':
86 |             opti_selection = ["adam", "adamax", "rmsprop", "sdg"]
87 |             opti_selection.remove(getattr(rl_config, mutate_param))
88 |             opti = self.rng.choice(opti_selection, 1)[0]
89 |             setattr(rl_config, mutate_param, opti)
90 | 
91 |         individual.train_log["mutation"] = "rl_" + mutate_param
92 |         individual.rl_config = rl_config
93 |         return individual
94 | 
95 |     def activation_mutation(self, individual):
96 |         individual.actor = self._permutate_activation(individual.actor)
97 |         individual.train_log["mutation"] = "activation"
98 |         return individual
99 | 
100 |     def _permutate_activation(self, network):
101 | 
102 |         possible_activations = ['relu', 'elu', 'gelu']
103 |         current_activation = network.mlp_activation
104 |         possible_activations.remove(current_activation)
105 |         new_activation = self.rng.choice(possible_activations, size=1)[0]
106 |         net_dict = network.init_dict
107 |         net_dict['mlp_activation'] = new_activation
108 |         net_dict['cnn_activation'] = new_activation
109 |         new_network = type(network)(**net_dict)
110 |         new_network.load_state_dict(network.state_dict())
111 |         network = new_network
112 | 
113 |         return network
114 | 
115 |     def parameter_mutation(self, individual):
116 | 
117 |         offspring = individual.actor
118 | 
119 |         offspring.cpu()
120 | 
121 |         offspring = self.classic_parameter_mutation(offspring)
122 |         individual.train_log["mutation"] = "classic_parameter"
123 | 
124 |         individual.actor = offspring
125 |         return individual
126 | 
127 |     def regularize_weight(self, weight, mag):
128 |         if weight > mag: weight = mag
129 |         if weight < -mag: weight = -mag
130 |         return weight
131 | 
132 |     def classic_parameter_mutation(self, network):
133 |         mut_strength = self.cfg.mutation.mutation_sd
134 |         num_mutation_frac = 0.1
135 |         super_mut_strength = 10
136 |         super_mut_prob = 0.05
137 |         reset_prob = super_mut_prob + 0.05
138 | 
139 |         model_params = network.state_dict()
140 | 
141 |         potential_keys = []
142 |         for i, key in enumerate(model_params):  # Mutate each param
143 |             if not 'norm' in key:
144 |                 W = model_params[key]
145 |                 if len(W.shape) == 2:  # Weights, no bias
146 |                     potential_keys.append(key)
147 | 
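        # The block below mutates a random subset of the 2-D weight matrices: up to
        # ~num_mutation_frac of the entries in each chosen matrix are perturbed, using a
        # "super" mutation (noise sd = super_mut_strength * |w|) with probability
        # super_mut_prob, a reset to N(0, 1) with probability reset_prob - super_mut_prob,
        # and ordinary Gaussian noise (sd = mutation_sd * |w|) otherwise; each touched
        # weight is finally clamped to +/- 1e6 by regularize_weight.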
148 | how_many = np.random.randint(1, len(potential_keys) + 1, 1)[0] 149 | chosen_keys = np.random.choice(potential_keys, how_many, replace=False) 150 | 151 | for key in chosen_keys: 152 | # References to the variable keys 153 | W = model_params[key] 154 | num_weights = W.shape[0] * W.shape[1] 155 | # Number of mutation instances 156 | num_mutations = fastrand.pcg32bounded(int(np.ceil(num_mutation_frac * num_weights))) 157 | for _ in range(num_mutations): 158 | ind_dim1 = fastrand.pcg32bounded(W.shape[0]) 159 | ind_dim2 = fastrand.pcg32bounded(W.shape[-1]) 160 | random_num = self.rng.uniform(0, 1) 161 | 162 | if random_num < super_mut_prob: # Super Mutation probability 163 | W[ind_dim1, ind_dim2] += self.rng.normal(0, np.abs(super_mut_strength * W[ind_dim1, ind_dim2])) 164 | elif random_num < reset_prob: # Reset probability 165 | W[ind_dim1, ind_dim2] = self.rng.normal(0, 1) 166 | else: # mutauion even normal 167 | W[ind_dim1, ind_dim2] += self.rng.normal(0, np.abs(mut_strength * W[ind_dim1, ind_dim2])) 168 | 169 | # Regularization hard limit 170 | W[ind_dim1, ind_dim2] = self.regularize_weight(W[ind_dim1, ind_dim2], 1000000) 171 | return network 172 | 173 | def architecture_mutate(self, individual): 174 | 175 | offspring_actor = individual.actor.clone() 176 | offspring_actor.cpu() 177 | 178 | rand_numb = self.rng.uniform(0, 1) 179 | if 0 <= rand_numb < 0.1: 180 | offspring_actor.add_mlp_layer() 181 | individual.train_log["mutation"] = "architecture_new_mlp_layer" 182 | 183 | elif 0.1 <= rand_numb < 0.2: 184 | offspring_actor.add_cnn_layer() 185 | individual.train_log["mutation"] = "architecture_new_cnn_layer" 186 | 187 | elif 0.2 <= rand_numb < 0.3: 188 | offspring_actor.change_cnn_kernal() 189 | individual.train_log["mutation"] = "architecture_change_cnn_kernal" 190 | elif 0.3 <= rand_numb < 0.65: 191 | offspring_actor.add_cnn_channel() 192 | individual.train_log["mutation"] = "architecture_add_cnn_channel" 193 | else: 194 | offspring_actor.add_mlp_node() 195 | individual.train_log["mutation"] = "architecture_add_mlp_node" 196 | 197 | individual.actor = offspring_actor 198 | 199 | return individual 200 | -------------------------------------------------------------------------------- /searl/neuroevolution/mutation_mlp.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import fastrand 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torch.optim import Adam 8 | 9 | 10 | class Mutations(): 11 | 12 | def __init__(self, config, replay_sample_queue): 13 | self.cfg = config 14 | self.rng = np.random.RandomState(self.cfg.seed.mutation) 15 | self.replay_sample_queue = replay_sample_queue 16 | 17 | def no_mutation(self, individual): 18 | individual.train_log["mutation"] = "no_mutation" 19 | return individual 20 | 21 | def mutation(self, population): 22 | 23 | mutation_options = [] 24 | mutation_proba = [] 25 | if self.cfg.mutation.no_mutation: 26 | mutation_options.append(self.no_mutation) 27 | mutation_proba.append(float(self.cfg.mutation.no_mutation)) 28 | if self.cfg.mutation.architecture: 29 | mutation_options.append(self.architecture_mutate) 30 | mutation_proba.append(float(self.cfg.mutation.architecture)) 31 | if self.cfg.mutation.parameters: 32 | mutation_options.append(self.parameter_mutation) 33 | mutation_proba.append(float(self.cfg.mutation.parameters)) 34 | if self.cfg.mutation.activation: 35 | mutation_options.append(self.activation_mutation) 36 | 
mutation_proba.append(float(self.cfg.mutation.activation))
37 |         if self.cfg.mutation.rl_hyperparam:
38 |             mutation_options.append(self.rl_hyperparam_mutation)
39 |             mutation_proba.append(float(self.cfg.mutation.rl_hyperparam))
40 | 
41 |         if len(mutation_options) == 0:
42 |             return population
43 | 
44 |         mutation_proba = np.array(mutation_proba) / np.sum(mutation_proba)
45 | 
46 |         mutation_choice = self.rng.choice(mutation_options, len(population), p=mutation_proba)
47 | 
48 |         mutated_population = []
49 |         for mutation, individual in zip(mutation_choice, population):
50 |             mutated_population.append(mutation(individual))
51 | 
52 |         return mutated_population
53 | 
54 |     def rl_hyperparam_mutation(self, individual):
55 | 
56 |         rl_config = individual.rl_config
57 |         rl_params = self.cfg.mutation.rl_hp_selection
58 |         mutate_param = self.rng.choice(rl_params, 1)[0]
59 | 
60 |         random_num = self.rng.uniform(0, 1)
61 |         if mutate_param == 'train_frames_fraction':
62 |             if random_num > 0.5:
63 |                 setattr(rl_config, mutate_param, min(3.0, max(0.1, getattr(rl_config, mutate_param) * 1.2)))
64 |             else:
65 |                 setattr(rl_config, mutate_param, min(3.0, max(0.1, getattr(rl_config, mutate_param) * 0.8)))
66 |         elif mutate_param == 'batch_size':
67 |             if random_num > 0.5:
68 |                 setattr(rl_config, mutate_param, min(128, max(8, int(getattr(rl_config, mutate_param) * 1.2))))
69 |             else:
70 |                 setattr(rl_config, mutate_param, min(128, max(8, int(getattr(rl_config, mutate_param) * 0.8))))
71 |         elif mutate_param == 'lr_actor':
72 |             if random_num > 0.5:
73 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 1.2)))
74 |             else:
75 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 0.8)))
76 |         elif mutate_param == 'lr_critic':
77 |             if random_num > 0.5:
78 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 1.2)))
79 |             else:
80 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 0.8)))
81 |         elif mutate_param == 'td3_policy_noise':
82 |             if getattr(rl_config, mutate_param):
83 |                 setattr(rl_config, mutate_param, False)
84 |             else:
85 |                 setattr(rl_config, mutate_param, 0.1)
86 |         elif mutate_param == 'td3_update_freq':
87 |             if random_num > 0.5:
88 |                 setattr(rl_config, mutate_param, min(10, max(1, int(getattr(rl_config, mutate_param) + 1))))
89 |             else:
90 |                 setattr(rl_config, mutate_param, min(10, max(1, int(getattr(rl_config, mutate_param) - 1))))
91 |         elif mutate_param == 'optimizer':
92 |             opti_selection = ["adam", "adamax", "rmsprop", "sdg"]
93 |             opti_selection.remove(getattr(rl_config, mutate_param))
94 |             opti = self.rng.choice(opti_selection, 1)[0]
95 |             setattr(rl_config, mutate_param, opti)
96 | 
97 |         individual.train_log["mutation"] = "rl_" + mutate_param
98 |         individual.rl_config = rl_config
99 |         return individual
100 | 
101 |     def activation_mutation(self, individual):
102 |         individual.actor = self._permutate_activation(individual.actor)
103 |         individual.critic_1 = self._permutate_activation(individual.critic_1)
104 |         if self.cfg.train.td3_double_q:
105 |             individual.critic_2 = self._permutate_activation(individual.critic_2)
106 |         individual.train_log["mutation"] = "activation"
107 |         return individual
108 | 
109 |     def _permutate_activation(self, network):
110 | 
111 |         possible_activations = ['relu', 'elu', 'tanh']
112 |         current_activation = network.activation
113 |         possible_activations.remove(current_activation)
114 |         new_activation = self.rng.choice(possible_activations, size=1)[0]
115
| net_dict = network.init_dict 116 | net_dict['activation'] = new_activation 117 | new_network = type(network)(**net_dict) 118 | new_network.load_state_dict(network.state_dict()) 119 | network = new_network 120 | 121 | return network 122 | 123 | def parameter_mutation(self, individual): 124 | 125 | offspring = individual.actor 126 | 127 | offspring = self.classic_parameter_mutation(offspring) 128 | individual.train_log["mutation"] = "classic_parameter" 129 | 130 | individual.actor = offspring 131 | return individual 132 | 133 | def regularize_weight(self, weight, mag): 134 | if weight > mag: weight = mag 135 | if weight < -mag: weight = -mag 136 | return weight 137 | 138 | def classic_parameter_mutation(self, network): 139 | mut_strength = self.cfg.mutation.mutation_sd 140 | num_mutation_frac = 0.1 141 | super_mut_strength = 10 142 | super_mut_prob = 0.05 143 | reset_prob = super_mut_prob + 0.05 144 | 145 | model_params = network.state_dict() 146 | 147 | potential_keys = [] 148 | for i, key in enumerate(model_params): # Mutate each param 149 | if not 'norm' in key: 150 | W = model_params[key] 151 | if len(W.shape) == 2: # Weights, no bias 152 | potential_keys.append(key) 153 | 154 | how_many = np.random.randint(1, len(potential_keys) + 1, 1)[0] 155 | chosen_keys = np.random.choice(potential_keys, how_many, replace=False) 156 | 157 | for key in chosen_keys: 158 | # References to the variable keys 159 | W = model_params[key] 160 | num_weights = W.shape[0] * W.shape[1] 161 | # Number of mutation instances 162 | num_mutations = fastrand.pcg32bounded(int(np.ceil(num_mutation_frac * num_weights))) 163 | for _ in range(num_mutations): 164 | ind_dim1 = fastrand.pcg32bounded(W.shape[0]) 165 | ind_dim2 = fastrand.pcg32bounded(W.shape[-1]) 166 | random_num = self.rng.uniform(0, 1) 167 | 168 | if random_num < super_mut_prob: # Super Mutation probability 169 | W[ind_dim1, ind_dim2] += self.rng.normal(0, np.abs(super_mut_strength * W[ind_dim1, ind_dim2])) 170 | elif random_num < reset_prob: # Reset probability 171 | W[ind_dim1, ind_dim2] = self.rng.normal(0, 1) 172 | else: # mutauion even normal 173 | W[ind_dim1, ind_dim2] += self.rng.normal(0, np.abs(mut_strength * W[ind_dim1, ind_dim2])) 174 | 175 | # Regularization hard limit 176 | W[ind_dim1, ind_dim2] = self.regularize_weight(W[ind_dim1, ind_dim2], 1000000) 177 | return network 178 | 179 | 180 | def architecture_mutate(self, individual): 181 | 182 | offspring_actor = individual.actor.clone() 183 | offspring_critic_1 = individual.critic_1.clone() 184 | if self.cfg.train.td3_double_q: 185 | offspring_critic_2 = individual.critic_2.clone() 186 | 187 | rand_numb = self.rng.uniform(0, 1) 188 | if rand_numb < self.cfg.mutation.new_layer_prob: 189 | offspring_actor.add_layer() 190 | offspring_critic_1.add_layer() 191 | if self.cfg.train.td3_double_q: 192 | offspring_critic_2.add_layer() 193 | individual.train_log["mutation"] = "architecture_new_layer" 194 | else: 195 | node_dict = offspring_actor.add_node() 196 | offspring_critic_1.add_node(**node_dict) 197 | if self.cfg.train.td3_double_q: 198 | offspring_critic_2.add_node(**node_dict) 199 | individual.train_log["mutation"] = "architecture_new_node" 200 | 201 | individual.actor = offspring_actor 202 | individual.critic_1 = offspring_critic_1 203 | if self.cfg.train.td3_double_q: 204 | individual.critic_2 = offspring_critic_2 205 | return individual 206 | -------------------------------------------------------------------------------- /searl/neuroevolution/searl_dqn.py: 
-------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from searl.neuroevolution.components.individual_dqn import DQNIndividual 8 | from searl.neuroevolution.components.replay_memory import ReplayMemory 9 | from searl.neuroevolution.evaluation_dqn import MPEvaluation 10 | from searl.neuroevolution.mutation_cnn import Mutations 11 | from searl.neuroevolution.tournament_selection import TournamentSelection 12 | from searl.neuroevolution.training_dqn import DQNTraining 13 | from searl.rl_algorithms.components.wrappers import make_atari, wrap_deepmind, wrap_pytorch 14 | from searl.utils.supporter import Supporter 15 | 16 | 17 | class SEARLforDQN(): 18 | 19 | def __init__(self, config, logger, checkpoint): 20 | 21 | self.cfg = config 22 | self.log = logger 23 | self.ckp = checkpoint 24 | 25 | torch.manual_seed(self.cfg.seed.torch) 26 | np.random.seed(self.cfg.seed.numpy) 27 | 28 | self.log.print_config(self.cfg) 29 | self.log.csv.fieldnames( 30 | ["epoch", "time_string", "eval_eps", "pre_fitness", "pre_rank", "post_fitness", "post_rank", "index", 31 | "parent_index", "mutation", "train_iterations", "train_losses", 32 | ] + list(self.cfg.rl.get_dict.keys())) 33 | 34 | self.log.log("initialize replay memory") 35 | 36 | self.replay_memory = ReplayMemory(capacity=self.cfg.train.replay_memory_size, batch_size=self.cfg.rl.batch_size) 37 | 38 | self.eval = MPEvaluation(config=self.cfg, logger=self.log, replay_memory=self.replay_memory) 39 | 40 | self.tournament = TournamentSelection(config=self.cfg) 41 | 42 | self.mutation = Mutations(config=self.cfg) 43 | 44 | self.training = DQNTraining(config=self.cfg, replay_memory=self.replay_memory) 45 | 46 | def initial_population(self): 47 | self.log.log("initialize population") 48 | population = [] 49 | for idx in range(self.cfg.nevo.population_size): 50 | 51 | if self.cfg.nevo.ind_memory: 52 | replay_memory = ReplayMemory(capacity=self.cfg.train.replay_memory_size, 53 | batch_size=self.cfg.rl.batch_size) 54 | else: 55 | replay_memory = False 56 | 57 | actor_config = copy.deepcopy(self.cfg.actor.get_dict) 58 | rl_config = copy.deepcopy(self.cfg.rl) 59 | 60 | indi = DQNIndividual(state_dim=self.cfg.state_dim, action_dim=self.cfg.action_dim, 61 | actor_config=actor_config, 62 | rl_config=rl_config, index=idx, replay_memory=replay_memory) 63 | population.append(indi) 64 | return population 65 | 66 | def evolve_population(self, population, epoch=1, num_frames=0): 67 | 68 | frames_since_mut = 0 69 | num_frames = num_frames 70 | epoch = epoch 71 | 72 | while True: 73 | epoch_time = time.time() 74 | self.log(f"##### START EPOCH {epoch}", time_step=num_frames) 75 | 76 | for ind in population: 77 | ind.train_log['epoch'] = epoch 78 | 79 | population_mean_fitness, population_var_fitness, eval_frames = \ 80 | self.log.log_func(self.eval.evaluate_population, population=population, 81 | exploration_noise=self.cfg.eval.exploration_noise, 82 | total_frames=num_frames) 83 | self.log("eval_frames", eval_frames) 84 | num_frames += eval_frames 85 | frames_since_mut += eval_frames 86 | 87 | self.log.population_info(population_mean_fitness, population_var_fitness, population, num_frames, epoch) 88 | 89 | self.ckp.save_object(population, name="population") 90 | self.log.log("save population") 91 | 92 | if num_frames >= self.cfg.train.num_frames: 93 | break 94 | 95 | if self.cfg.nevo.selection: 96 | elite, population = self.log.log_func(self.tournament.select, population) 
)
97 | test_fitness = self.eval.test_individual(elite, epoch) 98 | self.log(f"##### ELITE INFO {epoch}", time_step=num_frames) 99 | self.log("best_test_fitness", test_fitness, num_frames) 100 | 101 | if self.cfg.nevo.mutation: 102 | population = self.log.log_func(self.mutation.mutation, population) 103 | 104 | if self.cfg.nevo.training: 105 | iterations = min( 106 | max(self.cfg.train.min_train_steps, int(self.cfg.rl.train_frames_fraction * eval_frames)), 107 | self.cfg.train.max_train_steps) 108 | self.log("training_iterations", iterations) 109 | population = self.log.log_func(self.training.train, population=population, iterations=iterations) 110 | 111 | self.log(f"##### END EPOCH {epoch} - runtime {time.time() - epoch_time:6.1f}", time_step=num_frames) 112 | self.log("epoch", epoch, time_step=num_frames) 113 | self.log(f"##### ################################################# #####") 114 | self.cfg.expt.set_attr("epoch", epoch) 115 | self.cfg.expt.set_attr("num_frames", num_frames) 116 | epoch += 1 117 | 118 | self.log("FINISH", time_step=num_frames) 119 | self.replay_memory.close() 120 | 121 | def close(self): 122 | self.replay_memory.close() 123 | 124 | 125 | def start_searl_dqn_run(config_dict, expt_dir): 126 | sup = Supporter(experiments_dir=expt_dir, config_dict=config_dict, count_expt=True) 127 | cfg = sup.get_config() 128 | log = sup.get_logger() 129 | 130 | env = make_atari(cfg.env.name) 131 | env = wrap_deepmind(env) 132 | env = wrap_pytorch(env) 133 | cfg.set_attr("action_dim", env.action_space.n) 134 | cfg.set_attr("state_dim", env.observation_space.shape) 135 | 136 | searl = SEARLforDQN(config=cfg, logger=log, checkpoint=sup.ckp) 137 | 138 | population = searl.initial_population() 139 | searl.evolve_population(population) 140 | -------------------------------------------------------------------------------- /searl/neuroevolution/searl_td3.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | 4 | import gym 5 | import numpy as np 6 | import torch 7 | import torch.multiprocessing as mp 8 | 9 | from searl.neuroevolution.components.individual_td3 import Individual 10 | from searl.neuroevolution.components.replay_memory import MPReplayMemory, ReplayMemory 11 | from searl.neuroevolution.evaluation_td3 import MPEvaluation 12 | from searl.neuroevolution.mutation_mlp import Mutations 13 | from searl.neuroevolution.tournament_selection import TournamentSelection 14 | from searl.neuroevolution.training_td3 import TD3Training 15 | from searl.utils.supporter import Supporter 16 | 17 | 18 | class SEARLforTD3(): 19 | 20 | def __init__(self, config, logger, checkpoint): 21 | 22 | self.cfg = config 23 | self.log = logger 24 | self.ckp = checkpoint 25 | 26 | torch.manual_seed(self.cfg.seed.torch) 27 | np.random.seed(self.cfg.seed.numpy) 28 | 29 | self.log.print_config(self.cfg) 30 | self.log.csv.fieldnames( 31 | ["epoch", "time_string", "eval_eps", "pre_fitness", "pre_rank", "post_fitness", "post_rank", "index", 32 | "parent_index", "mutation", "train_iterations", 33 | ] + list(self.cfg.rl.get_dict.keys())) 34 | 35 | self.log.log("initialize replay memory") 36 | if self.cfg.nevo.ind_memory: 37 | push_queue = None 38 | sample_queue = None 39 | else: 40 | self.replay_memory = MPReplayMemory(seed=self.cfg.seed.replay_memory, 41 | capacity=self.cfg.train.replay_memory_size, 42 | batch_size=self.cfg.rl.batch_size, 43 | reuse_batch=self.cfg.nevo.reuse_batch) 44 | push_queue = self.replay_memory.get_push_queue() 45 | sample_queue = 
self.replay_memory.get_sample_queue() 46 | 47 | self.eval = MPEvaluation(config=self.cfg, logger=self.log, push_queue=push_queue) 48 | 49 | self.tournament = TournamentSelection(config=self.cfg) 50 | 51 | self.mutation = Mutations(config=self.cfg, replay_sample_queue=sample_queue) 52 | 53 | self.training = TD3Training(config=self.cfg, replay_sample_queue=sample_queue) 54 | 55 | def initial_population(self): 56 | self.log.log("initialize population") 57 | population = [] 58 | for idx in range(self.cfg.nevo.population_size): 59 | 60 | if self.cfg.nevo.ind_memory: 61 | replay_memory = ReplayMemory(capacity=self.cfg.train.replay_memory_size, 62 | batch_size=self.cfg.rl.batch_size) 63 | else: 64 | replay_memory = False 65 | 66 | if self.cfg.nevo.init_random: 67 | 68 | min_lr = 0.00001 69 | max_lr = 0.005 70 | 71 | actor_config = copy.deepcopy(self.cfg.actor.get_dict) 72 | critic_config = copy.deepcopy(self.cfg.critic.get_dict) 73 | rl_config = copy.deepcopy(self.cfg.rl) 74 | 75 | actor_config["activation"] = np.random.choice(['relu', 'tanh', 'elu'], 1)[0] 76 | critic_config["activation"] = np.random.choice(['relu', 'tanh', 'elu'], 1)[0] 77 | 78 | lr_actor = np.exp(np.random.uniform(np.log(min_lr), np.log(max_lr), 1))[0] 79 | lr_critic = np.exp(np.random.uniform(np.log(min_lr), np.log(max_lr), 1))[0] 80 | 81 | rl_config.set_attr("lr_actor", lr_actor) 82 | rl_config.set_attr("lr_critic", lr_critic) 83 | self.log(f"init {idx} rl_config: ", rl_config.get_dict) 84 | self.log(f"init {idx} actor_config: ", actor_config) 85 | 86 | else: 87 | actor_config = copy.deepcopy(self.cfg.actor.get_dict) 88 | critic_config = copy.deepcopy(self.cfg.critic.get_dict) 89 | rl_config = copy.deepcopy(self.cfg.rl) 90 | 91 | indi = Individual(state_dim=self.cfg.state_dim, action_dim=self.cfg.action_dim, 92 | actor_config=actor_config, 93 | critic_config=critic_config, 94 | rl_config=rl_config, index=idx, td3_double_q=self.cfg.train.td3_double_q, 95 | replay_memory=replay_memory) 96 | population.append(indi) 97 | return population 98 | 99 | def evolve_population(self, population, epoch=1, num_frames=0): 100 | 101 | frames_since_mut = 0 102 | num_frames = num_frames 103 | epoch = epoch 104 | ctx = mp.get_context('spawn') 105 | 106 | while True: 107 | pool = ctx.Pool(processes=self.cfg.nevo.worker, maxtasksperchild=1000) 108 | epoch_time = time.time() 109 | self.log(f"##### START EPOCH {epoch}", time_step=num_frames) 110 | 111 | for ind in population: 112 | ind.train_log['epoch'] = epoch 113 | 114 | population_mean_fitness, population_var_fitness, eval_frames = \ 115 | self.log.log_func(self.eval.evaluate_population, population=population, 116 | exploration_noise=self.cfg.eval.exploration_noise, 117 | total_frames=num_frames, pool=pool) 118 | num_frames += eval_frames 119 | frames_since_mut += eval_frames 120 | 121 | self.log.population_info(population_mean_fitness, population_var_fitness, population, num_frames, epoch) 122 | 123 | self.ckp.save_object(population, name="population") 124 | self.log.log("save population") 125 | if not self.cfg.nevo.ind_memory: 126 | rm_dict = self.replay_memory.save() 127 | if isinstance(rm_dict, str): 128 | self.log("save replay memory failed") 129 | else: 130 | self.log("replay memory size", len(rm_dict['memory'])) 131 | self.ckp.save_object([rm_dict], name="replay_memory") 132 | self.log("save replay memory") 133 | 134 | if num_frames >= self.cfg.train.num_frames: 135 | break 136 | 137 | if self.cfg.nevo.selection: 138 | elite, population = self.log.log_func(self.tournament.select, 
population) 139 | test_fitness = self.eval.test_individual(elite, epoch) 140 | self.log(f"##### ELITE INFO {epoch}", time_step=num_frames) 141 | self.log("best_test_fitness", test_fitness, num_frames) 142 | 143 | if self.cfg.nevo.mutation: 144 | population = self.log.log_func(self.mutation.mutation, population) 145 | 146 | if self.cfg.nevo.training: 147 | population = self.log.log_func(self.training.train, population=population, eval_frames=eval_frames, 148 | pool=pool) 149 | 150 | self.log(f"##### END EPOCH {epoch} - runtime {time.time() - epoch_time:6.1f}", time_step=num_frames) 151 | self.log("epoch", epoch, time_step=num_frames) 152 | self.log(f"##### ################################################# #####") 153 | self.cfg.expt.set_attr("epoch", epoch) 154 | self.cfg.expt.set_attr("num_frames", num_frames) 155 | epoch += 1 156 | 157 | pool.terminate() 158 | pool.join() 159 | 160 | self.log("FINISH", time_step=num_frames) 161 | self.replay_memory.close() 162 | 163 | def close(self): 164 | self.replay_memory.close() 165 | 166 | 167 | def start_searl_td3_run(config, expt_dir): 168 | with Supporter(experiments_dir=expt_dir, config_dict=config, count_expt=True) as sup: 169 | cfg = sup.get_config() 170 | log = sup.get_logger() 171 | 172 | env = gym.make(cfg.env.name) 173 | cfg.set_attr("action_dim", env.action_space.shape[0]) 174 | cfg.set_attr("state_dim", env.observation_space.shape[0]) 175 | 176 | searl = SEARLforTD3(config=cfg, logger=log, checkpoint=sup.ckp) 177 | 178 | population = searl.initial_population() 179 | searl.evolve_population(population) 180 | -------------------------------------------------------------------------------- /searl/neuroevolution/tournament_selection.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | 4 | 5 | class TournamentSelection(): 6 | 7 | def __init__(self, config): 8 | self.cfg = config 9 | 10 | def _tournament(self, fitness_values): 11 | selection = np.random.randint(0, len(fitness_values), size=self.cfg.nevo.tournament_size) 12 | selection_values = [fitness_values[i] for i in selection] 13 | winner = selection[np.argmax(selection_values)] 14 | return winner 15 | 16 | def select(self, population): 17 | last_fitness = [indi.fitness[-1] for indi in population] 18 | rank = np.argsort(last_fitness).argsort() 19 | 20 | max_id = max([ind.index for ind in population]) 21 | 22 | elite = copy.deepcopy([population[np.argsort(rank)[-1]]][0]) 23 | 24 | new_population = [] 25 | if self.cfg.nevo.elitism: 26 | new_population.append(elite.clone()) 27 | selection_size = self.cfg.nevo.population_size - 1 28 | else: 29 | selection_size = self.cfg.nevo.population_size 30 | 31 | for idx in range(selection_size): 32 | max_id += 1 33 | actor_parent = population[self._tournament(rank)] 34 | new_individual = actor_parent.clone(max_id) 35 | new_individual.train_log["parent_index"] = actor_parent.index 36 | new_population.append(new_individual) 37 | 38 | return elite, new_population 39 | -------------------------------------------------------------------------------- /searl/neuroevolution/training_dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 5 | 6 | 7 | def get_optimizer(name): 8 | if name == "adam": 9 | return torch.optim.Adam 10 | elif name == "adadelta": 11 | return torch.optim.Adadelta 12 | elif name == "adamax": 13 | return 
torch.optim.Adamax 14 | elif name == "rmsprop": 15 | return torch.optim.RMSprop 16 | elif name == "sdg": 17 | return torch.optim.SGD 18 | 19 | 20 | class DQNTraining(): 21 | 22 | def __init__(self, config, replay_memory, replay_priority_queue=None): 23 | self.cfg = config 24 | self.rng = np.random.RandomState(self.cfg.seed.training) 25 | self.replay_sample_queue = replay_memory 26 | self.replay_priority_queue = replay_priority_queue 27 | self.args = config.rl 28 | 29 | @staticmethod 30 | def update_parameters(indi, replay_sample_queue, iterations): 31 | args = indi.rl_config 32 | Opti = get_optimizer(args.optimizer) 33 | 34 | actor = indi.actor 35 | actor_target = type(actor)(**actor.init_dict) 36 | actor_target.load_state_dict(actor.state_dict()) 37 | actor.to(device) 38 | actor.train() 39 | actor_target.to(device) 40 | actor_optim = Opti(actor.parameters(), lr=args.lr_actor) 41 | 42 | losses = [] 43 | for it in range(iterations): 44 | transistion_list = replay_sample_queue.get() 45 | state_list = [] 46 | action_batch = [] 47 | next_state_batch = [] 48 | reward_batch = [] 49 | done_batch = [] 50 | for transition in transistion_list: 51 | state_list.append(transition.state) 52 | action_batch.append(transition.action) 53 | next_state_batch.append(transition.next_state) 54 | reward_batch.append(transition.reward) 55 | done_batch.append(transition.done) 56 | 57 | state = torch.stack(state_list, dim=0).to(device) 58 | action = torch.stack(action_batch, dim=0).squeeze().to(device) 59 | next_state = torch.stack(next_state_batch, dim=0).to(device) 60 | rewards = torch.stack(reward_batch, dim=0).squeeze().to(device) 61 | dones = torch.stack(done_batch, dim=0).squeeze().to(device) 62 | 63 | with torch.no_grad(): 64 | batch_size = next_state.size(0) 65 | 66 | delta_z = float(args.Vmax - args.Vmin) / (args.num_atoms - 1) 67 | support = torch.linspace(args.Vmin, args.Vmax, args.num_atoms).to(device) 68 | 69 | next_dist = actor_target(next_state) * support 70 | next_action = next_dist.sum(2).max(1)[1] 71 | next_action = next_action.unsqueeze(1).unsqueeze(1).expand(next_dist.size(0), 1, next_dist.size(2)) 72 | next_dist = next_dist.gather(1, next_action).squeeze(1) 73 | 74 | rewards = rewards.unsqueeze(1).expand_as(next_dist) 75 | dones = dones.unsqueeze(1).expand_as(next_dist) 76 | support = support.unsqueeze(0).expand_as(next_dist) 77 | 78 | Tz = rewards + (1 - dones) * 0.99 * support 79 | Tz = Tz.clamp(min=args.Vmin, max=args.Vmax) 80 | b = (Tz - args.Vmin) / delta_z 81 | l = b.floor().long() 82 | u = b.ceil().long() 83 | 84 | offset = torch.linspace(0, (batch_size - 1) * args.num_atoms, batch_size).long() \ 85 | .unsqueeze(1).expand(batch_size, args.num_atoms).to(device) 86 | 87 | proj_dist = torch.zeros(next_dist.size()).to(device) 88 | proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)) 89 | proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)) 90 | 91 | dist = actor(state) 92 | action = action.unsqueeze(1).unsqueeze(1).expand(args.batch_size, 1, args.num_atoms) 93 | dist = dist.gather(1, action).squeeze(1) 94 | dist.data.clamp_(0.01, 0.99) 95 | loss = -(proj_dist * dist.log()).sum(1) 96 | loss = loss.mean() 97 | 98 | actor_optim.zero_grad() 99 | loss.backward() 100 | actor_optim.step() 101 | 102 | if it % 5 == 0: 103 | actor.reset_noise() 104 | actor_target.reset_noise() 105 | 106 | losses.append(loss.detach().cpu().numpy()) 107 | 108 | if it % 2 == 0 and it != 0: 109 | for param, target_param in 
zip(actor.parameters(), actor_target.parameters()): 110 | target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) 111 | 112 | indi.actor = actor.cpu().clone() 113 | indi.train_log['train_iterations'] = iterations 114 | indi.train_log['train_losses'] = np.mean(losses).tolist() 115 | indi.train_log.update(args.get_dict) 116 | 117 | return indi 118 | 119 | def train(self, population, iterations, pool=None): 120 | 121 | pop_id_lookup = [ind.index for ind in population] 122 | 123 | if self.cfg.nevo.ind_memory: 124 | args_list = [(indi, indi.replay_memory, iterations) for indi in population] 125 | else: 126 | args_list = [(indi, self.replay_sample_queue, iterations) for indi in population] 127 | 128 | trained_pop = [] 129 | for args in args_list: 130 | trained_pop.append(self.update_parameters(*args)) 131 | 132 | trained_pop = sorted(trained_pop, key=lambda i: pop_id_lookup.index(i.index)) 133 | 134 | return trained_pop 135 | -------------------------------------------------------------------------------- /searl/neuroevolution/training_td3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 6 | 7 | 8 | def get_optimizer(name): 9 | if name == "adam": 10 | return torch.optim.Adam 11 | elif name == "adadelta": 12 | return torch.optim.Adadelta 13 | elif name == "adamax": 14 | return torch.optim.Adamax 15 | elif name == "rmsprop": 16 | return torch.optim.RMSprop 17 | elif name == "sdg": 18 | return torch.optim.SGD 19 | 20 | 21 | class TD3Training(): 22 | 23 | def __init__(self, config, replay_sample_queue): 24 | self.cfg = config 25 | self.rng = np.random.RandomState(self.cfg.seed.training) 26 | self.replay_sample_queue = replay_sample_queue 27 | 28 | self.args = config.rl 29 | 30 | @staticmethod 31 | def update_parameters(indi, replay_sample_queue, iterations): 32 | args = indi.rl_config 33 | gamma = args.gamma 34 | tau = args.tau 35 | Opti = get_optimizer(args.optimizer) 36 | 37 | actor = indi.actor 38 | actor_target = type(actor)(**actor.init_dict) 39 | actor_target.load_state_dict(actor.state_dict()) 40 | actor.to(device) 41 | actor.train() 42 | actor_target.to(device) 43 | actor_optim = Opti(actor.parameters(), lr=args.lr_actor) 44 | 45 | critic_1 = indi.critic_1 46 | critic_1_target = type(critic_1)(**critic_1.init_dict) 47 | critic_1_target.load_state_dict(critic_1.state_dict()) 48 | critic_1.to(device) 49 | critic_1.train() 50 | critic_1_target.to(device) 51 | critic_1_optim = Opti(critic_1.parameters(), lr=args.lr_critic) 52 | 53 | critic_2 = indi.critic_2 54 | critic_2_target = type(critic_2)(**critic_2.init_dict) 55 | critic_2_target.load_state_dict(critic_2.state_dict()) 56 | critic_2.to(device) 57 | critic_2.train() 58 | critic_2_target.to(device) 59 | critic_2_optim = Opti(critic_2.parameters(), lr=args.lr_critic) 60 | 61 | for it in range(iterations): 62 | 63 | transistion_list = replay_sample_queue.get() 64 | 65 | state_list = [] 66 | action_batch = [] 67 | next_state_batch = [] 68 | reward_batch = [] 69 | done_batch = [] 70 | for transition in transistion_list: 71 | state_list.append(torch.Tensor(transition.state)) 72 | action_batch.append(torch.Tensor(transition.action)) 73 | next_state_batch.append(torch.Tensor(transition.next_state)) 74 | reward_batch.append(torch.Tensor(transition.reward)) 75 | done_batch.append(torch.Tensor(transition.done)) 76 | 77 | state_batch = 
torch.stack(state_list, dim=0) 78 | action_batch = torch.stack(action_batch, dim=0) 79 | next_state_batch = torch.stack(next_state_batch, dim=0) 80 | reward_batch = torch.stack(reward_batch, dim=0) 81 | done_batch = torch.stack(done_batch, dim=0) 82 | 83 | state = state_batch.to(device) 84 | action = action_batch.to(device) 85 | reward = reward_batch.to(device) 86 | done = 1 - done_batch.to(device) 87 | next_state = next_state_batch.to(device) 88 | 89 | with torch.no_grad(): 90 | noise = (torch.randn_like(action) * args.td3_policy_noise).clamp(-args.td3_noise_clip, 91 | args.td3_noise_clip) 92 | next_action = (actor_target(next_state) + noise).clamp(-1, 1) 93 | target_Q1 = critic_1_target(torch.cat([next_state, next_action], 1)) 94 | target_Q2 = critic_2_target(torch.cat([next_state, next_action], 1)) 95 | target_Q = torch.min(target_Q1, target_Q2) 96 | target_Q = reward + (done * gamma * target_Q) 97 | 98 | current_Q1 = critic_1(torch.cat([state, action], 1)) 99 | current_Q2 = critic_2(torch.cat([state, action], 1)) 100 | 101 | critic_loss_1 = F.mse_loss(current_Q1, target_Q) 102 | critic_1_optim.zero_grad() 103 | critic_loss_1.backward() 104 | for p in critic_1.parameters(): 105 | p.grad.data.clamp_(max=args.clip_grad_norm) 106 | critic_1_optim.step() 107 | 108 | critic_loss_2 = F.mse_loss(current_Q2, target_Q) 109 | critic_2_optim.zero_grad() 110 | critic_loss_2.backward() 111 | for p in critic_2.parameters(): 112 | p.grad.data.clamp_(max=args.clip_grad_norm) 113 | critic_2_optim.step() 114 | 115 | if it % args.td3_update_freq == 0: 116 | actor_loss = -critic_1(torch.cat([state, actor(state)], 1)) 117 | actor_loss = torch.mean(actor_loss) 118 | 119 | actor_optim.zero_grad() 120 | actor_loss.backward() 121 | for p in actor.parameters(): 122 | p.grad.data.clamp_(max=args.clip_grad_norm) 123 | actor_optim.step() 124 | 125 | for param, target_param in zip(actor.parameters(), actor_target.parameters()): 126 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) 127 | 128 | for param, target_param in zip(critic_1.parameters(), critic_1_target.parameters()): 129 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) 130 | 131 | for param, target_param in zip(critic_2.parameters(), critic_2_target.parameters()): 132 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) 133 | 134 | actor_optim.zero_grad() 135 | critic_1_optim.zero_grad() 136 | if indi.td3_double_q: 137 | critic_2_optim.zero_grad() 138 | 139 | indi.actor = actor.cpu().clone() 140 | indi.critic_1 = critic_1.cpu().clone() 141 | if indi.td3_double_q: 142 | indi.critic_2 = critic_2.cpu().clone() 143 | indi.train_log['train_iterations'] = iterations 144 | indi.train_log.update(args.get_dict) 145 | 146 | return indi 147 | 148 | def train(self, population, eval_frames, pool=None): 149 | pop_id_lookup = [ind.index for ind in population] 150 | iterations = max(self.cfg.train.min_train_steps, int(self.cfg.rl.train_frames_fraction * eval_frames)) 151 | 152 | if self.cfg.nevo.ind_memory: 153 | args_list = [(indi, indi.replay_memory, iterations) for indi in population] 154 | else: 155 | args_list = [(indi, self.replay_sample_queue, iterations) for indi in population] 156 | 157 | result_dicts = [pool.apply_async(self.update_parameters, args) for args in args_list] 158 | trained_pop = [res.get() for res in result_dicts] 159 | trained_pop = sorted(trained_pop, key=lambda i: pop_id_lookup.index(i.index)) 160 | 161 | return trained_pop 162 | 
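For orientation, the following is a minimal usage sketch (not part of the repository) of the replay interface that TD3Training.update_parameters consumes: any object whose get() returns a list of batch_size Transition tuples will do, which is what both the per-individual ReplayMemory and the MPReplayMemory sample queue provide. The toy dimensions, capacity, and batch size below are assumptions for illustration only.

# usage sketch (not repository code)
import numpy as np

from searl.neuroevolution.components.replay_memory import ReplayMemory
from searl.neuroevolution.components.utils import Transition

state_dim, action_dim = 3, 1                        # toy sizes (assumption)
memory = ReplayMemory(capacity=1000, batch_size=8)  # per-individual memory, as in initial_population()

for _ in range(64):                                 # fill with dummy transitions
    s = np.random.randn(state_dim).astype(np.float32)
    a = np.random.uniform(-1, 1, action_dim).astype(np.float32)
    memory.add(Transition(s, a, s, np.array([0.0], dtype=np.float32),
                          np.array([0], dtype='uint8')))

batch = memory.get()                                # list of batch_size Transition tuples
states = np.stack([t.state for t in batch])         # same stacking update_parameters performs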
-------------------------------------------------------------------------------- /searl/rl_algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/rl_algorithms/__init__.py -------------------------------------------------------------------------------- /searl/rl_algorithms/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/rl_algorithms/components/__init__.py -------------------------------------------------------------------------------- /searl/rl_algorithms/components/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size=1e6): 5 | self.storage = [] 6 | self.max_size = max_size 7 | self.ptr = 0 8 | 9 | def add(self, transistion): 10 | if len(self.storage) == self.max_size: 11 | self.storage[int(self.ptr)] = transistion 12 | self.ptr = (self.ptr + 1) % self.max_size 13 | else: 14 | self.storage.append(transistion) 15 | 16 | def sample(self, batch_size): 17 | ind = np.random.randint(0, len(self.storage), size=batch_size) 18 | 19 | transition_list = [] 20 | for i in ind: 21 | transition_list.append(self.storage[i]) 22 | 23 | return transition_list 24 | -------------------------------------------------------------------------------- /searl/rl_algorithms/components/wrappers.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import cv2 4 | import gym 5 | import numpy as np 6 | from gym import spaces 7 | 8 | cv2.ocl.setUseOpenCL(False) 9 | 10 | 11 | class NoopResetEnv(gym.Wrapper): 12 | def __init__(self, env, noop_max=30): 13 | """Sample initial states by taking random number of no-ops on reset. 14 | No-op is assumed to be action 0. 
15 | """ 16 | gym.Wrapper.__init__(self, env) 17 | self.noop_max = noop_max 18 | self.override_num_noops = None 19 | self.noop_action = 0 20 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 21 | 22 | def reset(self, **kwargs): 23 | """ Do no-op action for a number of steps in [1, noop_max].""" 24 | self.env.reset(**kwargs) 25 | if self.override_num_noops is not None: 26 | noops = self.override_num_noops 27 | else: 28 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) # pylint: disable=E1101 29 | assert noops > 0 30 | obs = None 31 | for _ in range(noops): 32 | obs, _, done, _ = self.env.step(self.noop_action) 33 | if done: 34 | obs = self.env.reset(**kwargs) 35 | return obs 36 | 37 | def step(self, ac): 38 | return self.env.step(ac) 39 | 40 | 41 | class FireResetEnv(gym.Wrapper): 42 | def __init__(self, env): 43 | """Take action on reset for environments that are fixed until firing.""" 44 | gym.Wrapper.__init__(self, env) 45 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 46 | assert len(env.unwrapped.get_action_meanings()) >= 3 47 | 48 | def reset(self, **kwargs): 49 | self.env.reset(**kwargs) 50 | obs, _, done, _ = self.env.step(1) 51 | if done: 52 | self.env.reset(**kwargs) 53 | obs, _, done, _ = self.env.step(2) 54 | if done: 55 | self.env.reset(**kwargs) 56 | return obs 57 | 58 | def step(self, ac): 59 | return self.env.step(ac) 60 | 61 | 62 | class EpisodicLifeEnv(gym.Wrapper): 63 | def __init__(self, env): 64 | """Make end-of-life == end-of-episode, but only reset on true game over. 65 | Done by DeepMind for the DQN and co. since it helps value estimation. 66 | """ 67 | gym.Wrapper.__init__(self, env) 68 | self.lives = 0 69 | self.was_real_done = True 70 | 71 | def step(self, action): 72 | obs, reward, done, info = self.env.step(action) 73 | self.was_real_done = done 74 | # check current lives, make loss of life terminal, 75 | # then update lives to handle bonus lives 76 | lives = self.env.unwrapped.ale.lives() 77 | if lives < self.lives and lives > 0: 78 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 79 | # so its important to keep lives > 0, so that we only reset once 80 | # the environment advertises done. 81 | done = True 82 | self.lives = lives 83 | return obs, reward, done, info 84 | 85 | def reset(self, **kwargs): 86 | """Reset only when lives are exhausted. 87 | This way all states are still reachable even though lives are episodic, 88 | and the learner need not know about any of this behind-the-scenes. 
89 | """ 90 | if self.was_real_done: 91 | obs = self.env.reset(**kwargs) 92 | else: 93 | # no-op step to advance from terminal/lost life state 94 | obs, _, _, _ = self.env.step(0) 95 | self.lives = self.env.unwrapped.ale.lives() 96 | return obs 97 | 98 | 99 | class MaxAndSkipEnv(gym.Wrapper): 100 | def __init__(self, env, skip=4): 101 | """Return only every `skip`-th frame""" 102 | gym.Wrapper.__init__(self, env) 103 | # most recent raw observations (for max pooling across time steps) 104 | self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=np.uint8) 105 | self._skip = skip 106 | 107 | def reset(self): 108 | return self.env.reset() 109 | 110 | def step(self, action): 111 | """Repeat action, sum reward, and max over last observations.""" 112 | total_reward = 0.0 113 | done = None 114 | for i in range(self._skip): 115 | obs, reward, done, info = self.env.step(action) 116 | if i == self._skip - 2: self._obs_buffer[0] = obs 117 | if i == self._skip - 1: self._obs_buffer[1] = obs 118 | total_reward += reward 119 | if done: 120 | break 121 | # Note that the observation on the done=True frame 122 | # doesn't matter 123 | max_frame = self._obs_buffer.max(axis=0) 124 | 125 | return max_frame, total_reward, done, info 126 | 127 | def reset(self, **kwargs): 128 | return self.env.reset(**kwargs) 129 | 130 | 131 | class ClipRewardEnv(gym.RewardWrapper): 132 | def __init__(self, env): 133 | gym.RewardWrapper.__init__(self, env) 134 | 135 | def reward(self, reward): 136 | """Bin reward to {+1, 0, -1} by its sign.""" 137 | return np.sign(reward) 138 | 139 | 140 | class WarpFrame(gym.ObservationWrapper): 141 | def __init__(self, env): 142 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 143 | gym.ObservationWrapper.__init__(self, env) 144 | self.width = 84 145 | self.height = 84 146 | self.observation_space = spaces.Box(low=0, high=255, 147 | shape=(self.height, self.width, 1), dtype=np.uint8) 148 | 149 | def observation(self, frame): 150 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 151 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 152 | return frame[:, :, None] 153 | 154 | 155 | class FrameStack(gym.Wrapper): 156 | def __init__(self, env, k): 157 | """Stack k last frames. 158 | Returns lazy array, which is much more memory efficient. 159 | See Also 160 | -------- 161 | baselines.common.atari_wrappers.LazyFrames 162 | """ 163 | gym.Wrapper.__init__(self, env) 164 | self.k = k 165 | self.frames = deque([], maxlen=k) 166 | shp = env.observation_space.shape 167 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 168 | 169 | def reset(self): 170 | ob = self.env.reset() 171 | for _ in range(self.k): 172 | self.frames.append(ob) 173 | return self._get_ob() 174 | 175 | def step(self, action): 176 | ob, reward, done, info = self.env.step(action) 177 | self.frames.append(ob) 178 | return self._get_ob(), reward, done, info 179 | 180 | def _get_ob(self): 181 | assert len(self.frames) == self.k 182 | return LazyFrames(list(self.frames)) 183 | 184 | 185 | class ScaledFloatFrame(gym.ObservationWrapper): 186 | def __init__(self, env): 187 | gym.ObservationWrapper.__init__(self, env) 188 | 189 | def observation(self, observation): 190 | # careful! This undoes the memory optimization, use 191 | # with smaller replay buffers only. 
192 | return np.array(observation).astype(np.float32) / 255.0 193 | 194 | 195 | class LazyFrames(object): 196 | def __init__(self, frames): 197 | """This object ensures that common frames between the observations are only stored once. 198 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 199 | buffers. 200 | This object should only be converted to numpy array before being passed to the model. 201 | You'd not believe how complex the previous solution was.""" 202 | self._frames = frames 203 | self._out = None 204 | 205 | def _force(self): 206 | if self._out is None: 207 | self._out = np.concatenate(self._frames, axis=2) 208 | self._frames = None 209 | return self._out 210 | 211 | def __array__(self, dtype=None): 212 | out = self._force() 213 | if dtype is not None: 214 | out = out.astype(dtype) 215 | return out 216 | 217 | def __len__(self): 218 | return len(self._force()) 219 | 220 | def __getitem__(self, i): 221 | return self._force()[i] 222 | 223 | 224 | def make_atari(env_id): 225 | env = gym.make(env_id) 226 | assert 'NoFrameskip' in env.spec.id 227 | env = NoopResetEnv(env, noop_max=30) 228 | env = MaxAndSkipEnv(env, skip=4) 229 | return env 230 | 231 | 232 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): 233 | """Configure environment for DeepMind-style Atari. 234 | """ 235 | if episode_life: 236 | env = EpisodicLifeEnv(env) 237 | if 'FIRE' in env.unwrapped.get_action_meanings(): 238 | env = FireResetEnv(env) 239 | env = WarpFrame(env) 240 | if scale: 241 | env = ScaledFloatFrame(env) 242 | if clip_rewards: 243 | env = ClipRewardEnv(env) 244 | if frame_stack: 245 | env = FrameStack(env, 4) 246 | return env 247 | 248 | 249 | class ImageToPyTorch(gym.ObservationWrapper): 250 | """ 251 | Image shape to num_channels x weight x height 252 | """ 253 | 254 | def __init__(self, env): 255 | super(ImageToPyTorch, self).__init__(env) 256 | old_shape = self.observation_space.shape 257 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.uint8) 258 | 259 | def observation(self, observation): 260 | return np.swapaxes(observation, 2, 0) 261 | 262 | 263 | def wrap_pytorch(env): 264 | return ImageToPyTorch(env) 265 | -------------------------------------------------------------------------------- /searl/rl_algorithms/dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from searl.neuroevolution.training_td3 import get_optimizer 5 | from searl.neuroevolution.components.utils import Transition 6 | from searl.neuroevolution.components.envolvable_cnn import EvolvableCnnDQN 7 | from searl.rl_algorithms.components.wrappers import make_atari, wrap_deepmind, wrap_pytorch 8 | from searl.rl_algorithms.components.replay_memory import ReplayBuffer 9 | from searl.utils.supporter import Supporter 10 | 11 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 12 | print("CUDA", device == torch.device("cuda"), device) 13 | 14 | 15 | class DQN(object): 16 | 17 | def __init__(self, config, logger, checkpoint): 18 | self.cfg = config 19 | self.log = logger 20 | self.ckp = checkpoint 21 | 22 | env = make_atari(self.cfg.env.name) 23 | env = wrap_deepmind(env) 24 | env = wrap_pytorch(env) 25 | self.env = env 26 | 27 | self.cfg.set_attr("action_dim", self.env.action_space.n) 28 | self.cfg.set_attr("state_dim", self.env.observation_space.shape) 29 | 30 | # Set seeds 31 | 
self.env.seed(seed=self.cfg.seed.env) 32 | torch.manual_seed(self.cfg.seed.torch) 33 | np.random.seed(self.cfg.seed.numpy) 34 | 35 | self.Vmin = self.cfg.actor.Vmin 36 | self.Vmax = self.cfg.actor.Vmax 37 | self.num_atoms = self.cfg.actor.num_atoms 38 | self.batch_size = self.cfg.dqn.batch_size 39 | 40 | self.tau = 0.005 41 | 42 | self.actor = EvolvableCnnDQN(input_shape=self.cfg.state_dim, num_actions=self.cfg.action_dim, device=device, 43 | **self.cfg.actor.get_dict).to(device) 44 | 45 | Opti = get_optimizer(self.cfg.dqn.optimizer) 46 | self.actor_optim = Opti(self.actor.parameters(), lr=self.cfg.dqn.lr_actor) 47 | 48 | self.actor_target = type(self.actor)(**self.actor.init_dict).to(device) 49 | self.actor_target.load_state_dict(self.actor.state_dict()) 50 | 51 | self.replay_memory = ReplayBuffer(self.cfg.dqn.rm_capacity) 52 | 53 | self.log.print_config(self.cfg) 54 | 55 | def evaluate_policy(self, eval_episodes): 56 | episode_reward_list = [] 57 | for _ in range(eval_episodes): 58 | state = self.env.reset() 59 | done = False 60 | episode_reward = 0 61 | while not done: 62 | action = self.actor.act(state) 63 | next_state, reward, done, info = self.env.step(action) # Simulate one step in environment 64 | state = next_state 65 | episode_reward += reward 66 | 67 | episode_reward_list.append(episode_reward) 68 | 69 | avg_reward = np.mean(episode_reward_list) 70 | 71 | self.log("---------------------------------------") 72 | self.log("Evaluation over %d episodes: %f" % (eval_episodes, avg_reward)) 73 | self.log("---------------------------------------") 74 | 75 | return avg_reward 76 | 77 | def perform_learning(self): 78 | 79 | self.log("START LEARNING") 80 | 81 | total_timesteps = 0 82 | timesteps_since_eval = 0 83 | episode_reward = 0 84 | episode_timesteps = 0 85 | reset_timesteps = 0 86 | episode_num = 0 87 | done = True 88 | 89 | while total_timesteps < self.cfg.dqn.num_frames: 90 | 91 | if done: 92 | if total_timesteps != 0 and self.replay_memory.storage.__len__() > self.cfg.dqn.replay_initial: 93 | if ( 94 | self.cfg.dqn.reset_target or self.cfg.dqn.recreate_optim) and reset_timesteps >= self.cfg.dqn.min_eval_steps: 95 | self.train(episode_timesteps, reinit_optim=self.cfg.dqn.recreate_optim, 96 | reinit_target=self.cfg.dqn.reset_target) 97 | reset_timesteps = 0 98 | else: 99 | self.train(episode_timesteps) 100 | 101 | # Evaluate episode 102 | if timesteps_since_eval >= self.cfg.dqn.eval_freq: 103 | timesteps_since_eval = 0 104 | test_mean_reward = self.evaluate_policy(eval_episodes=self.cfg.dqn.eval_episodes) 105 | self.log("test_mean_reward", test_mean_reward, time_step=total_timesteps) 106 | self.log("test_episode_num", episode_num, time_step=total_timesteps) 107 | 108 | if self.cfg.support.save_models: 109 | self.ckp.save_object(self.actor.state_dict(), name="actor_state_dict") 110 | 111 | # Reset environment 112 | state = self.env.reset() 113 | episode_reward = 0 114 | episode_timesteps = 0 115 | episode_num += 1 116 | 117 | # Select action randomly or according to policy 118 | if total_timesteps < self.cfg.dqn.start_timesteps: 119 | action = self.env.action_space.sample() 120 | else: 121 | action = self.actor.act(state) 122 | 123 | next_state, reward, done, info = self.env.step(action) # Simulate one step in environment 124 | 125 | transition = Transition(torch.FloatTensor(state), torch.LongTensor([action]), 126 | torch.FloatTensor(next_state), torch.FloatTensor(np.array([reward])), 127 | torch.FloatTensor(np.array([done]).astype('uint8')) 128 | ) 129 | 
self.replay_memory.add(transition) 130 | 131 | state = next_state 132 | 133 | episode_reward += reward 134 | episode_timesteps += 1 135 | reset_timesteps += 1 136 | total_timesteps += 1 137 | timesteps_since_eval += 1 138 | 139 | # Final evaluation 140 | self.log("training end", time_step=total_timesteps) 141 | test_mean_reward = self.evaluate_policy(eval_episodes=self.cfg.dqn.eval_episodes) 142 | self.log("test_mean_reward", test_mean_reward, time_step=total_timesteps) 143 | if self.cfg.support.save_models: 144 | self.ckp.save_state_dict(self.actor.state_dict(), number=1) 145 | self.ckp.save_object(self.replay_memory.storage, name="replay_memory") 146 | 147 | def projection_distribution(self, next_state, rewards, dones): 148 | batch_size = next_state.size(0) 149 | 150 | delta_z = float(self.Vmax - self.Vmin) / (self.num_atoms - 1) 151 | support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms).to(device) 152 | 153 | next_dist = self.actor_target(next_state) * support 154 | next_action = next_dist.sum(2).max(1)[1] 155 | next_action = next_action.unsqueeze(1).unsqueeze(1).expand(next_dist.size(0), 1, next_dist.size(2)) 156 | next_dist = next_dist.gather(1, next_action).squeeze(1) 157 | 158 | rewards = rewards.unsqueeze(1).expand_as(next_dist) 159 | dones = dones.unsqueeze(1).expand_as(next_dist) 160 | support = support.unsqueeze(0).expand_as(next_dist) 161 | 162 | Tz = rewards + (1 - dones) * 0.99 * support 163 | Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) 164 | b = (Tz - self.Vmin) / delta_z 165 | l = b.floor().long() 166 | u = b.ceil().long() 167 | 168 | offset = torch.linspace(0, (batch_size - 1) * self.num_atoms, batch_size).long() \ 169 | .unsqueeze(1).expand(batch_size, self.num_atoms).to(device) 170 | 171 | proj_dist = torch.zeros(next_dist.size()).to(device) 172 | proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)) 173 | proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)) 174 | 175 | return proj_dist 176 | 177 | def train(self, iterations, reinit_target=False, reinit_optim=False): 178 | 179 | iterations = min(iterations, 10000) 180 | 181 | if reinit_target: 182 | self.actor_target.load_state_dict(self.actor.state_dict()) 183 | 184 | if reinit_optim: 185 | self.actor_optim = torch.optim.Adam(self.actor.parameters()) 186 | 187 | losses = [] 188 | 189 | for it in range(iterations): 190 | 191 | transition_list = self.replay_memory.sample(self.cfg.dqn.batch_size) 192 | 193 | state_list = [] 194 | action_batch = [] 195 | next_state_batch = [] 196 | reward_batch = [] 197 | done_batch = [] 198 | indexes = [] 199 | for transition in transition_list: 200 | state_list.append(transition.state) 201 | action_batch.append(transition.action) 202 | next_state_batch.append(transition.next_state) 203 | reward_batch.append(transition.reward) 204 | done_batch.append(transition.done) 205 | indexes.append(transition.index) 206 | 207 | state = torch.stack(state_list, dim=0).to(device) 208 | action = torch.stack(action_batch, dim=0).squeeze().to(device) 209 | next_state = torch.stack(next_state_batch, dim=0).to(device) 210 | reward = torch.stack(reward_batch, dim=0).squeeze().to(device) 211 | done = torch.stack(done_batch, dim=0).squeeze().to(device) 212 | 213 | with torch.no_grad(): 214 | proj_dist = self.projection_distribution(next_state, reward, done) 215 | 216 | dist = self.actor(state) 217 | action = action.unsqueeze(1).unsqueeze(1).expand(self.batch_size, 1, self.num_atoms) 218 | dist = dist.gather(1,
action).squeeze(1) 219 | dist.data.clamp_(0.01, 0.99) 220 | loss = -(proj_dist * dist.log()).sum(1) 221 | loss = loss.mean() 222 | 223 | self.actor_optim.zero_grad() 224 | loss.backward() 225 | self.actor_optim.step() 226 | 227 | if it % 5 == 0: 228 | self.actor.reset_noise() 229 | self.actor_target.reset_noise() 230 | 231 | losses.append(loss.detach().cpu().numpy()) 232 | 233 | if self.cfg.dqn.soft_update: 234 | if it % 2 == 0 and it != 0: 235 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 236 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) 237 | else: 238 | if (it % 1000 == 0 and it != 0) or it == (iterations - 1): 239 | self.actor_target.load_state_dict(self.actor.state_dict()) 240 | return np.mean(losses).tolist() 241 | 242 | 243 | def start_DQN_training(config, expt_dir): 244 | with Supporter(experiments_dir=expt_dir, config_dict=config, count_expt=True) as sup: 245 | cfg = sup.get_config() 246 | log = sup.get_logger() 247 | 248 | dqn = DQN(config=cfg, logger=log, checkpoint=sup.ckp) 249 | dqn.perform_learning() 250 | -------------------------------------------------------------------------------- /searl/rl_algorithms/td3.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import gym 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from searl.neuroevolution.components.envolvable_mlp import EvolvableMLP 9 | from searl.neuroevolution.training_td3 import get_optimizer 10 | from searl.neuroevolution.components.utils import to_tensor, Transition 11 | from searl.rl_algorithms.components.replay_memory import ReplayBuffer 12 | from searl.utils.supporter import Supporter 13 | 14 | 15 | class TD3(object): 16 | 17 | def __init__(self, config, logger, checkpoint): 18 | 19 | self.cfg = config 20 | self.log = logger 21 | self.ckp = checkpoint 22 | 23 | self.lr_rate = 0.001 24 | 25 | self.env = gym.make(self.cfg.env.name) 26 | self.cfg.set_attr("action_dim", self.env.action_space.shape[0]) 27 | self.cfg.set_attr("state_dim", self.env.observation_space.shape[0]) 28 | 29 | # Set seeds 30 | self.env.seed(seed=self.cfg.seed.env) 31 | torch.manual_seed(self.cfg.seed.torch) 32 | np.random.seed(self.cfg.seed.numpy) 33 | 34 | self.actor = EvolvableMLP(num_inputs=self.cfg.state_dim, num_outputs=self.cfg.action_dim, 35 | **self.cfg.actor.get_dict) 36 | 37 | self.actor_target = type(self.actor)(**self.actor.init_dict) 38 | self.actor_target.load_state_dict(self.actor.state_dict()) 39 | 40 | critic_1_config = copy.deepcopy(self.cfg.critic.get_dict) 41 | self.critic_1 = EvolvableMLP(num_inputs=self.cfg.state_dim + self.cfg.action_dim, num_outputs=1, 42 | **critic_1_config) 43 | self.critic_1_target = type(self.critic_1)(**self.critic_1.init_dict) 44 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 45 | if self.cfg.td3.double_q: 46 | critic_2_config = copy.deepcopy(self.cfg.critic.get_dict) 47 | self.critic_2 = EvolvableMLP(num_inputs=self.cfg.state_dim + self.cfg.action_dim, num_outputs=1, 48 | **critic_2_config) 49 | self.critic_2_target = type(self.critic_2)(**self.critic_2.init_dict) 50 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 51 | 52 | Opti = get_optimizer(self.cfg.td3.optimizer) 53 | self.actor_optim = Opti(self.actor.parameters(), lr=self.cfg.td3.lr_actor) 54 | self.critic_1_optim = Opti(self.critic_1.parameters(), lr=self.cfg.td3.lr_critic) 55 | if self.cfg.td3.double_q: 56 | 
self.critic_2_optim = Opti(self.critic_2.parameters(), lr=self.cfg.td3.lr_critic) 57 | 58 | self.replay_memory = ReplayBuffer(self.cfg.td3.rm_capacity) 59 | 60 | self.log.print_config(self.cfg) 61 | 62 | def evaluate_policy(self, eval_episodes): 63 | episode_reward_list = [] 64 | for _ in range(eval_episodes): 65 | 66 | state = self.env.reset() 67 | t_state = to_tensor(state).unsqueeze(0) 68 | done = False 69 | episode_reward = 0 70 | while not done: 71 | # Reset environment 72 | action = self.actor(t_state) 73 | action.clamp(-1, 1) # only for MuJoCo 74 | action = action.data.numpy() 75 | action = action.flatten() 76 | 77 | step_action = (action + 1) / 2 # [-1, 1] => [0, 1] 78 | step_action *= (self.env.action_space.high - self.env.action_space.low) 79 | step_action += self.env.action_space.low 80 | 81 | next_state, reward, done, info = self.env.step(step_action) # Simulate one step in environment 82 | t_state = to_tensor(next_state).unsqueeze(0) 83 | episode_reward += reward 84 | 85 | episode_reward_list.append(episode_reward) 86 | 87 | avg_reward = np.mean(episode_reward_list) 88 | 89 | self.log("---------------------------------------") 90 | self.log("Evaluation over %d episodes: %f" % (eval_episodes, avg_reward)) 91 | self.log("---------------------------------------") 92 | 93 | return avg_reward 94 | 95 | def select_action(self, state): 96 | state = torch.FloatTensor(state.reshape(1, -1)) 97 | return self.actor(state).cpu().data.numpy().flatten() 98 | 99 | def perform_learning(self): 100 | 101 | self.log("START LEARNING") 102 | 103 | total_timesteps = 0 104 | timesteps_since_eval = 0 105 | episode_num = 0 106 | done = True 107 | 108 | while total_timesteps < self.cfg.td3.max_timesteps: 109 | 110 | if done: 111 | 112 | if total_timesteps != 0: 113 | self.log("Start Training: Total timesteps: %d Episode Num: %d Episode T: %d Reward: %f" % ( 114 | total_timesteps, episode_num, episode_timesteps, episode_reward), time_step=total_timesteps) 115 | self.log("episode_reward", episode_reward, time_step=total_timesteps) 116 | self.train(episode_timesteps, reinit_optim=self.cfg.td3.recreate_optim, 117 | reinit_target=self.cfg.td3.reset_target, lr_rate=self.lr_rate) 118 | # Evaluate episode 119 | if timesteps_since_eval >= self.cfg.td3.eval_freq: 120 | timesteps_since_eval %= self.cfg.td3.eval_freq 121 | test_mean_reward = self.evaluate_policy(eval_episodes=self.cfg.td3.eval_episodes) 122 | self.log("test_mean_reward", test_mean_reward, time_step=total_timesteps) 123 | 124 | if self.cfg.support.save_models: 125 | self.ckp.save_object(self.actor.state_dict(), name="actor_state_dict") 126 | self.ckp.save_object(self.critic_1.state_dict(), name="critic_1_state_dict") 127 | 128 | # Reset environment 129 | state = self.env.reset() 130 | t_state = to_tensor(state).unsqueeze(0) 131 | done = False 132 | episode_reward = 0 133 | episode_timesteps = 0 134 | episode_num += 1 135 | 136 | # Select action randomly or according to policy 137 | if total_timesteps < self.cfg.td3.start_timesteps: 138 | action = self.env.action_space.sample() 139 | action = to_tensor(action) 140 | else: 141 | action = self.actor(t_state) 142 | action.clamp(-1, 1) # only for MuJoCo 143 | action = action.data.numpy() 144 | if self.cfg.td3.exploration_noise is not False: 145 | action += self.cfg.td3.exploration_noise * np.random.randn(self.cfg.action_dim) 146 | action = np.clip(action, -1, 1) 147 | action = action.flatten() 148 | 149 | step_action = (action + 1) / 2 # [-1, 1] => [0, 1] 150 | step_action *= (self.env.action_space.high 
- self.env.action_space.low) 151 | step_action += self.env.action_space.low 152 | 153 | next_state, reward, done, info = self.env.step(step_action) # Simulate one step in environment 154 | 155 | done_bool = 0 if episode_timesteps + 1 == self.env._max_episode_steps else float(done) 156 | 157 | t_next_state = to_tensor(next_state).unsqueeze(0) 158 | 159 | transition = Transition(state, action, next_state, np.array([reward]), 160 | np.array([done_bool]).astype('uint8')) 161 | self.replay_memory.add(transition) 162 | 163 | t_state = t_next_state 164 | state = next_state 165 | 166 | episode_reward += reward 167 | episode_timesteps += 1 168 | total_timesteps += 1 169 | timesteps_since_eval += 1 170 | 171 | # Final evaluation 172 | self.log("training end", time_step=total_timesteps) 173 | test_mean_reward = self.evaluate_policy(eval_episodes=self.cfg.td3.eval_episodes) 174 | self.log("test_mean_reward", test_mean_reward, time_step=total_timesteps) 175 | if self.cfg.support.save_models: 176 | self.ckp.save_state_dict(self.actor.state_dict(), number=1) 177 | self.ckp.save_state_dict(self.critic_1.state_dict(), number=2) 178 | if self.cfg.td3.double_q: 179 | self.ckp.save_state_dict(self.critic_2.state_dict(), number=3) 180 | self.ckp.save_object(self.replay_memory.storage, name="er_memory") 181 | 182 | def train(self, iterations, reinit_target=False, reinit_optim=False, lr_rate=0.001): 183 | 184 | if reinit_target: 185 | self.actor_target = type(self.actor)(**self.actor.init_dict) 186 | self.actor_target.load_state_dict(self.actor.state_dict()) 187 | 188 | self.critic_1_target = type(self.critic_1)(**self.critic_1.init_dict) 189 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 190 | 191 | self.critic_2_target = type(self.critic_2)(**self.critic_2.init_dict) 192 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 193 | 194 | if reinit_optim: 195 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr_rate) 196 | self.critic_1_optim = torch.optim.Adam(self.critic_1.parameters(), lr=lr_rate) 197 | self.critic_2_optim = torch.optim.Adam(self.critic_2.parameters(), lr=lr_rate) 198 | 199 | for it in range(iterations): 200 | 201 | transition_list = self.replay_memory.sample(self.cfg.td3.batch_size) 202 | 203 | state_list = [] 204 | action_batch = [] 205 | next_state_batch = [] 206 | reward_batch = [] 207 | done_batch = [] 208 | indexes = [] 209 | for transition in transition_list: 210 | state_list.append(torch.Tensor(transition.state)) 211 | action_batch.append(torch.Tensor(transition.action)) 212 | next_state_batch.append(torch.Tensor(transition.next_state)) 213 | reward_batch.append(torch.Tensor(transition.reward)) 214 | done_batch.append(torch.Tensor(transition.done)) 215 | indexes.append(transition.index) 216 | 217 | state = torch.stack(state_list, dim=0) 218 | action = torch.stack(action_batch, dim=0) 219 | next_state = torch.stack(next_state_batch, dim=0) 220 | reward = torch.stack(reward_batch, dim=0) 221 | done = 1 - torch.stack(done_batch, dim=0) 222 | 223 | with torch.no_grad(): 224 | noise = (torch.randn_like(action) * self.cfg.td3.td3_policy_noise).clamp(-self.cfg.td3.td3_noise_clip, 225 | self.cfg.td3.td3_noise_clip) 226 | next_action = (self.actor_target(next_state) + noise).clamp(-1, 1) 227 | target_Q1 = self.critic_1_target(torch.cat([next_state, next_action], 1)) 228 | target_Q2 = self.critic_2_target(torch.cat([next_state, next_action], 1)) 229 | target_Q = torch.min(target_Q1, target_Q2) 230 | target_Q = reward + (done * 
self.cfg.td3.gamma * target_Q) 231 | 232 | current_Q1 = self.critic_1(torch.cat([state, action], 1)) 233 | current_Q2 = self.critic_2(torch.cat([state, action], 1)) 234 | 235 | critic_loss_1 = F.mse_loss(current_Q1, target_Q) 236 | self.critic_1_optim.zero_grad() 237 | critic_loss_1.backward() 238 | for p in self.critic_1.parameters(): 239 | p.grad.data.clamp_(max=self.cfg.td3.clip_grad_norm) 240 | self.critic_1_optim.step() 241 | 242 | critic_loss_2 = F.mse_loss(current_Q2, target_Q) 243 | self.critic_2_optim.zero_grad() 244 | critic_loss_2.backward() 245 | for p in self.critic_2.parameters(): 246 | p.grad.data.clamp_(max=self.cfg.td3.clip_grad_norm) 247 | self.critic_2_optim.step() 248 | 249 | if it % self.cfg.td3.td3_update_freq == 0: 250 | 251 | actor_loss = -self.critic_1(torch.cat([state, self.actor(state)], 1)) 252 | 253 | actor_loss = torch.mean(actor_loss) 254 | 255 | self.actor_optim.zero_grad() 256 | actor_loss.backward() 257 | for p in self.actor.parameters(): 258 | p.grad.data.clamp_(max=self.cfg.td3.clip_grad_norm) 259 | self.actor_optim.step() 260 | 261 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 262 | target_param.data.copy_(self.cfg.td3.tau * param.data + (1 - self.cfg.td3.tau) * target_param.data) 263 | 264 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()): 265 | target_param.data.copy_(self.cfg.td3.tau * param.data + (1 - self.cfg.td3.tau) * target_param.data) 266 | 267 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()): 268 | target_param.data.copy_(self.cfg.td3.tau * param.data + (1 - self.cfg.td3.tau) * target_param.data) 269 | 270 | 271 | def start_TD3_training(config, expt_dir): 272 | with Supporter(experiments_dir=expt_dir, config_dict=config, count_expt=True) as sup: 273 | cfg = sup.get_config() 274 | log = sup.get_logger() 275 | 276 | env = gym.make(cfg.env.name) 277 | cfg.set_attr("action_dim", env.action_space.shape[0]) 278 | cfg.set_attr("state_dim", env.observation_space.shape[0]) 279 | 280 | td3 = TD3(config=cfg, logger=log, checkpoint=sup.ckp) 281 | 282 | td3.perform_learning() 283 | -------------------------------------------------------------------------------- /searl/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .handler.config import ConfigHandler 3 | from .handler.folder import FolderHandler -------------------------------------------------------------------------------- /searl/utils/handler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/utils/handler/__init__.py -------------------------------------------------------------------------------- /searl/utils/handler/base_handler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from datetime import datetime 4 | 5 | 6 | class Handler(): 7 | 8 | def __init__(self): 9 | pass 10 | 11 | def time_stamp(self) -> str: 12 | return datetime.utcnow().strftime('%Y-%m-%d_%H:%M:%S.%f')[:-4] 13 | 14 | def save_mkdir(self, dir): 15 | while not os.path.isdir(dir): 16 | try: 17 | os.mkdir(dir) 18 | except FileExistsError: 19 | pass 20 | 21 | def counting_name(self, dir, file_name, suffix=False): 22 | dir = pathlib.Path(dir) 23 | counter = 0 24 | split_file_name = file_name.split('.') 25 | if suffix: 26 | 
counting_file_name = '.'.join(split_file_name[:-1]) + f"-{counter}." + split_file_name[-1] 27 | else: 28 | counting_file_name = file_name + f"-{counter}" 29 | 30 | while os.path.isfile(dir / counting_file_name) or os.path.isdir(dir / counting_file_name): 31 | if suffix: 32 | counting_file_name = '.'.join(split_file_name[:-1]) + f"-{counter}." + split_file_name[-1] 33 | else: 34 | counting_file_name = file_name + f"-{counter}" 35 | counter += 1 36 | 37 | return counting_file_name 38 | -------------------------------------------------------------------------------- /searl/utils/handler/checkpoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | save and restore checkpoints including parameters, rng states and env/data states 3 | """ 4 | import pathlib 5 | import numpy as np 6 | import torch 7 | 8 | 9 | class CheckpointHandler(): 10 | 11 | def __init__(self, checkpoint_dir, ): 12 | self.dir = pathlib.Path(checkpoint_dir) 13 | 14 | def save_training(self, mode_state_dict, optimizer_state_dict, epoch=None, loss=None, number=0): 15 | torch.save({ 16 | 'epoch': epoch, 17 | 'model_state_dict': mode_state_dict, 18 | 'optimizer_state_dict': optimizer_state_dict, 19 | 'loss': loss, 20 | }, self.dir / f"training_{number}.tar") 21 | 22 | def load_training(self, number=0): 23 | checkpoint = torch.load(self.dir / f"training_{number}.tar") 24 | mode_state_dict = checkpoint['model_state_dict'] 25 | optimizer_state_dict = checkpoint['optimizer_state_dict'] 26 | epoch = checkpoint['epoch'] 27 | loss = checkpoint['loss'] 28 | return mode_state_dict, optimizer_state_dict, epoch, loss 29 | 30 | def save_model(self, model, number=0): 31 | torch.save(model, self.dir / f"model_{number}.pth") 32 | 33 | def load_model(self, number=0): 34 | model = torch.load(self.dir / f"model_{number}.pth") 35 | return model 36 | 37 | def save_state_dict(self, state_dict, number=0): 38 | torch.save(state_dict, self.dir / f"state_dict_{number}.pth") 39 | 40 | def load_state_dict(self, number=0, cpu=True): 41 | if cpu: 42 | state_dict = torch.load(self.dir / f"state_dict_{number}.pth", map_location=torch.device('cpu')) 43 | else: 44 | state_dict = torch.load(self.dir / f"state_dict_{number}.pth") 45 | return state_dict 46 | 47 | def save_object(self, object, name="object_0"): 48 | np.save(self.dir / f"{name}.npy", object, allow_pickle=True) 49 | 50 | def load_object(self, name="object_0"): 51 | return np.load(self.dir / f"{name}.npy", allow_pickle=True) 52 | -------------------------------------------------------------------------------- /searl/utils/handler/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | reads a yml config or a dict and safes it into experiment folder 3 | 4 | """ 5 | import os 6 | import pathlib 7 | import yaml 8 | 9 | from searl.utils.handler.base_handler import Handler 10 | 11 | 12 | class AttributeDict(Handler): 13 | def __init__(self, dictionary, name): 14 | super().__init__() 15 | 16 | for key in dictionary: 17 | if isinstance(dictionary[key], dict): 18 | if not hasattr(self, "sub_config"): 19 | self.sub_config = [] 20 | self.sub_config.append(key) 21 | setattr(self, key, AttributeDict(dictionary[key], key)) 22 | else: 23 | setattr(self, key, dictionary[key]) 24 | 25 | def __repr__(self): 26 | return str(self.__dict__) 27 | 28 | def __str__(self): 29 | return str(self.__dict__) 30 | 31 | @property 32 | def get_dict(self): 33 | return self.__dict__ 34 | 35 | def set_attr(self, name, value): 36 | if 
isinstance(value, pathlib.Path): 37 | value = value.as_posix() 38 | self.__setattr__(name, value) 39 | 40 | 41 | class ConfigHandler(AttributeDict): 42 | 43 | def __init__(self, config_dir=None, config_dict=None): 44 | 45 | if config_dir is None and config_dict is None: 46 | raise UserWarning("ConfigHandler: config_dir and config_dict is None") 47 | 48 | elif config_dir is not None and config_dict is None: 49 | with open(config_dir, 'r') as f: 50 | config_dict = yaml.load(f, Loader=yaml.Loader) 51 | 52 | super().__init__(config_dict, "main") 53 | 54 | self.check_experiment_config() 55 | 56 | def check_experiment_config(self): 57 | if not hasattr(self, "expt"): 58 | raise UserWarning(f"ConfigHandler: 'expt' config section is missing") 59 | else: 60 | for attr_name in ['project_name', 'session_name', 'experiment_name']: 61 | if not hasattr(self.expt, attr_name): 62 | raise UserWarning(f"ConfigHandler: {attr_name} is missing") 63 | elif isinstance(self.expt.__getattribute__(attr_name), str): 64 | self.expt.__setattr__(attr_name, str(self.expt.__getattribute__(attr_name))) 65 | 66 | def save_config(self, dir, file_name="config.yml"): 67 | dir = pathlib.Path(dir) 68 | self.save_mkdir(dir) 69 | if os.path.isfile(dir / file_name): 70 | file_name = self.counting_name(dir, file_name, suffix=True) 71 | with open(dir / file_name, 'w+') as f: 72 | config_dict = self.get_dict 73 | yaml.dump(config_dict, f, default_flow_style=False, encoding='utf-8') 74 | return dir / file_name 75 | -------------------------------------------------------------------------------- /searl/utils/handler/folder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handle the location, new folders and experiments sub-folder structure. 3 | 4 | base_dir / project / session / experiment 5 | 6 | experiment will be increased 7 | 8 | """ 9 | import pathlib 10 | 11 | from searl.utils.handler.base_handler import Handler 12 | 13 | 14 | class FolderHandler(Handler): 15 | 16 | def __init__(self, experiments_dir, project_name=None, session_name=None, experiment_name=None, count_expt=False): 17 | super().__init__() 18 | 19 | self.experiments_dir = pathlib.Path(experiments_dir) 20 | 21 | self.subfolder = ["log", "checkpoint", "config", "profile"] 22 | 23 | if project_name is not None: 24 | self.project_name = project_name 25 | self.session_name = session_name 26 | self.experiment_name = experiment_name 27 | self.count_expt = count_expt 28 | 29 | self.expt_dir = self.create_folders() 30 | else: 31 | self.expt_dir = self.experiments_dir 32 | 33 | def create_folders(self): 34 | 35 | dir = self.experiments_dir 36 | self.save_mkdir(dir) 37 | 38 | for folder in [self.project_name, self.session_name]: 39 | dir = dir / folder 40 | self.save_mkdir(dir) 41 | 42 | if self.count_expt: 43 | self.experiment_name = self.counting_name(dir, self.experiment_name) 44 | 45 | dir = dir / self.experiment_name 46 | self.save_mkdir(dir) 47 | 48 | for folder in self.subfolder: 49 | self.save_mkdir(dir / folder) 50 | 51 | return dir 52 | 53 | @property 54 | def dir(self): 55 | return self.expt_dir 56 | 57 | @property 58 | def config_dir(self): 59 | return self.expt_dir / "config" 60 | 61 | @property 62 | def log_dir(self): 63 | return self.expt_dir / "log" 64 | 65 | @property 66 | def profile_dir(self): 67 | return self.expt_dir / "profile" 68 | 69 | @property 70 | def checkpoint_dir(self): 71 | return self.expt_dir / "checkpoint" 72 | -------------------------------------------------------------------------------- 
/searl/utils/log/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/utils/log/__init__.py -------------------------------------------------------------------------------- /searl/utils/log/csv.py: -------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import csv 6 | from pathlib import Path 7 | from typing import Dict, List 8 | 9 | from searl.utils.handler.base_handler import Handler 10 | 11 | 12 | class LogCSV(Handler): 13 | 14 | def __init__(self, log_dir, file_name="train_log.csv"): 15 | super().__init__() 16 | 17 | self.log_dir = Path(log_dir) 18 | self.log_file = file_name 19 | 20 | def fieldnames(self, fieldnames_list: List): 21 | self.csv_columns = fieldnames_list 22 | 23 | with open(self.log_dir / self.log_file, 'a') as csvfile: 24 | writer = csv.DictWriter(csvfile, fieldnames=self.csv_columns) 25 | writer.writeheader() 26 | 27 | def log_csv(self, dict_data: Dict): 28 | 29 | dict_data["time_string"] = f"{self.time_stamp()}" 30 | 31 | for key in self.csv_columns: 32 | if key not in dict_data.keys(): 33 | dict_data[key] = None 34 | with open(self.log_dir / self.log_file, 'a') as csvfile: 35 | writer = csv.DictWriter(csvfile, fieldnames=self.csv_columns) 36 | writer.writerow(dict_data) 37 | -------------------------------------------------------------------------------- /searl/utils/log/json.py: -------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import json 6 | import os 7 | import time 8 | from pathlib import Path 9 | 10 | from searl.utils.handler.base_handler import Handler 11 | 12 | 13 | class LogJSON(Handler): 14 | 15 | def __init__(self, log_dir, file_name="json_log.json"): 16 | super().__init__() 17 | 18 | self.log_dir = Path(log_dir) 19 | 20 | self.file_name = file_name 21 | self.json_file = file_name 22 | 23 | def __enter__(self): 24 | self.open() 25 | return self 26 | 27 | def open(self): 28 | self.json_file = self.counting_name(self.log_dir, self.json_file, suffix=True) 29 | 30 | data = {"start": {'value': 0, 'time_step': None, 'time_stamp': self.time_stamp(), 'time': time.time()}} 31 | with open(self.log_dir / self.json_file, 'w+') as file: 32 | file.write(f"[ \n") 33 | file.write(json.dumps(data)) 34 | 35 | def __exit__(self, exc_type, exc_val, exc_tb): 36 | self.close() 37 | 38 | def close(self): 39 | with open(self.log_dir / self.json_file, 'a') as file: 40 | file.write(f"\n]") 41 | 42 | def jlog(self, key: str, value, time_step=None): 43 | data = {key: {'value': value, 'time_step': time_step, 'time_stamp': self.time_stamp(), 'time': time.time()}} 44 | with open(self.log_dir / self.json_file, 'a') as file: 45 | file.write(", \n") 46 | file.write(json.dumps(data)) 47 | 48 | def load_json(self): 49 | data_list = [] 50 | counter = 0 51 | counting_file_name = self.file_name.split('.')[0] + f"-{counter}" + self.file_name.split('.')[1] 52 | while os.path.isfile(self.log_dir / counting_file_name): 53 | with open(self.log_dir / counting_file_name, 'r') as file: 54 | data = json.load(file) 55 | data_list.append(data) 56 | 57 | counter += 1 58 | counting_file_name = self.file_name.split('.')[0] + f"-{counter}" + self.file_name.split('.')[1] 59 | 60 | return data_list 61 | -------------------------------------------------------------------------------- /searl/utils/log/logger.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import time 6 | 7 | import numpy as np 8 | 9 | from searl.utils.handler.base_handler import Handler 10 | from searl.utils.log.csv import LogCSV 11 | from searl.utils.log.json import LogJSON 12 | from searl.utils.log.pkl import LogPKL 13 | from searl.utils.log.txt import LogTXT 14 | 15 | 16 | class Logger(Handler): 17 | 18 | def __init__(self, log_dir): 19 | super().__init__() 20 | 21 | self.txt_logger = LogTXT(log_dir) 22 | self.pkl_logger = LogPKL(log_dir) 23 | self.json_logger = LogJSON(log_dir) 24 | self.csv = LogCSV(log_dir) 25 | 26 | self.timer = {} 27 | 28 | def __enter__(self): 29 | self.open() 30 | return self 31 | 32 | def open(self): 33 | self.json_logger.open() 34 | 35 | def __exit__(self, exc_type, exc_val, exc_tb): 36 | self.close() 37 | 38 | def __call__(self, key, value=None, time_step=None, print_log=True): 39 | self.log(key, value, time_step, print_log) 40 | 41 | def close(self): 42 | self.json_logger.close() 43 | 44 | def log(self, key, value=None, time_step=None, print_log=True): 45 | if value is None: 46 | if print_log: 47 | if time_step is None: 48 | self.txt_logger.log(key) 49 | else: 50 | self.txt_logger.log(f"{key}-step:{time_step}") 51 | self.json_logger.jlog(key="MSG", value=key, time_step=time_step) 52 | 53 | else: 54 | if time_step is None: 55 | self.txt_logger.log(f"{key}: {value}") 56 | else: 57 | self.txt_logger.log(f"{key}: {value} step:{time_step}") 58 | self.json_logger.jlog(key=key, value=value, time_step=time_step) 59 | 60 | def dump(self, key, value, time_step=None): 61 | self.pkl_logger.dump(key, value, time_step) 62 | 63 | def print_config(self, config, name="main"): 64 | if name == "main": 65 | self.log("#" * 20 + " CONFIG:") 66 | else: 67 | self.log(f"sub config {name:8}", 68 | np.unique([f"{attr} : {str(value)} " for attr, value in config.get_dict.items()]).tolist()) 69 | 70 | if hasattr(config, "sub_config"): 71 | for cfg in config.sub_config: 72 | self.print_config(getattr(config, cfg), cfg) 73 | 74 | def start_timer(self, name): 75 | self.log(f"##### {name}") 76 | self.timer[name] = time.time() 77 | 78 | def log_time(self, name): 79 | self.log(f"timer {name:8}", f"{time.time() - self.timer[name]:3.1f}s") 80 | 81 | def log_func(self, function, *args, **kwargs): 82 | self.start_timer(function.__name__) 83 | rslt = function(*args, **kwargs) 84 | self.log_time(function.__name__) 85 | return rslt 86 | 87 | def population_info(self, population_mean_fitness, population_var_fitness, population, num_frames, epoch): 88 | 89 | best_idx = np.argmax(population_mean_fitness) 90 | self.log("#### POPULATION INFO", epoch, time_step=num_frames) 91 | self.log('Population fitness', [ind.fitness[-1] for ind in population], time_step=num_frames) 92 | self.log('Population improve', [ind.improvement for ind in population], time_step=num_frames) 93 | self.log('Population var fit', [float(var) for var in population_var_fitness], time_step=num_frames) 94 | self.log('Actors hidden size ', [[int(s) for s in ind.actor_config['hidden_size']] for ind in population], 95 | time_step=num_frames) 96 | self.log('Mutation: ', [ind.train_log["mutation"] for ind in population], time_step=num_frames) 97 | self.log('mean_fitness', np.mean(population_mean_fitness), time_step=num_frames) 98 | self.log('best_fitness', population[best_idx].fitness[-1], time_step=num_frames) 99 | self.log('best_improve', population[best_idx].improvement, time_step=num_frames) 100 
| self.log('best rl config', population[best_idx].rl_config.__str__(), time_step=num_frames) 101 | self.log('Best Actors hidden size', [int(s) for s in population[best_idx].actor_config['hidden_size']], 102 | time_step=num_frames) 103 | -------------------------------------------------------------------------------- /searl/utils/log/pkl.py: -------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import os 6 | import pickle 7 | import time 8 | from pathlib import Path 9 | 10 | from searl.utils.handler.base_handler import Handler 11 | 12 | 13 | class LogPKL(Handler): 14 | 15 | def __init__(self, log_dir, file_name="value_dump.pkl"): 16 | super().__init__() 17 | 18 | self.log_dir = Path(log_dir) 19 | 20 | self.pickle_file = file_name 21 | 22 | def dump(self, key: str, value=None, time_step=None): 23 | if value: 24 | data = {"key": key, 'value': value, 'time_step': time_step, 'time_stamp': self.time_stamp(), 25 | 'time': time.time()} 26 | else: 27 | data = key 28 | with open(self.log_dir / self.pickle_file, 'ab') as f: 29 | pickle.dump(data, f) 30 | 31 | def check_dump(self): 32 | return os.path.isfile(self.log_dir / self.pickle_file) 33 | 34 | def load_pickle(self): 35 | data = [] 36 | with open(self.log_dir / self.pickle_file, 'rb') as f: 37 | while True: 38 | try: 39 | data.append(pickle.load(f)) 40 | except EOFError: 41 | break 42 | return data 43 | -------------------------------------------------------------------------------- /searl/utils/log/txt.py: -------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import os 6 | import sys 7 | from pathlib import Path 8 | 9 | from searl.utils.handler.base_handler import Handler 10 | 11 | 12 | class LogTXT(Handler): 13 | 14 | def __init__(self, log_dir, file_name="log_file.txt"): 15 | super().__init__() 16 | 17 | self.log_dir = Path(log_dir) 18 | self.log_file = file_name 19 | 20 | self.start_log() 21 | 22 | def start_log(self): 23 | if os.path.isfile(self.log_dir / self.log_file) and os.access(self.log_dir / self.log_file, os.R_OK): 24 | self.log("LOGGER: continue logging") 25 | else: 26 | with open(self.log_dir / self.log_file, 'w+') as file: 27 | file.write( 28 | f"{self.time_stamp()} LOGGER: start logging with Python version: {str(sys.version).split('(')[0]} \n") 29 | 30 | def log(self, string: str): 31 | timed_string = f"{self.time_stamp()} {string}" 32 | print(timed_string) 33 | with open(self.log_dir / self.log_file, 'a') as file: 34 | file.write(f"{timed_string} \n") 35 | -------------------------------------------------------------------------------- /searl/utils/supporter.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from searl.utils.handler.checkpoint import CheckpointHandler 4 | from searl.utils.handler.config import ConfigHandler 5 | from searl.utils.handler.folder import FolderHandler 6 | from searl.utils.log.logger import Logger 7 | 8 | 9 | class Supporter(): 10 | 11 | def __init__(self, experiments_dir=None, config_dir=None, config_dict=None, count_expt=False, reload_expt=False): 12 | 13 | if reload_expt: 14 | experiments_dir = pathlib.Path(experiments_dir) 15 | self.cfg = ConfigHandler(config_dir=experiments_dir / "config" / "config.yml", config_dict=None) 16 | self.folder = FolderHandler(experiments_dir) 17 | else: 18 | 19 | self.cfg = ConfigHandler(config_dir, config_dict) 20 | 21 | if experiments_dir is None and 
self.cfg.expt.experiments_dir is None: 22 | raise UserWarning("ConfigHandler: experiment_dir and config.expt.experiment_dir is None") 23 | elif experiments_dir is not None: 24 | self.cfg.expt.set_attr("experiments_dir", experiments_dir) 25 | else: 26 | experiments_dir = pathlib.Path(self.cfg.expt.experiments_dir) 27 | 28 | self.folder = FolderHandler(experiments_dir, self.cfg.expt.project_name, self.cfg.expt.session_name, 29 | self.cfg.expt.experiment_name, count_expt) 30 | self.cfg.save_config(self.folder.config_dir) 31 | 32 | self.logger = Logger(self.folder.log_dir) 33 | self.ckp = CheckpointHandler(self.folder.checkpoint_dir) 34 | 35 | self.logger.log("project_name", self.cfg.expt.project_name) 36 | self.logger.log("session_name", self.cfg.expt.session_name) 37 | self.logger.log("experiment_name", self.cfg.expt.experiment_name) 38 | 39 | def __enter__(self): 40 | self.logger.open() 41 | return self 42 | 43 | def __exit__(self, exc_type, exc_val, exc_tb): 44 | self.logger.close() 45 | 46 | def get_logger(self): 47 | return self.logger 48 | 49 | def get_config(self): 50 | return self.cfg 51 | 52 | def get_checkpoint_handler(self): 53 | return self.ckp 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | req_file = "requirements.txt" 6 | 7 | def parse_requirements(filename): 8 | lineiter = (line.strip() for line in open(filename)) 9 | return [line for line in lineiter if line and not line.startswith("#")] 10 | 11 | install_reqs = parse_requirements(req_file) 12 | 13 | setup(name='searl', 14 | version='latest', 15 | install_requires=install_reqs, 16 | dependency_links=[], 17 | ) 18 | --------------------------------------------------------------------------------