├── .gitignore ├── LICENSE ├── README.md ├── configs ├── dqn_config.yml ├── searl_dqn_config.yml ├── searl_td3_config.yml └── td3_config.yml ├── images └── searl.png ├── requirements.txt ├── scripts ├── __init__.py ├── run_dqn.py ├── run_searl_dqn.py ├── run_searl_td3.py └── run_td3.py ├── searl ├── __init__.py ├── neuroevolution │ ├── __init__.py │ ├── components │ │ ├── __init__.py │ │ ├── envolvable_cnn.py │ │ ├── envolvable_mlp.py │ │ ├── individual_dqn.py │ │ ├── individual_td3.py │ │ ├── replay_memory.py │ │ └── utils.py │ ├── evaluation_dqn.py │ ├── evaluation_td3.py │ ├── mutation_cnn.py │ ├── mutation_mlp.py │ ├── searl_dqn.py │ ├── searl_td3.py │ ├── tournament_selection.py │ ├── training_dqn.py │ └── training_td3.py ├── rl_algorithms │ ├── __init__.py │ ├── components │ │ ├── __init__.py │ │ ├── replay_memory.py │ │ └── wrappers.py │ ├── dqn.py │ └── td3.py └── utils │ ├── __init__.py │ ├── handler │ ├── __init__.py │ ├── base_handler.py │ ├── checkpoint.py │ ├── config.py │ └── folder.py │ ├── log │ ├── __init__.py │ ├── csv.py │ ├── json.py │ ├── logger.py │ ├── pkl.py │ └── txt.py │ └── supporter.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Experiment Folders 2 | experiments/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | venv* 93 | .venv* 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # default data folder 109 | data_babi/ 110 | data_cnn/ 111 | data_tmp/ 112 | 113 | # pycharm 114 | .idea/ 115 | 116 | # folder 117 | .experiments/* 118 | .experiment/* 119 | .tmp/ 120 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 |
180 |    To apply the Apache License to your work, attach the following
181 |    boilerplate notice, with the fields enclosed by brackets "[]"
182 |    replaced with your own identifying information. (Don't include
183 |    the brackets!) The text should be enclosed in the appropriate
184 |    comment syntax for the file format. We also recommend that a
185 |    file or class name and description of purpose be included on the
186 |    same "printed page" as the copyright notice for easier
187 |    identification within third-party archives.
188 |
189 |    Copyright [2021] [Jörg Franke]
190 |
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 |
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 |
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sample-Efficient Automated Deep Reinforcement Learning
2 |
3 | [![Python](https://img.shields.io/badge/python-3.6-yellow.svg)](https://www.python.org/downloads/release/python-3611/)
4 | [![PyTorch](https://img.shields.io/badge/PyTorch-1.6-yellow.svg)](https://pytorch.org/)
5 |
6 | This repository contains source code accompanying the ICLR 2021 publication:
7 |
8 | > [Sample-Efficient Automated Deep Reinforcement Learning](https://openreview.net/forum?id=hSjxQ3B7GWq) \
9 | > Jörg K.H. Franke, Gregor Koehler, André Biedenkapp, Frank Hutter \
10 | > In *Proceedings of the International Conference on Learning Representations (ICLR 2021)*, 2021.
11 |
12 | Sample-Efficient Automated Deep Reinforcement Learning (SEARL) jointly trains an off-policy RL agent and optimizes its
13 | hyperparameters, including the neural network architecture. SEARL uses a population of agents with different
14 | hyperparameters and an evolutionary outer loop (Figure below) similar to PBT. During evaluation, each
15 | agent/hyperparameter combination gets a fitness score, and the environment interactions are stored as transitions in a
16 | shared replay memory. Based on the fitness score, the best agent/hyperparameter combinations are selected, their
17 | hyperparameters are mutated, the agents are trained with samples from the shared replay memory, and they are evaluated
18 | again. The population in SEARL benefits from the diverse experience collected in the shared replay memory. SEARL trains
19 | an RL agent and jointly finds optimal hyperparameters and neural architectures with up to ten times fewer environment
20 | interactions than random search or PBT.
21 |
22 | ![searl algorithm](images/searl.png)
23 |
24 | SEARL allows training an agent while simultaneously and automatically tuning its hyperparameters with nearly the same
25 | number of environment steps. For a fair comparison, we introduce a new evaluation protocol that accounts for all
26 | environment interactions during an algorithm's HPO.
27 |
28 |
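To make the evolutionary outer loop described above concrete, here is a small, self-contained toy sketch. It is an editorial illustration, not code from this repository: the agents, the stubbed fitness, and all names are placeholders.

```python
import random

POPULATION_SIZE, GENERATIONS, TOURNAMENT_SIZE = 10, 5, 3

def random_agent():
    # An "agent" here is just a bag of hyperparameters plus a fitness slot.
    return {"lr": 10 ** random.uniform(-5, -3), "hidden": random.choice([64, 128, 256]), "fitness": None}

def evaluate(agent, replay_memory):
    # Stand-in for evaluation episodes: store the (fake) environment interactions
    # in the shared replay memory and return a fitness score.
    replay_memory.extend([("state", "action", "reward", "next_state", "done")] * 10)
    return random.random()

def mutate(agent):
    # Stand-in for SEARL's mutation operators (RL hyperparameters, architecture, weights).
    child = dict(agent)
    child["lr"] *= random.choice([0.5, 1.0, 2.0])
    return child

replay_memory = []                                      # shared by the whole population
population = [random_agent() for _ in range(POPULATION_SIZE)]

for generation in range(GENERATIONS):
    for agent in population:                            # evaluation fills the shared memory
        agent["fitness"] = evaluate(agent, replay_memory)
    parents = [max(random.sample(population, TOURNAMENT_SIZE), key=lambda a: a["fitness"])
               for _ in range(POPULATION_SIZE)]         # tournament selection
    population = [mutate(parent) for parent in parents]
    # RL training on samples from replay_memory would follow here before the next evaluation.
```

In the actual implementation, these steps live in `searl/neuroevolution/` (`evaluation_*.py`, `mutation_*.py`, `training_*.py`, `tournament_selection.py`), and the shared memory is the multiprocessing replay memory in `searl/neuroevolution/components/replay_memory.py`.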
29 | ## Usage
30 |
31 |
32 | ### Requirements
33 |
34 | The source code is tested on Linux with Python 3.6. Please install the `requirements.txt` and make sure you have
35 | MuJoCo version 2.0 binaries in your home folder as well as a valid licence.
36 | You can obtain a 30-day free trial licence on the [MuJoCo website](https://www.roboti.us/license.html).
37 | The license key will arrive in an email with your username and password. You can download the MuJoCo version 2.0
38 | binaries for [Linux](https://www.roboti.us/download/mujoco200_linux.zip). Please unzip the downloaded `mujoco200`
39 | directory into `~/.mujoco/mujoco200`, and place your license key (the `mjkey.txt` file from your email)
40 | at `~/.mujoco/mjkey.txt`.
41 |
42 | If you run into installation issues, please see [openai/mujoco-py](https://github.com/openai/mujoco-py), the
43 | [full documentation](https://openai.github.io/mujoco-py/build/html/index.html) or the
44 | [mujoco-py issues section](https://github.com/openai/mujoco-py/issues).
45 |
46 | ### Run experiments
47 |
48 | The `scripts` folder contains run scripts for the TD3 and DQN baselines as well as for the SEARL experiments on
49 | TD3 and DQN. By default, each script loads its experiment configuration from the `configs` folder. You can also
50 | use a custom config file or experiment directory via the `--config_file` and `--expt_dir` arguments.
51 |
52 | ## Cite
53 |
54 | If you use this code in your own work, please cite SEARL using the following bibtex entry:
55 |
56 | ```
57 | @inproceedings{franke2020searl,
58 |   title={Sample-Efficient Automated Deep Reinforcement Learning},
59 |   author={Franke, J{\"o}rg KH and K{\"o}hler, Gregor and Biedenkapp, Andr{\'e} and Hutter, Frank},
60 |   booktitle={International Conference on Learning Representations},
61 |   year={2021},
62 | }
63 | ```
64 |
--------------------------------------------------------------------------------
/configs/dqn_config.yml:
--------------------------------------------------------------------------------
1 | dqn:
2 |   optimizer: 'adam'
3 |   lr_actor: 0.0001
4 |   rm_capacity: 1000000
5 |   batch_size: 128
6 |   gamma: 0.99
7 |   soft_update: True
8 |
9 |   num_frames: 1000000
10 |   replay_initial: 10000
11 |   start_timesteps: 5000
12 |   eval_episodes: 10
13 |
14 |   eval_freq: 10000
15 |
16 |   reset_target: False
17 |   recreate_optim: False
18 |   min_eval_steps: 250
19 |
20 | seed:
21 |   numpy: 123
22 |   torch: 123
23 |   env: 123
24 |
25 | env:
26 |   name: 'PongNoFrameskip-v4' # 'FreewayNoFrameskip-v4', 'EnduroNoFrameskip-v4', 'BoxingNoFrameskip-v4', 'RoadRunnerNoFrameskip-v4',
27 |
28 | expt:
29 |   project_name: 'searl'
30 |   session_name: 'baseline'
31 |   experiment_name: 'default_dqn'
32 |
33 | support:
34 |   save_models: False
35 |
36 | actor:
37 |   channel_size: [32, 64, 64]
38 |   kernal_size: [8, 4, 3]
39 |   stride_size: [4, 2, 1]
40 |   hidden_size: [128]
41 |   num_atoms: 51
42 |   Vmin: -10
43 |   Vmax: 10
44 |   mlp_activation: "relu"
45 |   cnn_activation: "relu"
46 |   layer_norm: False
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/configs/searl_dqn_config.yml:
--------------------------------------------------------------------------------
1 | #######################################
2 | ### Logging Configuration ###
3 | #######################################
4 | expt:
5 |   project_name: "searl"
6 |   session_name: "neuroevolution"
7 |   experiment_name: "default_searl_dqn"
8 |
9 |
10 | #######################################
11 | ### NEVO Configuration ###
12 | #######################################
13 | nevo:
14 |   population_size: 10
15 |
tournament_size: 3 16 | selection: True 17 | mutation: True 18 | training: True 19 | elitism: True 20 | min_train_time: 250 21 | worker: 2 22 | reuse_batch: 1 23 | ind_memory: False 24 | init_random: False 25 | 26 | 27 | mutation: 28 | no_mutation: 0.2 29 | parameters: 0.2 30 | architecture: 0.2 31 | activation: 0.2 32 | rl_hyperparam: 0.2 33 | rl_hp_selection: ['lr_actor'] 34 | new_layer_prob: 0.2 35 | mutation_sd: 0.1 36 | 37 | 38 | train: 39 | replay_memory_size: 2000000 40 | num_frames: 2000000 41 | td3_double_q: False 42 | evo_warm_up: 1 43 | min_train_steps: 1000 44 | max_train_steps: 50000 45 | 46 | 47 | rl: 48 | train_frames_fraction: 0.5 # 5000 train_iternations 49 | gamma: 0.99 50 | soft_update: True 51 | tau: 0.005 52 | batch_size: 128 53 | lr_actor: 0.0001 54 | optimizer: "adam" ## ["adam", "adamax", "rmsprop", "sdg"] 55 | start_timesteps: 10000 56 | 57 | rm_capacity: 2000000 58 | 59 | num_frames: 50000000 60 | replay_initial: 10000 61 | eval_episodes: 10 62 | 63 | eval_freq: 10000 64 | 65 | reset_target: False 66 | recreate_optim: False 67 | min_eval_steps: 200 68 | 69 | num_atoms: 51 70 | Vmin: -10 71 | Vmax: 10 72 | 73 | 74 | seed: 75 | replay_memory: 123 76 | evaluation: 123 77 | mutation: 123 78 | training: 123 79 | torch: 123 80 | numpy: 123 81 | 82 | 83 | ####################################### 84 | ### Environment Configuration ### 85 | ####################################### 86 | env: 87 | name: 'PongNoFrameskip-v4' 88 | 89 | 90 | eval: 91 | eval_episodes: 1 92 | min_eval_steps: 250 93 | exploration_noise: 0.1 # Default 0.1 94 | test_episodes: 10 95 | test_seed: 123 96 | 97 | 98 | ####################################### 99 | ### Actor Starting Configuration ### 100 | ####################################### 101 | actor: 102 | channel_size: [32, 32] 103 | kernal_size: [8, 4] 104 | stride_size: [4, 2] 105 | hidden_size: [128] 106 | num_atoms: 51 107 | Vmin: -10 108 | Vmax: 10 109 | mlp_activation: "relu" 110 | cnn_activation: "relu" 111 | layer_norm: False 112 | 113 | -------------------------------------------------------------------------------- /configs/searl_td3_config.yml: -------------------------------------------------------------------------------- 1 | ####################################### 2 | ### Logging Configuration ### 3 | ####################################### 4 | expt: 5 | project_name: "searl" 6 | session_name: "neuroevolution" 7 | experiment_name: "default_searl_td3" 8 | 9 | 10 | ####################################### 11 | ### NEVO Configuration ### 12 | ####################################### 13 | nevo: 14 | population_size: 10 15 | tournament_size: 3 16 | selection: True 17 | mutation: True 18 | training: True 19 | elitism: True 20 | min_train_time: 200 21 | worker: 5 22 | reuse_batch: 1 23 | ind_memory: False 24 | init_random: False 25 | 26 | 27 | mutation: 28 | no_mutation: 0.2 29 | parameters: 0.2 30 | architecture: 0.2 31 | activation: 0.2 32 | rl_hyperparam: 0.2 33 | rl_hp_selection: ['lr_actor','lr_critic'] # 'train_frames_fraction','batch_size',,'td3_policy_noise','td3_update_freq', 'optimizer'] 34 | new_layer_prob: 0.2 35 | mutation_sd: 0.1 36 | 37 | 38 | train: 39 | replay_memory_size: 1000000 40 | num_frames: 2000000 41 | td3_double_q: True 42 | evo_warm_up: 1 43 | min_train_steps: 250 44 | 45 | 46 | rl: 47 | train_frames_fraction: 0.5 # 5000 train_iternations 48 | gamma: 0.99 49 | tau: 0.005 50 | batch_size: 100 51 | lr_actor: 0.001 52 | lr_critic: 0.001 53 | clip_grad_norm: 100 54 | td3_policy_noise: 0.2 # False or TD3 default: 0.2 55 | 
td3_noise_clip: 0.5 # default 0.5 56 | td3_update_freq: 2 # 1 or TD3 default: 2 57 | optimizer: "adam" ## ["adam", "adamax", "rmsprop", "sdg"] 58 | start_timesteps: 1 59 | 60 | 61 | seed: 62 | replay_memory: 123 63 | evaluation: 123 64 | mutation: 123 65 | training: 123 66 | torch: 123 67 | numpy: 123 68 | 69 | 70 | ####################################### 71 | ### Environment Configuration ### 72 | ####################################### 73 | env: 74 | name: 'Walker2d-v2' #'Walker2d-v2' #'HalfCheetah-v2' # HalfCheetah-v2' 75 | 76 | 77 | eval: 78 | eval_episodes: 1 79 | min_eval_steps: 250 80 | exploration_noise: 0.1 # Default 0.1 81 | test_episodes: 10 82 | test_seed: 123 83 | 84 | 85 | ####################################### 86 | ### Actor Starting Configuration ### 87 | ####################################### 88 | actor: 89 | hidden_size: [128] 90 | activation: 'relu' #'relu' , 'sigmoid' 'softplus', 91 | output_activation: 'tanh' 92 | layer_norm: True 93 | output_vanish: False 94 | 95 | ####################################### 96 | ### Critic Starting Configuration ### 97 | ####################################### 98 | critic: 99 | hidden_size: [128] 100 | activation: 'relu' #'relu' , 'sigmoid' 'softplus', 101 | output_activation: 'linear' 102 | layer_norm: True 103 | output_vanish: True 104 | -------------------------------------------------------------------------------- /configs/td3_config.yml: -------------------------------------------------------------------------------- 1 | td3: 2 | gamma: 0.99 3 | tau: 0.005 4 | lr_actor: 0.001 5 | lr_critic: 0.001 6 | batch_size: 100 7 | double_q: True 8 | clip_grad_norm: 100 9 | td3_policy_noise: 0.2 # False or TD3 default: 0.2 10 | td3_noise_clip: 0.5 # default 0.5 11 | td3_update_freq: 2 # 1 or TD3 default: 2 12 | optimizer: 'adam' 13 | rm_capacity: 1000000 14 | eval_freq: 5000 15 | start_timesteps: 10000 16 | exploration_noise: 0.1 17 | eval_episodes: 10 18 | max_timesteps: 2000000 19 | reset_target: False 20 | recreate_optim: False 21 | 22 | seed: 23 | numpy: 123 24 | torch: 123 25 | env: 123 26 | 27 | env: 28 | name: 'HalfCheetah-v2' 29 | 30 | expt: 31 | project_name: 'searl' 32 | session_name: 'baseline' 33 | experiment_name: 'default_td3' 34 | 35 | support: 36 | save_models: False 37 | 38 | actor: 39 | hidden_size: [128] 40 | activation: 'relu' # 'sigmoid' 'softplus', 41 | output_activation: 'tanh' 42 | layer_norm: True 43 | output_vanish: False 44 | 45 | critic: 46 | hidden_size: [128] 47 | activation: 'relu' # 'sigmoid' 'softplus' 48 | output_activation: 'linear' 49 | layer_norm: True 50 | output_vanish: True 51 | -------------------------------------------------------------------------------- /images/searl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/images/searl.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cffi==1.14.5 2 | cloudpickle==1.3.0 3 | Cython==0.29.22 4 | fasteners==0.16 5 | fastrand==1.3.0 6 | future==0.18.2 7 | glfw==2.1.0 8 | gym==0.17.1 9 | imageio==2.9.0 10 | mujoco-py==2.0.2.9 11 | numpy==1.20.1 12 | opencv-python==4.5.1.48 13 | Pillow==8.1.2 14 | pyaml==20.4.0 15 | pycparser==2.20 16 | pyglet==1.5.0 17 | PyYAML==5.4.1 18 | scipy==1.6.1 19 | six==1.15.0 20 | torch==1.6.0 21 | 
-------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/run_dqn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | from pathlib import Path 5 | 6 | from searl.rl_algorithms.dqn import start_DQN_training 7 | 8 | parser = argparse.ArgumentParser(description='define cluster setup') 9 | 10 | parser.add_argument('--expt_dir', type=str, default=False, help='expt_dir') 11 | parser.add_argument('--config_file', type=str, default=False, help='config_dir') 12 | args = parser.parse_args() 13 | 14 | if args.config_file == False: 15 | print("no config file") 16 | config_file = Path(os.getcwd()).parents[0] / "configs/dqn_config.yml" 17 | else: 18 | config_file = args.config_file 19 | 20 | if args.expt_dir == False: 21 | print("no experiment dir") 22 | expt_dir = Path(os.getcwd()).parents[0] / "experiments" 23 | else: 24 | expt_dir = args.expt_dir 25 | 26 | 27 | with open(config_file, 'r') as f: 28 | config_dict = yaml.load(f, Loader=yaml.Loader) 29 | 30 | start_DQN_training(config_dict, expt_dir=expt_dir) 31 | -------------------------------------------------------------------------------- /scripts/run_searl_dqn.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | from pathlib import Path 5 | 6 | from searl.neuroevolution.searl_dqn import start_searl_dqn_run 7 | 8 | parser = argparse.ArgumentParser(description='define cluster setup') 9 | 10 | parser.add_argument('--expt_dir', type=str, default=False, help='expt_dir') 11 | parser.add_argument('--config_file', type=str, default=False, help='config_dir') 12 | args = parser.parse_args() 13 | 14 | if args.config_file == False: 15 | print("no config file") 16 | config_file = Path(os.getcwd()).parents[0] / "configs/searl_dqn_config.yml" 17 | else: 18 | config_file = args.config_file 19 | 20 | if args.expt_dir == False: 21 | print("no experiment dir") 22 | expt_dir = Path(os.getcwd()).parents[0] / "experiments" 23 | else: 24 | expt_dir = args.expt_dir 25 | 26 | with open(config_file, 'r') as f: 27 | config_dict = yaml.load(f, Loader=yaml.Loader) 28 | 29 | start_searl_dqn_run(config_dict, expt_dir=expt_dir) 30 | -------------------------------------------------------------------------------- /scripts/run_searl_td3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | from pathlib import Path 5 | 6 | from searl.neuroevolution.searl_td3 import start_searl_td3_run 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser(description='define cluster setup') 10 | 11 | parser.add_argument('--expt_dir', type=str, default=False, help='expt_dir') 12 | parser.add_argument('--config_file', type=str, default=False, help='config_dir') 13 | args = parser.parse_args() 14 | 15 | if args.config_file == False: 16 | print("no config file") 17 | config_file = Path(os.getcwd()).parents[0] / "configs/searl_td3_config.yml" 18 | else: 19 | config_file = args.config_file 20 | 21 | if args.expt_dir == False: 22 | print("no experiment dir") 23 | expt_dir = Path(os.getcwd()).parents[0] / "experiments" 
24 | else: 25 | expt_dir = args.expt_dir 26 | 27 | os.environ["LD_LIBRARY_PATH"] = f"$LD_LIBRARY_PATH:{str(Path.home())}/.mujoco/mujoco200/bin:/usr/lib/nvidia-384" 28 | 29 | with open(config_file, 'r') as f: 30 | config_dict = yaml.load(f, Loader=yaml.Loader) 31 | 32 | start_searl_td3_run(config_dict, expt_dir=expt_dir) 33 | -------------------------------------------------------------------------------- /scripts/run_td3.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import os 4 | from pathlib import Path 5 | 6 | from searl.rl_algorithms.td3 import start_TD3_training 7 | 8 | parser = argparse.ArgumentParser(description='define cluster setup') 9 | 10 | parser.add_argument('--expt_dir', type=str, default=False, help='expt_dir') 11 | parser.add_argument('--config_file', type=str, default=False, help='config_dir') 12 | args = parser.parse_args() 13 | 14 | if args.config_file == False: 15 | print("no config file") 16 | config_file = Path(os.getcwd()).parents[0] / "configs/td3_config.yml" 17 | else: 18 | config_file = args.config_file 19 | 20 | if args.expt_dir == False: 21 | print("no experiment dir") 22 | expt_dir = Path(os.getcwd()).parents[0] / "experiments" 23 | else: 24 | expt_dir = args.expt_dir 25 | 26 | os.environ["LD_LIBRARY_PATH"] = f"$LD_LIBRARY_PATH:{str(Path.home())}/.mujoco/mujoco200/bin:/usr/lib/nvidia-384" 27 | 28 | with open(config_file, 'r') as f: 29 | config_dict = yaml.load(f, Loader=yaml.Loader) 30 | 31 | start_TD3_training(config_dict, expt_dir=expt_dir) 32 | -------------------------------------------------------------------------------- /searl/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils.handler.config import ConfigHandler -------------------------------------------------------------------------------- /searl/neuroevolution/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/neuroevolution/__init__.py -------------------------------------------------------------------------------- /searl/neuroevolution/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/neuroevolution/components/__init__.py -------------------------------------------------------------------------------- /searl/neuroevolution/components/envolvable_cnn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import math 3 | from collections import OrderedDict 4 | from typing import List 5 | 6 | import numpy as np 7 | import torch 8 | import torch.autograd as autograd 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | 13 | class NoisyLinear(nn.Module): 14 | def __init__(self, in_features, out_features, std_init=0.4): 15 | super(NoisyLinear, self).__init__() 16 | 17 | self.in_features = in_features 18 | self.out_features = out_features 19 | self.std_init = std_init 20 | 21 | self.weight_mu = nn.Parameter(torch.FloatTensor(out_features, in_features)) 22 | self.weight_sigma = nn.Parameter(torch.FloatTensor(out_features, in_features)) 23 | self.register_buffer('weight_epsilon', torch.FloatTensor(out_features, in_features)) 24 | 25 | self.bias_mu = nn.Parameter(torch.FloatTensor(out_features)) 26 | self.bias_sigma = 
nn.Parameter(torch.FloatTensor(out_features)) 27 | self.register_buffer('bias_epsilon', torch.FloatTensor(out_features)) 28 | 29 | self.reset_parameters() 30 | self.reset_noise() 31 | 32 | def forward(self, x): 33 | 34 | weight_epsilon = self.weight_epsilon.to(x.device) 35 | bias_epsilon = self.bias_epsilon.to(x.device) 36 | 37 | if self.training: 38 | weight = self.weight_mu + self.weight_sigma.mul(weight_epsilon) 39 | bias = self.bias_mu + self.bias_sigma.mul(bias_epsilon) 40 | else: 41 | weight = self.weight_mu 42 | bias = self.bias_mu 43 | 44 | return F.linear(x, weight, bias) 45 | 46 | def reset_parameters(self): 47 | mu_range = 1 / math.sqrt(self.weight_mu.size(1)) 48 | 49 | self.weight_mu.data.uniform_(-mu_range, mu_range) 50 | self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.weight_sigma.size(1))) 51 | 52 | self.bias_mu.data.uniform_(-mu_range, mu_range) 53 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.bias_sigma.size(0))) 54 | 55 | def reset_noise(self): 56 | epsilon_in = self._scale_noise(self.in_features) 57 | epsilon_out = self._scale_noise(self.out_features) 58 | 59 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 60 | self.bias_epsilon.copy_(epsilon_out) 61 | 62 | def _scale_noise(self, size): 63 | x = torch.randn(size) 64 | x = x.sign().mul(x.abs().sqrt()) 65 | return x 66 | 67 | 68 | class EvolvableCnnDQN(nn.Module): 69 | 70 | def __init__(self, input_shape: List[int], 71 | channel_size: List[int], 72 | kernal_size: List[int], 73 | stride_size: List[int], 74 | hidden_size: List[int], 75 | num_actions: int, 76 | num_atoms: int, 77 | Vmin: int, 78 | Vmax: int, 79 | mlp_activation='relu', 80 | cnn_activation='relu', 81 | layer_norm=False, stored_values=None, device="cpu"): 82 | 83 | super(EvolvableCnnDQN, self).__init__() 84 | 85 | self.input_shape = input_shape 86 | self.channel_size = channel_size 87 | self.kernal_size = kernal_size 88 | self.stride_size = stride_size 89 | self.hidden_size = hidden_size 90 | self.num_actions = num_actions 91 | self.num_atoms = num_atoms 92 | self.Vmin = Vmin 93 | self.Vmax = Vmax 94 | self.mlp_activation = mlp_activation 95 | self.cnn_activation = cnn_activation 96 | self.layer_norm = layer_norm 97 | self.device = device 98 | 99 | self.net = self.create_nets() 100 | self.feature_net, self.value_net, self.advantage_net = self.create_nets() 101 | 102 | if stored_values is not None: 103 | self.inject_parameters(pvec=stored_values, without_layer_norm=False) 104 | 105 | def get_activation(self, activation_names): 106 | 107 | activation_functions = {'tanh': nn.Tanh, 'gelu': nn.GELU, 'relu': nn.ReLU, 'elu': nn.ELU, 108 | 'softsign': nn.Softsign, 'sigmoid': nn.Sigmoid, 'softplus': nn.Softplus, 109 | 'lrelu': nn.LeakyReLU, 'prelu': nn.PReLU, } 110 | return activation_functions[activation_names]() 111 | 112 | def create_mlp(self, input_size, output_size, hidden_size, name): 113 | 114 | net_dict = OrderedDict() 115 | 116 | net_dict[f"{name}_linear_layer_0"] = NoisyLinear(input_size, hidden_size[0]) 117 | if self.layer_norm: 118 | net_dict[f"{name}_layer_norm_0"] = nn.LayerNorm(hidden_size[0]) 119 | net_dict[f"{name}_activation_0"] = self.get_activation(self.mlp_activation) 120 | 121 | if len(hidden_size) > 1: 122 | for l_no in range(1, len(hidden_size)): 123 | net_dict[f"{name}_linear_layer_{str(l_no)}"] = NoisyLinear(hidden_size[l_no - 1], hidden_size[l_no]) 124 | if self.layer_norm: 125 | net_dict[f"{name}_layer_norm_{str(l_no)}"] = nn.LayerNorm(hidden_size[l_no]) 126 | net_dict[f"{name}_activation_{str(l_no)}"] = 
self.get_activation(self.mlp_activation) 127 | net_dict[f"{name}_linear_layer_output"] = NoisyLinear(hidden_size[-1], output_size) 128 | return nn.Sequential(net_dict) 129 | 130 | def create_cnn(self, input_size, channel_size, kernal_size, stride_size, name): 131 | 132 | net_dict = OrderedDict() 133 | 134 | net_dict[f"{name}_conv_layer_0"] = nn.Conv2d(in_channels=input_size, out_channels=channel_size[0], 135 | kernel_size=kernal_size[0], 136 | stride=stride_size[0]) 137 | if self.layer_norm: 138 | net_dict[f"{name}_layer_norm_0"] = nn.BatchNorm2d(channel_size[0]) 139 | net_dict[f"{name}_activation_0"] = self.get_activation(self.cnn_activation) 140 | 141 | if len(channel_size) > 1: 142 | for l_no in range(1, len(channel_size)): 143 | net_dict[f"{name}_conv_layer_{str(l_no)}"] = nn.Conv2d(in_channels=channel_size[l_no - 1], 144 | out_channels=channel_size[l_no], 145 | kernel_size=kernal_size[l_no], 146 | stride=stride_size[l_no]) 147 | if self.layer_norm: 148 | net_dict[f"{name}_layer_norm_{str(l_no)}"] = nn.BatchNorm2d(channel_size[l_no]) 149 | net_dict[f"{name}_activation_{str(l_no)}"] = self.get_activation(self.cnn_activation) 150 | 151 | return nn.Sequential(net_dict) 152 | 153 | def create_nets(self): 154 | 155 | feature_net = self.create_cnn(self.input_shape[0], self.channel_size, self.kernal_size, self.stride_size, 156 | name="feature") 157 | 158 | input_size = feature_net(autograd.Variable(torch.zeros(1, *self.input_shape))).view(1, -1).size(1) 159 | 160 | value_net = self.create_mlp(input_size, output_size=self.num_atoms, hidden_size=self.hidden_size, name="value") 161 | advantage_net = self.create_mlp(input_size, output_size=self.num_atoms * self.num_actions, 162 | hidden_size=self.hidden_size, 163 | name="adcantage") 164 | 165 | feature_net.to(self.device) 166 | value_net.to(self.device) 167 | advantage_net.to(self.device) 168 | 169 | return feature_net, value_net, advantage_net 170 | 171 | def reset_noise(self): 172 | for l in self.value_net: 173 | if isinstance(l, NoisyLinear): 174 | l.reset_noise() 175 | for l in self.advantage_net: 176 | if isinstance(l, NoisyLinear): 177 | l.reset_noise() 178 | 179 | def forward(self, x): 180 | if not isinstance(x, torch.Tensor): 181 | x = torch.FloatTensor(x) 182 | 183 | batch_size = x.size(0) 184 | x = x / 255. 
185 | 186 | x = self.feature_net(x) 187 | x = x.view(batch_size, -1) 188 | 189 | value = self.value_net(x) 190 | advantage = self.advantage_net(x) 191 | 192 | value = value.view(batch_size, 1, self.num_atoms) 193 | advantage = advantage.view(batch_size, self.num_actions, self.num_atoms) 194 | 195 | x = value + advantage - advantage.mean(1, keepdim=True) 196 | x = F.softmax(x.view(-1, self.num_atoms), dim=-1).view(-1, self.num_actions, self.num_atoms) 197 | 198 | return x 199 | 200 | def act(self, state): 201 | 202 | if not isinstance(state, torch.Tensor): 203 | state = torch.FloatTensor(np.float32(state)).unsqueeze(0) 204 | 205 | state = state.to(self.device) 206 | 207 | dist = self.forward(state).data.cpu() 208 | dist = dist * torch.linspace(self.Vmin, self.Vmax, self.num_atoms) 209 | action = dist.sum(2).max(1)[1].numpy()[0] 210 | return action 211 | 212 | @property 213 | def short_dict(self): 214 | short_dict = {"channel_size": self.channel_size, "kernal_size": self.kernal_size, 215 | "stride_size": self.stride_size, "hidden_size": self.hidden_size, 216 | "num_atoms": self.num_atoms, 217 | "Vmin": self.Vmin, "Vmax": self.Vmax, 218 | "mlp_activation": self.mlp_activation, "cnn_activation": self.cnn_activation, 219 | "layer_norm": self.layer_norm} 220 | return short_dict 221 | 222 | @property 223 | def init_dict(self): 224 | initdict = {"input_shape": self.input_shape, "channel_size": self.channel_size, "kernal_size": self.kernal_size, 225 | "stride_size": self.stride_size, "hidden_size": self.hidden_size, 226 | "num_actions": self.num_actions, "num_atoms": self.num_atoms, 227 | "Vmin": self.Vmin, "Vmax": self.Vmax, 228 | "mlp_activation": self.mlp_activation, "cnn_activation": self.cnn_activation, 229 | "layer_norm": self.layer_norm, "device": self.device} 230 | return initdict 231 | 232 | def get_model_dict(self): 233 | 234 | model_dict = self.init_dict 235 | model_dict.update({'stored_values': self.extract_parameters(without_layer_norm=False)}) 236 | return model_dict 237 | 238 | def count_parameters(self, without_layer_norm=False): 239 | count = 0 240 | for name, param in self.named_parameters(): 241 | if not without_layer_norm or not 'layer_norm' in name: 242 | count += param.data.cpu().numpy().flatten().shape[0] 243 | return count 244 | 245 | def extract_grad(self, without_layer_norm=False): 246 | tot_size = self.count_parameters(without_layer_norm) 247 | pvec = np.zeros(tot_size, np.float32) 248 | count = 0 249 | for name, param in self.named_parameters(): 250 | if not without_layer_norm or not 'layer_norm' in name: 251 | sz = param.grad.data.cpu().numpy().flatten().shape[0] 252 | pvec[count:count + sz] = param.grad.data.cpu().numpy().flatten() 253 | count += sz 254 | return pvec.copy() 255 | 256 | def extract_parameters(self, without_layer_norm=False): 257 | tot_size = self.count_parameters(without_layer_norm) 258 | pvec = np.zeros(tot_size, np.float32) 259 | count = 0 260 | for name, param in self.named_parameters(): 261 | if not without_layer_norm or not 'layer_norm' in name: 262 | sz = param.data.cpu().detach().numpy().flatten().shape[0] 263 | pvec[count:count + sz] = param.data.cpu().detach().numpy().flatten() 264 | count += sz 265 | return copy.deepcopy(pvec) 266 | 267 | def inject_parameters(self, pvec, without_layer_norm=False): 268 | count = 0 269 | 270 | for name, param in self.named_parameters(): 271 | if not without_layer_norm or not 'layer_norm' in name: 272 | sz = param.data.cpu().numpy().flatten().shape[0] 273 | raw = pvec[count:count + sz] 274 | reshaped = 
raw.reshape(param.data.cpu().numpy().shape) 275 | param.data = torch.from_numpy(copy.deepcopy(reshaped)).type(torch.FloatTensor) 276 | count += sz 277 | return pvec 278 | 279 | def add_mlp_layer(self): 280 | if len(self.hidden_size) < 3: # HARD LIMIT 281 | self.hidden_size += [self.hidden_size[-1]] 282 | 283 | self.recreate_nets() 284 | else: 285 | self.add_mlp_node() 286 | 287 | def add_mlp_node(self, hidden_layer=None, numb_new_nodes=None): 288 | if hidden_layer is None: 289 | hidden_layer = np.random.randint(0, len(self.hidden_size), 1)[0] 290 | else: 291 | hidden_layer = min(hidden_layer, len(self.hidden_size) - 1) 292 | if numb_new_nodes is None: 293 | numb_new_nodes = np.random.choice([32, 64, 128], 1)[0] 294 | 295 | if self.hidden_size[hidden_layer] + numb_new_nodes <= 1024: # HARD LIMIT 296 | 297 | self.hidden_size[hidden_layer] += numb_new_nodes 298 | 299 | self.recreate_nets() 300 | return {"hidden_layer": hidden_layer, "numb_new_nodes": numb_new_nodes} 301 | 302 | def add_cnn_layer(self): 303 | if len(self.channel_size) < 6: # HARD LIMIT 304 | self.channel_size += [self.channel_size[-1]] 305 | self.kernal_size += [3] 306 | 307 | stride_size_list = [[4], [4, 2], [4, 2, 1], [2, 2, 2, 1], [2, 1, 2, 1, 2], [2, 1, 2, 1, 2, 1]] 308 | self.stride_size = stride_size_list[len(self.channel_size) - 1] 309 | 310 | self.recreate_nets() 311 | else: 312 | self.add_cnn_channel() 313 | 314 | def change_cnn_kernal(self): 315 | if len(self.channel_size) > 1: 316 | hidden_layer = np.random.randint(1, min(4, len(self.channel_size)), 1)[0] 317 | self.kernal_size[hidden_layer] = np.random.choice([3, 4, 5, 7]) 318 | 319 | self.recreate_nets() 320 | else: 321 | self.add_cnn_layer() 322 | 323 | def add_cnn_channel(self, hidden_layer=None, numb_new_channels=None): 324 | 325 | if hidden_layer is None: 326 | hidden_layer = np.random.randint(0, len(self.channel_size), 1)[0] 327 | else: 328 | hidden_layer = min(hidden_layer, len(self.channel_size) - 1) 329 | if numb_new_channels is None: 330 | numb_new_nodes = np.random.choice([8, 16, 32], 1)[0] 331 | 332 | if self.channel_size[hidden_layer] + numb_new_nodes <= 256: # HARD LIMIT 333 | 334 | self.channel_size[hidden_layer] += numb_new_nodes 335 | 336 | self.recreate_nets() 337 | 338 | return {"hidden_layer": hidden_layer, "numb_new_channels": numb_new_channels} 339 | 340 | def recreate_nets(self): 341 | new_feature_net, new_value_net, new_advantage_net = self.create_nets() 342 | new_feature_net = self.preserve_parameters(old_net=self.feature_net, new_net=new_feature_net) 343 | new_value_net = self.preserve_parameters(old_net=self.value_net, new_net=new_value_net) 344 | new_advantage_net = self.preserve_parameters(old_net=self.advantage_net, new_net=new_advantage_net) 345 | self.feature_net, self.value_net, self.advantage_net = new_feature_net, new_value_net, new_advantage_net 346 | 347 | def clone(self): 348 | clone = EvolvableCnnDQN(**copy.deepcopy(self.init_dict)) 349 | clone.load_state_dict(self.state_dict()) 350 | return clone 351 | 352 | def preserve_parameters(self, old_net, new_net): 353 | 354 | old_net_dict = dict(old_net.named_parameters()) 355 | 356 | for key, param in new_net.named_parameters(): 357 | if key in old_net_dict.keys(): 358 | if old_net_dict[key].data.size() == param.data.size(): 359 | param.data = old_net_dict[key].data 360 | else: 361 | if not "norm" in key: 362 | old_size = old_net_dict[key].data.size() 363 | new_size = param.data.size() 364 | if len(param.data.size()) == 1: 365 | param.data[:min(old_size[0], new_size[0])] = 
old_net_dict[key].data[ 366 | :min(old_size[0], new_size[0])] 367 | elif len(param.data.size()) == 2: 368 | param.data[:min(old_size[0], new_size[0]), :min(old_size[1], new_size[1])] = old_net_dict[ 369 | key].data[ 370 | :min(old_size[ 371 | 0], 372 | new_size[ 373 | 0]), 374 | :min(old_size[ 375 | 1], 376 | new_size[ 377 | 1])] 378 | else: 379 | param.data[:min(old_size[0], new_size[0]), :min(old_size[1], new_size[1]), 380 | :min(old_size[2], new_size[2]), 381 | :min(old_size[3], new_size[3])] = old_net_dict[key].data[ 382 | :min(old_size[0], new_size[0]), 383 | :min(old_size[1], new_size[1]), 384 | :min(old_size[2], new_size[2]), 385 | :min(old_size[3], new_size[3]), 386 | ] 387 | 388 | return new_net 389 | 390 | def shrink_preserve_parameters(self, old_net, new_net): 391 | 392 | old_net_dict = dict(old_net.named_parameters()) 393 | 394 | for key, param in new_net.named_parameters(): 395 | if key in old_net_dict.keys(): 396 | if old_net_dict[key].data.size() == param.data.size(): 397 | param.data = old_net_dict[key].data 398 | else: 399 | if not "norm" in key: 400 | old_size = old_net_dict[key].data.size() 401 | new_size = param.data.size() 402 | min_0 = min(old_size[0], new_size[0]) 403 | if len(param.data.size()) == 1: 404 | param.data[:min_0] = old_net_dict[key].data[:min_0] 405 | else: 406 | min_1 = min(old_size[1], new_size[1]) 407 | param.data[:min_0, :min_1] = old_net_dict[key].data[:min_0, :min_1] 408 | return new_net 409 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/envolvable_mlp.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from collections import OrderedDict 3 | from typing import List 4 | 5 | import numpy as np 6 | import torch 7 | import torch.nn as nn 8 | 9 | 10 | class EvolvableMLP(nn.Module): 11 | def __init__(self, num_inputs: int, num_outputs: int, hidden_size: List[int], activation='relu', 12 | output_activation=None, layer_norm=False, output_vanish=True, stored_values=None): 13 | super(EvolvableMLP, self).__init__() 14 | 15 | self.num_inputs = num_inputs 16 | self.num_outputs = num_outputs 17 | self.activation = activation 18 | self.output_activation = output_activation 19 | self.layer_norm = layer_norm 20 | self.output_vanish = output_vanish 21 | 22 | self.hidden_size = hidden_size 23 | 24 | self.net = self.create_net() 25 | 26 | if stored_values is not None: 27 | self.inject_parameters(pvec=stored_values, without_layer_norm=False) 28 | 29 | def get_activation(self, activation_names): 30 | activation_functions = {'tanh': nn.Tanh, 'linear': nn.Identity, 'relu': nn.ReLU, 'elu': nn.ELU, 31 | 'softsign': nn.Softsign, 'sigmoid': nn.Sigmoid, 'softplus': nn.Softplus, 32 | 'lrelu': nn.LeakyReLU, 'prelu': nn.PReLU, } 33 | 34 | return activation_functions[activation_names]() 35 | 36 | def create_net(self): 37 | net_dict = OrderedDict() 38 | 39 | net_dict["linear_layer_0"] = nn.Linear(self.num_inputs, self.hidden_size[0]) 40 | if self.layer_norm: 41 | net_dict["layer_norm_0"] = nn.LayerNorm(self.hidden_size[0]) 42 | net_dict["activation_0"] = self.get_activation(self.activation) 43 | 44 | if len(self.hidden_size) > 1: 45 | for l_no in range(1, len(self.hidden_size)): 46 | net_dict[f"linear_layer_{str(l_no)}"] = nn.Linear(self.hidden_size[l_no - 1], self.hidden_size[l_no]) 47 | if self.layer_norm: 48 | net_dict[f"layer_norm_{str(l_no)}"] = nn.LayerNorm(self.hidden_size[l_no]) 49 | net_dict[f"activation_{str(l_no)}"] = 
self.get_activation(self.activation) 50 | 51 | output_layer = nn.Linear(self.hidden_size[-1], self.num_outputs) 52 | 53 | if self.output_vanish: 54 | output_layer.weight.data.mul_(0.1) 55 | output_layer.bias.data.mul_(0.1) 56 | 57 | net_dict[f"linear_layer_output"] = output_layer 58 | if self.output_activation is not None: 59 | net_dict[f"activation_output"] = self.get_activation(self.output_activation) 60 | 61 | return nn.Sequential(net_dict) 62 | 63 | def forward(self, x): 64 | if not isinstance(x, torch.Tensor): 65 | x = torch.FloatTensor(x) 66 | 67 | for value in self.net: 68 | x = value(x) 69 | return x 70 | 71 | def get_model_dict(self): 72 | 73 | model_dict = self.init_dict 74 | model_dict.update({'stored_values': self.extract_parameters(without_layer_norm=False)}) 75 | return model_dict 76 | 77 | def count_parameters(self, without_layer_norm=False): 78 | count = 0 79 | for name, param in self.named_parameters(): 80 | if not without_layer_norm or not 'layer_norm' in name: 81 | count += param.data.cpu().numpy().flatten().shape[0] 82 | return count 83 | 84 | # function to return current pytorch gradient in same order as genome's flattened parameter vector 85 | def extract_grad(self, without_layer_norm=False): 86 | tot_size = self.count_parameters(without_layer_norm) 87 | pvec = np.zeros(tot_size, np.float32) 88 | count = 0 89 | for name, param in self.named_parameters(): 90 | if not without_layer_norm or not 'layer_norm' in name: 91 | sz = param.grad.data.cpu().numpy().flatten().shape[0] 92 | pvec[count:count + sz] = param.grad.data.cpu().numpy().flatten() 93 | count += sz 94 | return pvec.copy() 95 | 96 | # function to grab current flattened neural network weights 97 | def extract_parameters(self, without_layer_norm=False): 98 | tot_size = self.count_parameters(without_layer_norm) 99 | pvec = np.zeros(tot_size, np.float32) 100 | count = 0 101 | for name, param in self.named_parameters(): 102 | if not without_layer_norm or not 'layer_norm' in name: 103 | sz = param.data.cpu().detach().numpy().flatten().shape[0] 104 | pvec[count:count + sz] = param.data.cpu().detach().numpy().flatten() 105 | count += sz 106 | return copy.deepcopy(pvec) 107 | 108 | # function to inject a flat vector of ANN parameters into the model's current neural network weights 109 | def inject_parameters(self, pvec, without_layer_norm=False): 110 | count = 0 111 | 112 | for name, param in self.named_parameters(): 113 | if not without_layer_norm or not 'layer_norm' in name: 114 | sz = param.data.cpu().numpy().flatten().shape[0] 115 | raw = pvec[count:count + sz] 116 | reshaped = raw.reshape(param.data.cpu().numpy().shape) 117 | param.data = torch.from_numpy(copy.deepcopy(reshaped)).type(torch.FloatTensor) 118 | count += sz 119 | return pvec 120 | 121 | @property 122 | def init_dict(self): 123 | 124 | init_dict = {"num_inputs": self.num_inputs, "num_outputs": self.num_outputs, "hidden_size": self.hidden_size, 125 | "activation": self.activation, "output_activation": self.output_activation, 126 | "layer_norm": self.layer_norm} 127 | return init_dict 128 | 129 | @property 130 | def short_dict(self): 131 | 132 | short_dict = {"hidden_size": self.hidden_size, 133 | "activation": self.activation, "output_activation": self.output_activation, 134 | "layer_norm": self.layer_norm} 135 | return short_dict 136 | 137 | def add_layer(self): 138 | 139 | # add layer to hyper params 140 | if len(self.hidden_size) < 3: # HARD LIMIT 141 | self.hidden_size += [self.hidden_size[-1]] 142 | 143 | # copy old params to new net 144 | new_net 
= self.create_net() 145 | new_net = self.preserve_parameters(old_net=self.net, new_net=new_net) 146 | self.net = new_net 147 | else: 148 | self.add_node() 149 | 150 | def remove_layer(self): 151 | if len(self.hidden_size) > 1: # HARD LIMIT 152 | self.hidden_size = self.hidden_size[:1] 153 | new_net = self.create_net() 154 | new_net = self.shrink_preserve_parameters(old_net=self.net, new_net=new_net) 155 | self.net = new_net 156 | else: 157 | self.add_node() 158 | 159 | def add_node(self, hidden_layer=None, numb_new_nodes=None): 160 | 161 | if hidden_layer is None: 162 | hidden_layer = np.random.randint(0, len(self.hidden_size), 1)[0] 163 | else: 164 | hidden_layer = min(hidden_layer, len(self.hidden_size) - 1) 165 | if numb_new_nodes is None: 166 | numb_new_nodes = np.random.choice([16, 32, 64], 1)[0] 167 | 168 | if self.hidden_size[hidden_layer] + numb_new_nodes <= 500: # HARD LIMIT 169 | self.hidden_size[hidden_layer] += numb_new_nodes 170 | new_net = self.create_net() 171 | new_net = self.preserve_parameters(old_net=self.net, new_net=new_net) 172 | 173 | self.net = new_net 174 | 175 | return {"hidden_layer": hidden_layer, "numb_new_nodes": numb_new_nodes} 176 | 177 | def remove_node(self, hidden_layer=None, numb_new_nodes=None): 178 | 179 | if hidden_layer is None: 180 | hidden_layer = np.random.randint(0, len(self.hidden_size), 1)[0] 181 | else: 182 | hidden_layer = min(hidden_layer, len(self.hidden_size) - 1) 183 | if numb_new_nodes is None: 184 | numb_new_nodes = np.random.choice([16, 32, 64], 1)[0] 185 | 186 | if self.hidden_size[hidden_layer] - numb_new_nodes > 64: # HARD LIMIT 187 | self.hidden_size[hidden_layer] = self.hidden_size[hidden_layer] - numb_new_nodes 188 | new_net = self.create_net() 189 | new_net = self.shrink_preserve_parameters(old_net=self.net, new_net=new_net) 190 | 191 | self.net = new_net 192 | 193 | return {"hidden_layer": hidden_layer, "numb_new_nodes": numb_new_nodes} 194 | 195 | def clone(self): 196 | clone = EvolvableMLP(**copy.deepcopy(self.init_dict)) 197 | clone.load_state_dict(self.state_dict()) 198 | return clone 199 | 200 | def preserve_parameters(self, old_net, new_net): 201 | 202 | old_net_dict = dict(old_net.named_parameters()) 203 | 204 | for key, param in new_net.named_parameters(): 205 | if key in old_net_dict.keys(): 206 | if old_net_dict[key].data.size() == param.data.size(): 207 | param.data = old_net_dict[key].data 208 | else: 209 | if not "norm" in key: 210 | old_size = old_net_dict[key].data.size() 211 | new_size = param.data.size() 212 | if len(param.data.size()) == 1: 213 | param.data[:min(old_size[0], new_size[0])] = old_net_dict[key].data[ 214 | :min(old_size[0], new_size[0])] 215 | else: 216 | param.data[:min(old_size[0], new_size[0]), :min(old_size[1], new_size[1])] = old_net_dict[ 217 | key].data[ 218 | :min(old_size[ 219 | 0], 220 | new_size[ 221 | 0]), 222 | :min(old_size[ 223 | 1], 224 | new_size[ 225 | 1])] 226 | 227 | return new_net 228 | 229 | def shrink_preserve_parameters(self, old_net, new_net): 230 | 231 | old_net_dict = dict(old_net.named_parameters()) 232 | 233 | for key, param in new_net.named_parameters(): 234 | if key in old_net_dict.keys(): 235 | if old_net_dict[key].data.size() == param.data.size(): 236 | param.data = old_net_dict[key].data 237 | else: 238 | if not "norm" in key: 239 | old_size = old_net_dict[key].data.size() 240 | new_size = param.data.size() 241 | min_0 = min(old_size[0], new_size[0]) 242 | if len(param.data.size()) == 1: 243 | param.data[:min_0] = old_net_dict[key].data[:min_0] 244 | else: 245 | 
min_1 = min(old_size[1], new_size[1]) 246 | param.data[:min_0, :min_1] = old_net_dict[key].data[:min_0, :min_1] 247 | return new_net 248 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/individual_dqn.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from searl.neuroevolution.components.envolvable_cnn import EvolvableCnnDQN 4 | 5 | 6 | class DQNIndividual(): 7 | 8 | def __init__(self, state_dim, action_dim, actor_config, rl_config, index, device='cpu', replay_memory=None): 9 | self.state_dim = state_dim 10 | self.action_dim = action_dim 11 | self.actor_config = actor_config 12 | self.rl_config = rl_config 13 | self.index = index 14 | self.device = device 15 | 16 | self.actor = EvolvableCnnDQN(input_shape=state_dim, num_actions=action_dim, device=device, 17 | **actor_config).to(device) 18 | 19 | self.fitness = [] 20 | self.improvement = 0 21 | self.train_log = {"pre_fitness": None, "pre_rank": None, "post_fitness": None, "post_rank": None, "eval_eps": 0, 22 | "index": None, "parent_index": None, "mutation": None} 23 | 24 | self.replay_memory = replay_memory 25 | 26 | def clone(self, index=None): 27 | if index is None: 28 | index = self.index 29 | 30 | clone = type(self)(state_dim=self.state_dim, 31 | action_dim=self.action_dim, 32 | actor_config=copy.deepcopy(self.actor.short_dict), 33 | rl_config=copy.deepcopy(self.rl_config), 34 | index=index, 35 | replay_memory=self.replay_memory, 36 | device=self.device 37 | ) 38 | 39 | clone.fitness = copy.deepcopy(self.fitness) 40 | clone.train_log = copy.deepcopy(self.train_log) 41 | clone.actor = self.actor.clone() 42 | 43 | if self.replay_memory: 44 | self.replay_memory = copy.deepcopy(self.replay_memory) 45 | 46 | return clone 47 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/individual_td3.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | from searl.neuroevolution.components.envolvable_mlp import EvolvableMLP 4 | 5 | 6 | class Individual(): 7 | 8 | def __init__(self, state_dim, action_dim, actor_config, critic_config, rl_config, index, td3_double_q, 9 | critic_2_config=None, replay_memory=None): 10 | 11 | self.state_dim = state_dim 12 | self.action_dim = action_dim 13 | self.actor_config = actor_config 14 | self.critic_config = critic_config 15 | self.rl_config = rl_config 16 | self.index = index 17 | self.td3_double_q = td3_double_q 18 | 19 | if critic_2_config is None: 20 | critic_2_config = copy.deepcopy(critic_config) 21 | 22 | self.actor = EvolvableMLP(num_inputs=state_dim, num_outputs=action_dim, **actor_config) 23 | self.critic_1 = EvolvableMLP(num_inputs=state_dim + action_dim, num_outputs=1, **critic_config) 24 | if td3_double_q: 25 | self.critic_2 = EvolvableMLP(num_inputs=state_dim + action_dim, num_outputs=1, **critic_2_config) 26 | 27 | self.fitness = [] 28 | self.improvement = 0 29 | self.train_log = {"pre_fitness": None, "pre_rank": None, "post_fitness": None, "post_rank": None, "eval_eps": 0, 30 | "index": None, "parent_index": None, "mutation": None} 31 | 32 | self.replay_memory = replay_memory 33 | 34 | def clone(self, index=None): 35 | if index is None: 36 | index = self.index 37 | 38 | if self.td3_double_q: 39 | critic_2_config = copy.deepcopy(self.critic_2.short_dict) 40 | else: 41 | critic_2_config = None 42 | 43 | clone = type(self)(state_dim=self.state_dim, 44 | 
action_dim=self.action_dim, 45 | actor_config=copy.deepcopy(self.actor.short_dict), 46 | critic_config=copy.deepcopy(self.critic_1.short_dict), 47 | rl_config=copy.deepcopy(self.rl_config), 48 | index=index, 49 | td3_double_q=self.td3_double_q, 50 | critic_2_config=critic_2_config, 51 | replay_memory=self.replay_memory) 52 | 53 | clone.fitness = copy.deepcopy(self.fitness) 54 | clone.train_log = copy.deepcopy(self.train_log) 55 | clone.actor = self.actor.clone() 56 | clone.critic_1 = self.critic_1.clone() 57 | if self.td3_double_q: 58 | clone.critic_2 = self.critic_2.clone() 59 | 60 | if self.replay_memory: 61 | self.replay_memory = copy.deepcopy(self.replay_memory) 62 | 63 | return clone 64 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/replay_memory.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | import queue 3 | import time 4 | from typing import List, Dict 5 | 6 | import fastrand 7 | import numpy as np 8 | 9 | 10 | class MPReplayMemory(object): 11 | 12 | def __init__(self, seed, capacity, batch_size, reuse_batch): 13 | 14 | ctx = mp.get_context('spawn') 15 | mp_manager = ctx.Manager() 16 | self.push_queue = mp_manager.Queue() 17 | self.sample_queue = mp_manager.Queue() 18 | self.save_queue = mp_manager.Queue() 19 | self.batch_size = batch_size 20 | 21 | np.random.seed(seed) 22 | self.memory_manager = ctx.Process(target=self._memory_manager, 23 | args=(capacity, batch_size, reuse_batch, self.push_queue, self.sample_queue, 24 | self.save_queue)) 25 | self.memory_manager.daemon = True 26 | self.memory_manager.start() 27 | 28 | def load(self, replay_memory_dict): 29 | self.push_queue.put(replay_memory_dict) 30 | 31 | def save(self): 32 | self.push_queue.put("SAVE") 33 | try: 34 | save_dict = self.save_queue.get(timeout=10) 35 | return save_dict 36 | except queue.Empty: 37 | print("save failed") 38 | return "no_save" 39 | 40 | @staticmethod 41 | def _memory_manager(capacity: int, batch_size: int, reuse_batch: int, push_queue: mp.Queue, sample_queue: mp.Queue, 42 | save_queue: mp.Queue): 43 | memory = [] 44 | position = 0 45 | 46 | while True: 47 | if not push_queue.empty(): 48 | queue_output = push_queue.get() 49 | if queue_output == "QUIT": 50 | return 51 | 52 | elif queue_output == "SAVE": 53 | save_queue.put({"memory": memory, "position": position}) 54 | 55 | elif isinstance(queue_output, Dict): 56 | memory = queue_output["memory"] 57 | position = queue_output["position"] 58 | 59 | elif isinstance(queue_output, List): 60 | for transition in queue_output: 61 | if len(memory) < capacity: 62 | memory.append(transition) 63 | else: 64 | memory[position] = transition 65 | position = (position + 1) % capacity 66 | else: 67 | if len(memory) < capacity: 68 | memory.append(queue_output) 69 | else: 70 | memory[position] = queue_output 71 | position = (position + 1) % capacity 72 | 73 | if sample_queue.qsize() < 20 and len(memory) > batch_size: 74 | 75 | transistion_list = [] 76 | for _ in range(batch_size): 77 | idx = fastrand.pcg32bounded(len(memory)) 78 | transistion_list.append(memory[idx]) 79 | for _ in range(reuse_batch): 80 | sample_queue.put(transistion_list) 81 | 82 | def close(self): 83 | print("CLOSE REPLAY MEMORY") 84 | self.push_queue.put("QUIT") 85 | while not self.push_queue.empty(): 86 | time.sleep(1) 87 | 88 | def get_push_queue(self): 89 | return self.push_queue 90 | 91 | def get_sample_queue(self): 92 | return self.sample_queue 93 | 94 | 95 
| class ReplayMemory(object): 96 | 97 | def __init__(self, capacity: int, batch_size: int): 98 | self.storage = [] 99 | self.capacity = capacity 100 | self.batch_size = batch_size 101 | self.ptr = 0 102 | 103 | def add(self, transistions): 104 | if isinstance(transistions, List): 105 | for transition in transistions: 106 | self._add(transition) 107 | else: 108 | self._add(transistions) 109 | 110 | def put(self, transistions): 111 | self.add(transistions) 112 | 113 | def _add(self, transistion): 114 | if len(self.storage) == self.capacity: 115 | self.storage[int(self.ptr)] = transistion 116 | self.ptr = (self.ptr + 1) % self.capacity 117 | else: 118 | self.storage.append(transistion) 119 | 120 | def get(self): 121 | return self.sample() 122 | 123 | def sample(self): 124 | ind = np.random.randint(0, len(self.storage), size=self.batch_size) 125 | 126 | transition_list = [] 127 | for i in ind: 128 | transition_list.append(self.storage[i]) 129 | 130 | return transition_list 131 | -------------------------------------------------------------------------------- /searl/neuroevolution/components/utils.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | import numpy as np 4 | import torch 5 | 6 | fields = ('state', 'action', 'next_state', 'reward', 'done', 'weight', 'index') 7 | Transition = namedtuple('Transition', fields) 8 | Transition.__new__.__defaults__ = (None,) * len(Transition._fields) 9 | 10 | 11 | def to_tensor(ndarray, requires_grad=False): 12 | return torch.from_numpy(ndarray).float().requires_grad_(requires_grad) 13 | 14 | 15 | def feature_scaling(x): 16 | return (x - np.min(x)) / (np.max(x) - np.min(x)) 17 | 18 | 19 | def softmax(x): 20 | return np.exp(x) / np.sum(np.exp(x)) 21 | 22 | 23 | def soft_update(target, source, tau): 24 | for target_param, source_param in zip(target.parameters(), source.parameters()): 25 | target_param.data.copy_(target_param.data * (1.0 - tau) + source_param.data * tau) 26 | -------------------------------------------------------------------------------- /searl/neuroevolution/evaluation_dqn.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import numpy as np 4 | import torch 5 | 6 | from searl.neuroevolution.components.utils import Transition 7 | from searl.rl_algorithms.components.wrappers import make_atari, wrap_deepmind, wrap_pytorch 8 | 9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 10 | print("train CUDA", device == torch.device("cuda"), device) 11 | 12 | 13 | class MPEvaluation(): 14 | def __init__(self, config, logger, replay_memory=None): 15 | 16 | self.rng = np.random.RandomState(config.seed.evaluation) 17 | self.cfg = config 18 | self.log = logger 19 | self.push_queue = replay_memory 20 | self.eval_episodes = config.eval.eval_episodes 21 | 22 | def test_individual(self, individual, epoch): 23 | return_dict = self._evaluate_individual(individual, self.cfg, self.cfg.eval.test_episodes, epoch, False) 24 | fitness = np.mean(return_dict[individual.index]["fitness_list"]) 25 | return fitness 26 | 27 | @staticmethod 28 | def _evaluate_individual(individual, config, num_episodes, seed, exploration_noise=False, start_phase=False): 29 | 30 | actor_net = individual.actor 31 | 32 | num_frames = 0 33 | fitness_list = [] 34 | transistions_list = [] 35 | episodes = 0 36 | 37 | env = make_atari(config.env.name) 38 | env = wrap_deepmind(env) 39 | env = wrap_pytorch(env) 40 | env.seed(seed) 
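        # The no_grad loop below rolls out full episodes with the current actor until
        # both `num_episodes` episodes and `config.eval.min_eval_steps` frames have been
        # collected; per-episode returns go into fitness_list, and the raw transitions are
        # returned so evaluate_population can push them into the shared or per-individual
        # replay memory.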
41 | 42 | actor_net.eval() 43 | actor_net.to(device) 44 | actor_net.device = device 45 | 46 | with torch.no_grad(): 47 | while episodes < num_episodes or num_frames < config.eval.min_eval_steps: 48 | episode_fitness = 0.0 49 | episode_transitions = [] 50 | state = env.reset() 51 | 52 | done = False 53 | while not done: 54 | action = actor_net.act(state) 55 | 56 | next_state, reward, done, info = env.step(action) 57 | episode_fitness += reward 58 | num_frames += 1 59 | 60 | transition = Transition(torch.FloatTensor(state), torch.LongTensor([action]), 61 | torch.FloatTensor(next_state), torch.FloatTensor(np.array([reward])), 62 | torch.FloatTensor(np.array([done]).astype('uint8')) 63 | ) 64 | 65 | episode_transitions.append(transition) 66 | state = next_state 67 | episodes += 1 68 | fitness_list.append(episode_fitness) 69 | transistions_list.append(episode_transitions) 70 | 71 | actor_net.to(torch.device("cpu")) 72 | 73 | return {individual.index: {"fitness_list": fitness_list, "num_episodes": num_episodes, "num_frames": num_frames, 74 | "id": individual.index, "transitions": transistions_list}} 75 | 76 | def evaluate_population(self, population: List, exploration_noise=False, total_frames=1): 77 | 78 | population_id_lookup = [ind.index for ind in population] 79 | new_population_mean_fitness = np.zeros(len(population)) 80 | new_population_var_fitness = np.zeros(len(population)) 81 | 82 | start_phase = total_frames <= self.cfg.rl.start_timesteps 83 | if start_phase: 84 | self.log("start phase", time_step=total_frames) 85 | 86 | args_list = [(ind, self.cfg, self.eval_episodes, self.rng.randint(0, 100000), exploration_noise, start_phase) 87 | for ind in population] 88 | 89 | result_dict = [] 90 | for args in args_list: 91 | result_dict.append(self._evaluate_individual(*args)) 92 | 93 | eval_frames = 0 94 | for list_element in result_dict: 95 | for ind_id, value_dict in list_element.items(): 96 | pop_idx = population_id_lookup.index(ind_id) 97 | new_population_mean_fitness[pop_idx] = np.mean(value_dict['fitness_list']) 98 | new_population_var_fitness[pop_idx] = np.var(value_dict['fitness_list']) 99 | eval_frames += value_dict['num_frames'] 100 | 101 | population[pop_idx].train_log["eval_eps"] = self.eval_episodes 102 | 103 | for transitions in value_dict['transitions']: 104 | if self.cfg.nevo.ind_memory: 105 | population[pop_idx].replay_memory.add(transitions) 106 | else: 107 | self.push_queue.put(transitions) 108 | 109 | for idx in range(len(population)): 110 | population[idx].train_log["post_fitness"] = new_population_mean_fitness[idx] 111 | population[idx].train_log["index"] = population[idx].index 112 | self.log.csv.log_csv(population[idx].train_log) 113 | population[idx].train_log.update( 114 | {"pre_fitness": new_population_mean_fitness[idx], 115 | "eval_eps": 0}) # , "pre_rank": population_rank[idx], "eval_eps":0} 116 | population[idx].fitness.append(new_population_mean_fitness[idx]) 117 | if len(population[idx].fitness) > 1: 118 | population[idx].improvement = population[idx].fitness[-1] - population[idx].fitness[-2] 119 | else: 120 | population[idx].improvement = population[idx].fitness[-1] 121 | 122 | return new_population_mean_fitness, new_population_var_fitness, eval_frames 123 | -------------------------------------------------------------------------------- /searl/neuroevolution/evaluation_td3.py: -------------------------------------------------------------------------------- 1 | from collections import ChainMap 2 | from typing import List 3 | 4 | import gym 5 | import numpy as np 
6 | import torch
7 | 
8 | from searl.neuroevolution.components.utils import to_tensor, Transition
9 | 
10 | 
11 | class MPEvaluation():
12 |     """
13 |     Evaluates a population and stores the collected transitions in a push_queue.
14 | 
15 |     """
16 | 
17 |     def __init__(self, config, logger, push_queue=None):
18 | 
19 |         self.rng = np.random.RandomState(config.seed.evaluation)
20 |         self.cfg = config
21 |         self.log = logger
22 |         self.push_queue = push_queue
23 |         self.eval_episodes = config.eval.eval_episodes
24 | 
25 |     def test_individual(self, individual, epoch):
26 |         return_dict = self._evaluate_individual(individual, self.cfg, self.cfg.eval.test_episodes, epoch, False)
27 |         fitness = np.mean(return_dict[individual.index]["fitness_list"])
28 |         return fitness
29 | 
30 |     @staticmethod
31 |     def _evaluate_individual(individual, config, num_episodes, seed, exploration_noise=False, start_phase=False):
32 | 
33 |         actor_net = individual.actor
34 | 
35 |         num_frames = 0
36 |         fitness_list = []
37 |         transistions_list = []
38 |         episodes = 0
39 | 
40 |         env = gym.make(config.env.name)
41 |         env.seed(seed)
42 |         actor_net.eval()
43 | 
44 |         with torch.no_grad():
45 |             while episodes < num_episodes or num_frames < config.eval.min_eval_steps:
46 |                 episode_fitness = 0.0
47 |                 episode_transitions = []
48 |                 state = env.reset()
49 |                 t_state = to_tensor(state).unsqueeze(0)
50 |                 done = False
51 |                 while not done:
52 |                     if start_phase:
53 |                         action = env.action_space.sample()
54 |                         action = to_tensor(action)
55 |                     else:
56 |                         action = actor_net(t_state)
57 |                     action = action.clamp(-1, 1)
58 |                     action = action.data.numpy()
59 |                     if exploration_noise is not False:
60 |                         action += config.eval.exploration_noise * np.random.randn(config.action_dim)
61 |                         action = np.clip(action, -1, 1)
62 |                     action = action.flatten()
63 | 
64 |                     step_action = (action + 1) / 2  # [-1, 1] => [0, 1]
65 |                     step_action *= (env.action_space.high - env.action_space.low)
66 |                     step_action += env.action_space.low
67 | 
68 |                     next_state, reward, done, info = env.step(step_action)  # Simulate one step in environment
69 | 
70 |                     done_bool = 0 if num_frames + 1 == env._max_episode_steps else float(done)
71 | 
72 |                     t_next_state = to_tensor(next_state).unsqueeze(0)
73 | 
74 |                     episode_fitness += reward
75 |                     num_frames += 1
76 | 
77 |                     transition = Transition(state, action, next_state, np.array([reward]),
78 |                                             np.array([done_bool]).astype('uint8'))
79 |                     episode_transitions.append(transition)
80 |                     t_state = t_next_state
81 |                     state = next_state
82 |                 episodes += 1
83 |                 fitness_list.append(episode_fitness)
84 |                 transistions_list.append(episode_transitions)
85 | 
86 |         return {individual.index: {"fitness_list": fitness_list, "num_episodes": num_episodes, "num_frames": num_frames,
87 |                                    "id": individual.index, "transitions": transistions_list}}
88 | 
89 |     def evaluate_population(self, population: List, exploration_noise=False, total_frames=1, pool=None):
90 |         population_id_lookup = [ind.index for ind in population]
91 |         new_population_mean_fitness = np.zeros(len(population))
92 |         new_population_var_fitness = np.zeros(len(population))
93 | 
94 |         start_phase = total_frames <= self.cfg.rl.start_timesteps
95 |         if start_phase:
96 |             self.log("start phase", time_step=total_frames)
97 | 
98 |         args_list = [(ind, self.cfg, self.eval_episodes, self.rng.randint(0, 100000), exploration_noise, start_phase)
99 |                      for ind in population]
100 |         result_dicts = [pool.apply(self._evaluate_individual, args) for args in args_list]
101 |         result_dict = dict(ChainMap(*result_dicts))
102 | 
103 |         eval_frames = 0
104 |         for ind_id, value_dict in result_dict.items():
105 |             pop_idx = population_id_lookup.index(ind_id)
106 |             new_population_mean_fitness[pop_idx] = np.mean(value_dict['fitness_list'])
107 |             new_population_var_fitness[pop_idx] = np.var(value_dict['fitness_list'])
108 |             eval_frames += value_dict['num_frames']
109 | 
110 |             population[pop_idx].train_log["eval_eps"] = self.eval_episodes
111 | 
112 |             for transitions in value_dict['transitions']:
113 |                 if self.cfg.nevo.ind_memory:
114 |                     population[pop_idx].replay_memory.add(transitions)
115 |                 else:
116 |                     self.push_queue.put(transitions)
117 | 
118 |         for idx in range(len(population)):
119 |             population[idx].train_log["post_fitness"] = new_population_mean_fitness[idx]
120 |             population[idx].train_log["index"] = population[idx].index
121 |             self.log.csv.log_csv(population[idx].train_log)
122 |             population[idx].train_log.update({"pre_fitness": new_population_mean_fitness[idx], "eval_eps": 0})
123 |             population[idx].fitness.append(new_population_mean_fitness[idx])
124 |             if len(population[idx].fitness) > 1:
125 |                 population[idx].improvement = population[idx].fitness[-1] - population[idx].fitness[-2]
126 |             else:
127 |                 population[idx].improvement = population[idx].fitness[-1]
128 | 
129 |         return new_population_mean_fitness, new_population_var_fitness, eval_frames
130 | 
--------------------------------------------------------------------------------
/searl/neuroevolution/mutation_cnn.py:
--------------------------------------------------------------------------------
1 | import fastrand
2 | import numpy as np
3 | 
4 | 
5 | class Mutations():
6 | 
7 |     def __init__(self, config):
8 |         self.cfg = config
9 |         self.rng = np.random.RandomState(self.cfg.seed.mutation)
10 | 
11 |     def no_mutation(self, individual):
12 |         individual.train_log["mutation"] = "no_mutation"
13 |         return individual
14 | 
15 |     def mutation(self, population):
16 | 
17 |         mutation_options = []
18 |         mutation_proba = []
19 |         if self.cfg.mutation.no_mutation:
20 |             mutation_options.append(self.no_mutation)
21 |             mutation_proba.append(float(self.cfg.mutation.no_mutation))
22 |         if self.cfg.mutation.architecture:
23 |             mutation_options.append(self.architecture_mutate)
24 |             mutation_proba.append(float(self.cfg.mutation.architecture))
25 |         if self.cfg.mutation.parameters:
26 |             mutation_options.append(self.parameter_mutation)
27 |             mutation_proba.append(float(self.cfg.mutation.parameters))
28 |         if self.cfg.mutation.activation:
29 |             mutation_options.append(self.activation_mutation)
30 |             mutation_proba.append(float(self.cfg.mutation.activation))
31 |         if self.cfg.mutation.rl_hyperparam:
32 |             mutation_options.append(self.rl_hyperparam_mutation)
33 |             mutation_proba.append(float(self.cfg.mutation.rl_hyperparam))
34 | 
35 |         if len(mutation_options) == 0:
36 |             return population
37 | 
38 |         mutation_proba = np.array(mutation_proba) / np.sum(mutation_proba)
39 | 
40 |         mutation_choice = self.rng.choice(mutation_options, len(population), p=mutation_proba)
41 | 
42 |         mutated_population = []
43 |         for mutation, individual in zip(mutation_choice, population):
44 |             mutated_population.append(mutation(individual))
45 | 
46 |         return mutated_population
47 | 
48 |     def rl_hyperparam_mutation(self, individual):
49 | 
50 |         rl_config = individual.rl_config
51 |         rl_params = self.cfg.mutation.rl_hp_selection
52 |         mutate_param = self.rng.choice(rl_params, 1)[0]
53 | 
54 |         random_num = self.rng.uniform(0, 1)
55 |         if mutate_param == 'train_frames_fraction':
56 |             if random_num > 0.5:
57 |                 setattr(rl_config, mutate_param, min(3.0, max(0.1, getattr(rl_config, mutate_param) * 1.2)))
58 |             else:
59 |                 setattr(rl_config, mutate_param, min(3.0, max(0.1, getattr(rl_config, mutate_param) * 0.8)))
60 |         elif mutate_param == 'batch_size':
61 |             if random_num > 0.5:
62 |                 setattr(rl_config, mutate_param, min(128, max(8, int(getattr(rl_config, mutate_param) * 1.2))))
63 |             else:
64 |                 setattr(rl_config, mutate_param, min(128, max(8, int(getattr(rl_config, mutate_param) * 0.8))))
65 |         elif mutate_param == 'lr_actor':
66 |             if random_num > 0.5:
67 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 1.2)))
68 |             else:
69 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 0.8)))
70 |         elif mutate_param == 'lr_critic':
71 |             if random_num > 0.5:
72 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 1.2)))
73 |             else:
74 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 0.8)))
75 |         elif mutate_param == 'td3_policy_noise':
76 |             if getattr(rl_config, mutate_param):
77 |                 setattr(rl_config, mutate_param, False)
78 |             else:
79 |                 setattr(rl_config, mutate_param, 0.1)
80 |         elif mutate_param == 'td3_update_freq':
81 |             if random_num > 0.5:
82 |                 setattr(rl_config, mutate_param, min(10, max(1, int(getattr(rl_config, mutate_param) + 1))))
83 |             else:
84 |                 setattr(rl_config, mutate_param, min(10, max(1, int(getattr(rl_config, mutate_param) - 1))))
85 |         elif mutate_param == 'optimizer':
86 |             opti_selection = ["adam", "adamax", "rmsprop", "sdg"]
87 |             opti_selection.remove(getattr(rl_config, mutate_param))
88 |             opti = self.rng.choice(opti_selection, 1)[0]
89 |             setattr(rl_config, mutate_param, opti)
90 | 
91 |         individual.train_log["mutation"] = "rl_" + mutate_param
92 |         individual.rl_config = rl_config
93 |         return individual
94 | 
95 |     def activation_mutation(self, individual):
96 |         individual.actor = self._permutate_activation(individual.actor)
97 |         individual.train_log["mutation"] = "activation"
98 |         return individual
99 | 
100 |     def _permutate_activation(self, network):
101 | 
102 |         possible_activations = ['relu', 'elu', 'gelu']
103 |         current_activation = network.mlp_activation
104 |         possible_activations.remove(current_activation)
105 |         new_activation = self.rng.choice(possible_activations, size=1)[0]
106 |         net_dict = network.init_dict
107 |         net_dict['mlp_activation'] = new_activation
108 |         net_dict['cnn_activation'] = new_activation
109 |         new_network = type(network)(**net_dict)
110 |         new_network.load_state_dict(network.state_dict())
111 |         network = new_network
112 | 
113 |         return network
114 | 
115 |     def parameter_mutation(self, individual):
116 | 
117 |         offspring = individual.actor
118 | 
119 |         offspring.cpu()
120 | 
121 |         offspring = self.classic_parameter_mutation(offspring)
122 |         individual.train_log["mutation"] = "classic_parameter"
123 | 
124 |         individual.actor = offspring
125 |         return individual
126 | 
127 |     def regularize_weight(self, weight, mag):
128 |         if weight > mag: weight = mag
129 |         if weight < -mag: weight = -mag
130 |         return weight
131 | 
132 |     def classic_parameter_mutation(self, network):
133 |         mut_strength = self.cfg.mutation.mutation_sd
134 |         num_mutation_frac = 0.1
135 |         super_mut_strength = 10
136 |         super_mut_prob = 0.05
137 |         reset_prob = super_mut_prob + 0.05
138 | 
139 |         model_params = network.state_dict()
140 | 
141 |         potential_keys = []
142 |         for i, key in enumerate(model_params):  # Mutate each param
143 |             if not 'norm' in key:
144 |                 W = model_params[key]
145 |                 if len(W.shape) == 2:  # Weights, no bias
146 |                     potential_keys.append(key)
147 | 
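        # The block below mutates a random subset of the 2-D weight matrices: up to
        # ~num_mutation_frac of the entries in each chosen matrix are perturbed, using a
        # "super" mutation (noise sd = super_mut_strength * |w|) with probability
        # super_mut_prob, a reset to N(0, 1) with probability reset_prob - super_mut_prob,
        # and ordinary Gaussian noise (sd = mutation_sd * |w|) otherwise; each touched
        # weight is finally clamped to +/- 1e6 by regularize_weight.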
148 | how_many = np.random.randint(1, len(potential_keys) + 1, 1)[0] 149 | chosen_keys = np.random.choice(potential_keys, how_many, replace=False) 150 | 151 | for key in chosen_keys: 152 | # References to the variable keys 153 | W = model_params[key] 154 | num_weights = W.shape[0] * W.shape[1] 155 | # Number of mutation instances 156 | num_mutations = fastrand.pcg32bounded(int(np.ceil(num_mutation_frac * num_weights))) 157 | for _ in range(num_mutations): 158 | ind_dim1 = fastrand.pcg32bounded(W.shape[0]) 159 | ind_dim2 = fastrand.pcg32bounded(W.shape[-1]) 160 | random_num = self.rng.uniform(0, 1) 161 | 162 | if random_num < super_mut_prob: # Super Mutation probability 163 | W[ind_dim1, ind_dim2] += self.rng.normal(0, np.abs(super_mut_strength * W[ind_dim1, ind_dim2])) 164 | elif random_num < reset_prob: # Reset probability 165 | W[ind_dim1, ind_dim2] = self.rng.normal(0, 1) 166 | else: # mutauion even normal 167 | W[ind_dim1, ind_dim2] += self.rng.normal(0, np.abs(mut_strength * W[ind_dim1, ind_dim2])) 168 | 169 | # Regularization hard limit 170 | W[ind_dim1, ind_dim2] = self.regularize_weight(W[ind_dim1, ind_dim2], 1000000) 171 | return network 172 | 173 | def architecture_mutate(self, individual): 174 | 175 | offspring_actor = individual.actor.clone() 176 | offspring_actor.cpu() 177 | 178 | rand_numb = self.rng.uniform(0, 1) 179 | if 0 <= rand_numb < 0.1: 180 | offspring_actor.add_mlp_layer() 181 | individual.train_log["mutation"] = "architecture_new_mlp_layer" 182 | 183 | elif 0.1 <= rand_numb < 0.2: 184 | offspring_actor.add_cnn_layer() 185 | individual.train_log["mutation"] = "architecture_new_cnn_layer" 186 | 187 | elif 0.2 <= rand_numb < 0.3: 188 | offspring_actor.change_cnn_kernal() 189 | individual.train_log["mutation"] = "architecture_change_cnn_kernal" 190 | elif 0.3 <= rand_numb < 0.65: 191 | offspring_actor.add_cnn_channel() 192 | individual.train_log["mutation"] = "architecture_add_cnn_channel" 193 | else: 194 | offspring_actor.add_mlp_node() 195 | individual.train_log["mutation"] = "architecture_add_mlp_node" 196 | 197 | individual.actor = offspring_actor 198 | 199 | return individual 200 | -------------------------------------------------------------------------------- /searl/neuroevolution/mutation_mlp.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import fastrand 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | from torch.optim import Adam 8 | 9 | 10 | class Mutations(): 11 | 12 | def __init__(self, config, replay_sample_queue): 13 | self.cfg = config 14 | self.rng = np.random.RandomState(self.cfg.seed.mutation) 15 | self.replay_sample_queue = replay_sample_queue 16 | 17 | def no_mutation(self, individual): 18 | individual.train_log["mutation"] = "no_mutation" 19 | return individual 20 | 21 | def mutation(self, population): 22 | 23 | mutation_options = [] 24 | mutation_proba = [] 25 | if self.cfg.mutation.no_mutation: 26 | mutation_options.append(self.no_mutation) 27 | mutation_proba.append(float(self.cfg.mutation.no_mutation)) 28 | if self.cfg.mutation.architecture: 29 | mutation_options.append(self.architecture_mutate) 30 | mutation_proba.append(float(self.cfg.mutation.architecture)) 31 | if self.cfg.mutation.parameters: 32 | mutation_options.append(self.parameter_mutation) 33 | mutation_proba.append(float(self.cfg.mutation.parameters)) 34 | if self.cfg.mutation.activation: 35 | mutation_options.append(self.activation_mutation) 36 | 
mutation_proba.append(float(self.cfg.mutation.activation))
37 |         if self.cfg.mutation.rl_hyperparam:
38 |             mutation_options.append(self.rl_hyperparam_mutation)
39 |             mutation_proba.append(float(self.cfg.mutation.rl_hyperparam))
40 | 
41 |         if len(mutation_options) == 0:
42 |             return population
43 | 
44 |         mutation_proba = np.array(mutation_proba) / np.sum(mutation_proba)
45 | 
46 |         mutation_choice = self.rng.choice(mutation_options, len(population), p=mutation_proba)
47 | 
48 |         mutated_population = []
49 |         for mutation, individual in zip(mutation_choice, population):
50 |             mutated_population.append(mutation(individual))
51 | 
52 |         return mutated_population
53 | 
54 |     def rl_hyperparam_mutation(self, individual):
55 | 
56 |         rl_config = individual.rl_config
57 |         rl_params = self.cfg.mutation.rl_hp_selection
58 |         mutate_param = self.rng.choice(rl_params, 1)[0]
59 | 
60 |         random_num = self.rng.uniform(0, 1)
61 |         if mutate_param == 'train_frames_fraction':
62 |             if random_num > 0.5:
63 |                 setattr(rl_config, mutate_param, min(3.0, max(0.1, getattr(rl_config, mutate_param) * 1.2)))
64 |             else:
65 |                 setattr(rl_config, mutate_param, min(3.0, max(0.1, getattr(rl_config, mutate_param) * 0.8)))
66 |         elif mutate_param == 'batch_size':
67 |             if random_num > 0.5:
68 |                 setattr(rl_config, mutate_param, min(128, max(8, int(getattr(rl_config, mutate_param) * 1.2))))
69 |             else:
70 |                 setattr(rl_config, mutate_param, min(128, max(8, int(getattr(rl_config, mutate_param) * 0.8))))
71 |         elif mutate_param == 'lr_actor':
72 |             if random_num > 0.5:
73 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 1.2)))
74 |             else:
75 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 0.8)))
76 |         elif mutate_param == 'lr_critic':
77 |             if random_num > 0.5:
78 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 1.2)))
79 |             else:
80 |                 setattr(rl_config, mutate_param, min(0.005, max(0.00001, getattr(rl_config, mutate_param) * 0.8)))
81 |         elif mutate_param == 'td3_policy_noise':
82 |             if getattr(rl_config, mutate_param):
83 |                 setattr(rl_config, mutate_param, False)
84 |             else:
85 |                 setattr(rl_config, mutate_param, 0.1)
86 |         elif mutate_param == 'td3_update_freq':
87 |             if random_num > 0.5:
88 |                 setattr(rl_config, mutate_param, min(10, max(1, int(getattr(rl_config, mutate_param) + 1))))
89 |             else:
90 |                 setattr(rl_config, mutate_param, min(10, max(1, int(getattr(rl_config, mutate_param) - 1))))
91 |         elif mutate_param == 'optimizer':
92 |             opti_selection = ["adam", "adamax", "rmsprop", "sdg"]
93 |             opti_selection.remove(getattr(rl_config, mutate_param))
94 |             opti = self.rng.choice(opti_selection, 1)[0]
95 |             setattr(rl_config, mutate_param, opti)
96 | 
97 |         individual.train_log["mutation"] = "rl_" + mutate_param
98 |         individual.rl_config = rl_config
99 |         return individual
100 | 
101 |     def activation_mutation(self, individual):
102 |         individual.actor = self._permutate_activation(individual.actor)
103 |         individual.critic_1 = self._permutate_activation(individual.critic_1)
104 |         if self.cfg.train.td3_double_q:
105 |             individual.critic_2 = self._permutate_activation(individual.critic_2)
106 |         individual.train_log["mutation"] = "activation"
107 |         return individual
108 | 
109 |     def _permutate_activation(self, network):
110 | 
111 |         possible_activations = ['relu', 'elu', 'tanh']
112 |         current_activation = network.activation
113 |         possible_activations.remove(current_activation)
114 |         new_activation = self.rng.choice(possible_activations, size=1)[0]
115
| net_dict = network.init_dict 116 | net_dict['activation'] = new_activation 117 | new_network = type(network)(**net_dict) 118 | new_network.load_state_dict(network.state_dict()) 119 | network = new_network 120 | 121 | return network 122 | 123 | def parameter_mutation(self, individual): 124 | 125 | offspring = individual.actor 126 | 127 | offspring = self.classic_parameter_mutation(offspring) 128 | individual.train_log["mutation"] = "classic_parameter" 129 | 130 | individual.actor = offspring 131 | return individual 132 | 133 | def regularize_weight(self, weight, mag): 134 | if weight > mag: weight = mag 135 | if weight < -mag: weight = -mag 136 | return weight 137 | 138 | def classic_parameter_mutation(self, network): 139 | mut_strength = self.cfg.mutation.mutation_sd 140 | num_mutation_frac = 0.1 141 | super_mut_strength = 10 142 | super_mut_prob = 0.05 143 | reset_prob = super_mut_prob + 0.05 144 | 145 | model_params = network.state_dict() 146 | 147 | potential_keys = [] 148 | for i, key in enumerate(model_params): # Mutate each param 149 | if not 'norm' in key: 150 | W = model_params[key] 151 | if len(W.shape) == 2: # Weights, no bias 152 | potential_keys.append(key) 153 | 154 | how_many = np.random.randint(1, len(potential_keys) + 1, 1)[0] 155 | chosen_keys = np.random.choice(potential_keys, how_many, replace=False) 156 | 157 | for key in chosen_keys: 158 | # References to the variable keys 159 | W = model_params[key] 160 | num_weights = W.shape[0] * W.shape[1] 161 | # Number of mutation instances 162 | num_mutations = fastrand.pcg32bounded(int(np.ceil(num_mutation_frac * num_weights))) 163 | for _ in range(num_mutations): 164 | ind_dim1 = fastrand.pcg32bounded(W.shape[0]) 165 | ind_dim2 = fastrand.pcg32bounded(W.shape[-1]) 166 | random_num = self.rng.uniform(0, 1) 167 | 168 | if random_num < super_mut_prob: # Super Mutation probability 169 | W[ind_dim1, ind_dim2] += self.rng.normal(0, np.abs(super_mut_strength * W[ind_dim1, ind_dim2])) 170 | elif random_num < reset_prob: # Reset probability 171 | W[ind_dim1, ind_dim2] = self.rng.normal(0, 1) 172 | else: # mutauion even normal 173 | W[ind_dim1, ind_dim2] += self.rng.normal(0, np.abs(mut_strength * W[ind_dim1, ind_dim2])) 174 | 175 | # Regularization hard limit 176 | W[ind_dim1, ind_dim2] = self.regularize_weight(W[ind_dim1, ind_dim2], 1000000) 177 | return network 178 | 179 | 180 | def architecture_mutate(self, individual): 181 | 182 | offspring_actor = individual.actor.clone() 183 | offspring_critic_1 = individual.critic_1.clone() 184 | if self.cfg.train.td3_double_q: 185 | offspring_critic_2 = individual.critic_2.clone() 186 | 187 | rand_numb = self.rng.uniform(0, 1) 188 | if rand_numb < self.cfg.mutation.new_layer_prob: 189 | offspring_actor.add_layer() 190 | offspring_critic_1.add_layer() 191 | if self.cfg.train.td3_double_q: 192 | offspring_critic_2.add_layer() 193 | individual.train_log["mutation"] = "architecture_new_layer" 194 | else: 195 | node_dict = offspring_actor.add_node() 196 | offspring_critic_1.add_node(**node_dict) 197 | if self.cfg.train.td3_double_q: 198 | offspring_critic_2.add_node(**node_dict) 199 | individual.train_log["mutation"] = "architecture_new_node" 200 | 201 | individual.actor = offspring_actor 202 | individual.critic_1 = offspring_critic_1 203 | if self.cfg.train.td3_double_q: 204 | individual.critic_2 = offspring_critic_2 205 | return individual 206 | -------------------------------------------------------------------------------- /searl/neuroevolution/searl_dqn.py: 
-------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from searl.neuroevolution.components.individual_dqn import DQNIndividual 8 | from searl.neuroevolution.components.replay_memory import ReplayMemory 9 | from searl.neuroevolution.evaluation_dqn import MPEvaluation 10 | from searl.neuroevolution.mutation_cnn import Mutations 11 | from searl.neuroevolution.tournament_selection import TournamentSelection 12 | from searl.neuroevolution.training_dqn import DQNTraining 13 | from searl.rl_algorithms.components.wrappers import make_atari, wrap_deepmind, wrap_pytorch 14 | from searl.utils.supporter import Supporter 15 | 16 | 17 | class SEARLforDQN(): 18 | 19 | def __init__(self, config, logger, checkpoint): 20 | 21 | self.cfg = config 22 | self.log = logger 23 | self.ckp = checkpoint 24 | 25 | torch.manual_seed(self.cfg.seed.torch) 26 | np.random.seed(self.cfg.seed.numpy) 27 | 28 | self.log.print_config(self.cfg) 29 | self.log.csv.fieldnames( 30 | ["epoch", "time_string", "eval_eps", "pre_fitness", "pre_rank", "post_fitness", "post_rank", "index", 31 | "parent_index", "mutation", "train_iterations", "train_losses", 32 | ] + list(self.cfg.rl.get_dict.keys())) 33 | 34 | self.log.log("initialize replay memory") 35 | 36 | self.replay_memory = ReplayMemory(capacity=self.cfg.train.replay_memory_size, batch_size=self.cfg.rl.batch_size) 37 | 38 | self.eval = MPEvaluation(config=self.cfg, logger=self.log, replay_memory=self.replay_memory) 39 | 40 | self.tournament = TournamentSelection(config=self.cfg) 41 | 42 | self.mutation = Mutations(config=self.cfg) 43 | 44 | self.training = DQNTraining(config=self.cfg, replay_memory=self.replay_memory) 45 | 46 | def initial_population(self): 47 | self.log.log("initialize population") 48 | population = [] 49 | for idx in range(self.cfg.nevo.population_size): 50 | 51 | if self.cfg.nevo.ind_memory: 52 | replay_memory = ReplayMemory(capacity=self.cfg.train.replay_memory_size, 53 | batch_size=self.cfg.rl.batch_size) 54 | else: 55 | replay_memory = False 56 | 57 | actor_config = copy.deepcopy(self.cfg.actor.get_dict) 58 | rl_config = copy.deepcopy(self.cfg.rl) 59 | 60 | indi = DQNIndividual(state_dim=self.cfg.state_dim, action_dim=self.cfg.action_dim, 61 | actor_config=actor_config, 62 | rl_config=rl_config, index=idx, replay_memory=replay_memory) 63 | population.append(indi) 64 | return population 65 | 66 | def evolve_population(self, population, epoch=1, num_frames=0): 67 | 68 | frames_since_mut = 0 69 | num_frames = num_frames 70 | epoch = epoch 71 | 72 | while True: 73 | epoch_time = time.time() 74 | self.log(f"##### START EPOCH {epoch}", time_step=num_frames) 75 | 76 | for ind in population: 77 | ind.train_log['epoch'] = epoch 78 | 79 | population_mean_fitness, population_var_fitness, eval_frames = \ 80 | self.log.log_func(self.eval.evaluate_population, population=population, 81 | exploration_noise=self.cfg.eval.exploration_noise, 82 | total_frames=num_frames) 83 | self.log("eval_frames", eval_frames) 84 | num_frames += eval_frames 85 | frames_since_mut += eval_frames 86 | 87 | self.log.population_info(population_mean_fitness, population_var_fitness, population, num_frames, epoch) 88 | 89 | self.ckp.save_object(population, name="population") 90 | self.log.log("save population") 91 | 92 | if num_frames >= self.cfg.train.num_frames: 93 | break 94 | 95 | if self.cfg.nevo.selection: 96 | elite, population = self.log.log_func(self.tournament.select, population) 
)
97 | test_fitness = self.eval.test_individual(elite, epoch) 98 | self.log(f"##### ELITE INFO {epoch}", time_step=num_frames) 99 | self.log("best_test_fitness", test_fitness, num_frames) 100 | 101 | if self.cfg.nevo.mutation: 102 | population = self.log.log_func(self.mutation.mutation, population) 103 | 104 | if self.cfg.nevo.training: 105 | iterations = min( 106 | max(self.cfg.train.min_train_steps, int(self.cfg.rl.train_frames_fraction * eval_frames)), 107 | self.cfg.train.max_train_steps) 108 | self.log("training_iterations", iterations) 109 | population = self.log.log_func(self.training.train, population=population, iterations=iterations) 110 | 111 | self.log(f"##### END EPOCH {epoch} - runtime {time.time() - epoch_time:6.1f}", time_step=num_frames) 112 | self.log("epoch", epoch, time_step=num_frames) 113 | self.log(f"##### ################################################# #####") 114 | self.cfg.expt.set_attr("epoch", epoch) 115 | self.cfg.expt.set_attr("num_frames", num_frames) 116 | epoch += 1 117 | 118 | self.log("FINISH", time_step=num_frames) 119 | self.replay_memory.close() 120 | 121 | def close(self): 122 | self.replay_memory.close() 123 | 124 | 125 | def start_searl_dqn_run(config_dict, expt_dir): 126 | sup = Supporter(experiments_dir=expt_dir, config_dict=config_dict, count_expt=True) 127 | cfg = sup.get_config() 128 | log = sup.get_logger() 129 | 130 | env = make_atari(cfg.env.name) 131 | env = wrap_deepmind(env) 132 | env = wrap_pytorch(env) 133 | cfg.set_attr("action_dim", env.action_space.n) 134 | cfg.set_attr("state_dim", env.observation_space.shape) 135 | 136 | searl = SEARLforDQN(config=cfg, logger=log, checkpoint=sup.ckp) 137 | 138 | population = searl.initial_population() 139 | searl.evolve_population(population) 140 | -------------------------------------------------------------------------------- /searl/neuroevolution/searl_td3.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | 4 | import gym 5 | import numpy as np 6 | import torch 7 | import torch.multiprocessing as mp 8 | 9 | from searl.neuroevolution.components.individual_td3 import Individual 10 | from searl.neuroevolution.components.replay_memory import MPReplayMemory, ReplayMemory 11 | from searl.neuroevolution.evaluation_td3 import MPEvaluation 12 | from searl.neuroevolution.mutation_mlp import Mutations 13 | from searl.neuroevolution.tournament_selection import TournamentSelection 14 | from searl.neuroevolution.training_td3 import TD3Training 15 | from searl.utils.supporter import Supporter 16 | 17 | 18 | class SEARLforTD3(): 19 | 20 | def __init__(self, config, logger, checkpoint): 21 | 22 | self.cfg = config 23 | self.log = logger 24 | self.ckp = checkpoint 25 | 26 | torch.manual_seed(self.cfg.seed.torch) 27 | np.random.seed(self.cfg.seed.numpy) 28 | 29 | self.log.print_config(self.cfg) 30 | self.log.csv.fieldnames( 31 | ["epoch", "time_string", "eval_eps", "pre_fitness", "pre_rank", "post_fitness", "post_rank", "index", 32 | "parent_index", "mutation", "train_iterations", 33 | ] + list(self.cfg.rl.get_dict.keys())) 34 | 35 | self.log.log("initialize replay memory") 36 | if self.cfg.nevo.ind_memory: 37 | push_queue = None 38 | sample_queue = None 39 | else: 40 | self.replay_memory = MPReplayMemory(seed=self.cfg.seed.replay_memory, 41 | capacity=self.cfg.train.replay_memory_size, 42 | batch_size=self.cfg.rl.batch_size, 43 | reuse_batch=self.cfg.nevo.reuse_batch) 44 | push_queue = self.replay_memory.get_push_queue() 45 | sample_queue = 
self.replay_memory.get_sample_queue() 46 | 47 | self.eval = MPEvaluation(config=self.cfg, logger=self.log, push_queue=push_queue) 48 | 49 | self.tournament = TournamentSelection(config=self.cfg) 50 | 51 | self.mutation = Mutations(config=self.cfg, replay_sample_queue=sample_queue) 52 | 53 | self.training = TD3Training(config=self.cfg, replay_sample_queue=sample_queue) 54 | 55 | def initial_population(self): 56 | self.log.log("initialize population") 57 | population = [] 58 | for idx in range(self.cfg.nevo.population_size): 59 | 60 | if self.cfg.nevo.ind_memory: 61 | replay_memory = ReplayMemory(capacity=self.cfg.train.replay_memory_size, 62 | batch_size=self.cfg.rl.batch_size) 63 | else: 64 | replay_memory = False 65 | 66 | if self.cfg.nevo.init_random: 67 | 68 | min_lr = 0.00001 69 | max_lr = 0.005 70 | 71 | actor_config = copy.deepcopy(self.cfg.actor.get_dict) 72 | critic_config = copy.deepcopy(self.cfg.critic.get_dict) 73 | rl_config = copy.deepcopy(self.cfg.rl) 74 | 75 | actor_config["activation"] = np.random.choice(['relu', 'tanh', 'elu'], 1)[0] 76 | critic_config["activation"] = np.random.choice(['relu', 'tanh', 'elu'], 1)[0] 77 | 78 | lr_actor = np.exp(np.random.uniform(np.log(min_lr), np.log(max_lr), 1))[0] 79 | lr_critic = np.exp(np.random.uniform(np.log(min_lr), np.log(max_lr), 1))[0] 80 | 81 | rl_config.set_attr("lr_actor", lr_actor) 82 | rl_config.set_attr("lr_critic", lr_critic) 83 | self.log(f"init {idx} rl_config: ", rl_config.get_dict) 84 | self.log(f"init {idx} actor_config: ", actor_config) 85 | 86 | else: 87 | actor_config = copy.deepcopy(self.cfg.actor.get_dict) 88 | critic_config = copy.deepcopy(self.cfg.critic.get_dict) 89 | rl_config = copy.deepcopy(self.cfg.rl) 90 | 91 | indi = Individual(state_dim=self.cfg.state_dim, action_dim=self.cfg.action_dim, 92 | actor_config=actor_config, 93 | critic_config=critic_config, 94 | rl_config=rl_config, index=idx, td3_double_q=self.cfg.train.td3_double_q, 95 | replay_memory=replay_memory) 96 | population.append(indi) 97 | return population 98 | 99 | def evolve_population(self, population, epoch=1, num_frames=0): 100 | 101 | frames_since_mut = 0 102 | num_frames = num_frames 103 | epoch = epoch 104 | ctx = mp.get_context('spawn') 105 | 106 | while True: 107 | pool = ctx.Pool(processes=self.cfg.nevo.worker, maxtasksperchild=1000) 108 | epoch_time = time.time() 109 | self.log(f"##### START EPOCH {epoch}", time_step=num_frames) 110 | 111 | for ind in population: 112 | ind.train_log['epoch'] = epoch 113 | 114 | population_mean_fitness, population_var_fitness, eval_frames = \ 115 | self.log.log_func(self.eval.evaluate_population, population=population, 116 | exploration_noise=self.cfg.eval.exploration_noise, 117 | total_frames=num_frames, pool=pool) 118 | num_frames += eval_frames 119 | frames_since_mut += eval_frames 120 | 121 | self.log.population_info(population_mean_fitness, population_var_fitness, population, num_frames, epoch) 122 | 123 | self.ckp.save_object(population, name="population") 124 | self.log.log("save population") 125 | if not self.cfg.nevo.ind_memory: 126 | rm_dict = self.replay_memory.save() 127 | if isinstance(rm_dict, str): 128 | self.log("save replay memory failed") 129 | else: 130 | self.log("replay memory size", len(rm_dict['memory'])) 131 | self.ckp.save_object([rm_dict], name="replay_memory") 132 | self.log("save replay memory") 133 | 134 | if num_frames >= self.cfg.train.num_frames: 135 | break 136 | 137 | if self.cfg.nevo.selection: 138 | elite, population = self.log.log_func(self.tournament.select, 
population) 139 | test_fitness = self.eval.test_individual(elite, epoch) 140 | self.log(f"##### ELITE INFO {epoch}", time_step=num_frames) 141 | self.log("best_test_fitness", test_fitness, num_frames) 142 | 143 | if self.cfg.nevo.mutation: 144 | population = self.log.log_func(self.mutation.mutation, population) 145 | 146 | if self.cfg.nevo.training: 147 | population = self.log.log_func(self.training.train, population=population, eval_frames=eval_frames, 148 | pool=pool) 149 | 150 | self.log(f"##### END EPOCH {epoch} - runtime {time.time() - epoch_time:6.1f}", time_step=num_frames) 151 | self.log("epoch", epoch, time_step=num_frames) 152 | self.log(f"##### ################################################# #####") 153 | self.cfg.expt.set_attr("epoch", epoch) 154 | self.cfg.expt.set_attr("num_frames", num_frames) 155 | epoch += 1 156 | 157 | pool.terminate() 158 | pool.join() 159 | 160 | self.log("FINISH", time_step=num_frames) 161 | self.replay_memory.close() 162 | 163 | def close(self): 164 | self.replay_memory.close() 165 | 166 | 167 | def start_searl_td3_run(config, expt_dir): 168 | with Supporter(experiments_dir=expt_dir, config_dict=config, count_expt=True) as sup: 169 | cfg = sup.get_config() 170 | log = sup.get_logger() 171 | 172 | env = gym.make(cfg.env.name) 173 | cfg.set_attr("action_dim", env.action_space.shape[0]) 174 | cfg.set_attr("state_dim", env.observation_space.shape[0]) 175 | 176 | searl = SEARLforTD3(config=cfg, logger=log, checkpoint=sup.ckp) 177 | 178 | population = searl.initial_population() 179 | searl.evolve_population(population) 180 | -------------------------------------------------------------------------------- /searl/neuroevolution/tournament_selection.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | 4 | 5 | class TournamentSelection(): 6 | 7 | def __init__(self, config): 8 | self.cfg = config 9 | 10 | def _tournament(self, fitness_values): 11 | selection = np.random.randint(0, len(fitness_values), size=self.cfg.nevo.tournament_size) 12 | selection_values = [fitness_values[i] for i in selection] 13 | winner = selection[np.argmax(selection_values)] 14 | return winner 15 | 16 | def select(self, population): 17 | last_fitness = [indi.fitness[-1] for indi in population] 18 | rank = np.argsort(last_fitness).argsort() 19 | 20 | max_id = max([ind.index for ind in population]) 21 | 22 | elite = copy.deepcopy([population[np.argsort(rank)[-1]]][0]) 23 | 24 | new_population = [] 25 | if self.cfg.nevo.elitism: 26 | new_population.append(elite.clone()) 27 | selection_size = self.cfg.nevo.population_size - 1 28 | else: 29 | selection_size = self.cfg.nevo.population_size 30 | 31 | for idx in range(selection_size): 32 | max_id += 1 33 | actor_parent = population[self._tournament(rank)] 34 | new_individual = actor_parent.clone(max_id) 35 | new_individual.train_log["parent_index"] = actor_parent.index 36 | new_population.append(new_individual) 37 | 38 | return elite, new_population 39 | -------------------------------------------------------------------------------- /searl/neuroevolution/training_dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 5 | 6 | 7 | def get_optimizer(name): 8 | if name == "adam": 9 | return torch.optim.Adam 10 | elif name == "adadelta": 11 | return torch.optim.Adadelta 12 | elif name == "adamax": 13 | return 
torch.optim.Adamax 14 | elif name == "rmsprop": 15 | return torch.optim.RMSprop 16 | elif name == "sdg": 17 | return torch.optim.SGD 18 | 19 | 20 | class DQNTraining(): 21 | 22 | def __init__(self, config, replay_memory, replay_priority_queue=None): 23 | self.cfg = config 24 | self.rng = np.random.RandomState(self.cfg.seed.training) 25 | self.replay_sample_queue = replay_memory 26 | self.replay_priority_queue = replay_priority_queue 27 | self.args = config.rl 28 | 29 | @staticmethod 30 | def update_parameters(indi, replay_sample_queue, iterations): 31 | args = indi.rl_config 32 | Opti = get_optimizer(args.optimizer) 33 | 34 | actor = indi.actor 35 | actor_target = type(actor)(**actor.init_dict) 36 | actor_target.load_state_dict(actor.state_dict()) 37 | actor.to(device) 38 | actor.train() 39 | actor_target.to(device) 40 | actor_optim = Opti(actor.parameters(), lr=args.lr_actor) 41 | 42 | losses = [] 43 | for it in range(iterations): 44 | transistion_list = replay_sample_queue.get() 45 | state_list = [] 46 | action_batch = [] 47 | next_state_batch = [] 48 | reward_batch = [] 49 | done_batch = [] 50 | for transition in transistion_list: 51 | state_list.append(transition.state) 52 | action_batch.append(transition.action) 53 | next_state_batch.append(transition.next_state) 54 | reward_batch.append(transition.reward) 55 | done_batch.append(transition.done) 56 | 57 | state = torch.stack(state_list, dim=0).to(device) 58 | action = torch.stack(action_batch, dim=0).squeeze().to(device) 59 | next_state = torch.stack(next_state_batch, dim=0).to(device) 60 | rewards = torch.stack(reward_batch, dim=0).squeeze().to(device) 61 | dones = torch.stack(done_batch, dim=0).squeeze().to(device) 62 | 63 | with torch.no_grad(): 64 | batch_size = next_state.size(0) 65 | 66 | delta_z = float(args.Vmax - args.Vmin) / (args.num_atoms - 1) 67 | support = torch.linspace(args.Vmin, args.Vmax, args.num_atoms).to(device) 68 | 69 | next_dist = actor_target(next_state) * support 70 | next_action = next_dist.sum(2).max(1)[1] 71 | next_action = next_action.unsqueeze(1).unsqueeze(1).expand(next_dist.size(0), 1, next_dist.size(2)) 72 | next_dist = next_dist.gather(1, next_action).squeeze(1) 73 | 74 | rewards = rewards.unsqueeze(1).expand_as(next_dist) 75 | dones = dones.unsqueeze(1).expand_as(next_dist) 76 | support = support.unsqueeze(0).expand_as(next_dist) 77 | 78 | Tz = rewards + (1 - dones) * 0.99 * support 79 | Tz = Tz.clamp(min=args.Vmin, max=args.Vmax) 80 | b = (Tz - args.Vmin) / delta_z 81 | l = b.floor().long() 82 | u = b.ceil().long() 83 | 84 | offset = torch.linspace(0, (batch_size - 1) * args.num_atoms, batch_size).long() \ 85 | .unsqueeze(1).expand(batch_size, args.num_atoms).to(device) 86 | 87 | proj_dist = torch.zeros(next_dist.size()).to(device) 88 | proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)) 89 | proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)) 90 | 91 | dist = actor(state) 92 | action = action.unsqueeze(1).unsqueeze(1).expand(args.batch_size, 1, args.num_atoms) 93 | dist = dist.gather(1, action).squeeze(1) 94 | dist.data.clamp_(0.01, 0.99) 95 | loss = -(proj_dist * dist.log()).sum(1) 96 | loss = loss.mean() 97 | 98 | actor_optim.zero_grad() 99 | loss.backward() 100 | actor_optim.step() 101 | 102 | if it % 5 == 0: 103 | actor.reset_noise() 104 | actor_target.reset_noise() 105 | 106 | losses.append(loss.detach().cpu().numpy()) 107 | 108 | if it % 2 == 0 and it != 0: 109 | for param, target_param in 
zip(actor.parameters(), actor_target.parameters()): 110 | target_param.data.copy_(args.tau * param.data + (1 - args.tau) * target_param.data) 111 | 112 | indi.actor = actor.cpu().clone() 113 | indi.train_log['train_iterations'] = iterations 114 | indi.train_log['train_losses'] = np.mean(losses).tolist() 115 | indi.train_log.update(args.get_dict) 116 | 117 | return indi 118 | 119 | def train(self, population, iterations, pool=None): 120 | 121 | pop_id_lookup = [ind.index for ind in population] 122 | 123 | if self.cfg.nevo.ind_memory: 124 | args_list = [(indi, indi.replay_memory, iterations) for indi in population] 125 | else: 126 | args_list = [(indi, self.replay_sample_queue, iterations) for indi in population] 127 | 128 | trained_pop = [] 129 | for args in args_list: 130 | trained_pop.append(self.update_parameters(*args)) 131 | 132 | trained_pop = sorted(trained_pop, key=lambda i: pop_id_lookup.index(i.index)) 133 | 134 | return trained_pop 135 | -------------------------------------------------------------------------------- /searl/neuroevolution/training_td3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 6 | 7 | 8 | def get_optimizer(name): 9 | if name == "adam": 10 | return torch.optim.Adam 11 | elif name == "adadelta": 12 | return torch.optim.Adadelta 13 | elif name == "adamax": 14 | return torch.optim.Adamax 15 | elif name == "rmsprop": 16 | return torch.optim.RMSprop 17 | elif name == "sdg": 18 | return torch.optim.SGD 19 | 20 | 21 | class TD3Training(): 22 | 23 | def __init__(self, config, replay_sample_queue): 24 | self.cfg = config 25 | self.rng = np.random.RandomState(self.cfg.seed.training) 26 | self.replay_sample_queue = replay_sample_queue 27 | 28 | self.args = config.rl 29 | 30 | @staticmethod 31 | def update_parameters(indi, replay_sample_queue, iterations): 32 | args = indi.rl_config 33 | gamma = args.gamma 34 | tau = args.tau 35 | Opti = get_optimizer(args.optimizer) 36 | 37 | actor = indi.actor 38 | actor_target = type(actor)(**actor.init_dict) 39 | actor_target.load_state_dict(actor.state_dict()) 40 | actor.to(device) 41 | actor.train() 42 | actor_target.to(device) 43 | actor_optim = Opti(actor.parameters(), lr=args.lr_actor) 44 | 45 | critic_1 = indi.critic_1 46 | critic_1_target = type(critic_1)(**critic_1.init_dict) 47 | critic_1_target.load_state_dict(critic_1.state_dict()) 48 | critic_1.to(device) 49 | critic_1.train() 50 | critic_1_target.to(device) 51 | critic_1_optim = Opti(critic_1.parameters(), lr=args.lr_critic) 52 | 53 | critic_2 = indi.critic_2 54 | critic_2_target = type(critic_2)(**critic_2.init_dict) 55 | critic_2_target.load_state_dict(critic_2.state_dict()) 56 | critic_2.to(device) 57 | critic_2.train() 58 | critic_2_target.to(device) 59 | critic_2_optim = Opti(critic_2.parameters(), lr=args.lr_critic) 60 | 61 | for it in range(iterations): 62 | 63 | transistion_list = replay_sample_queue.get() 64 | 65 | state_list = [] 66 | action_batch = [] 67 | next_state_batch = [] 68 | reward_batch = [] 69 | done_batch = [] 70 | for transition in transistion_list: 71 | state_list.append(torch.Tensor(transition.state)) 72 | action_batch.append(torch.Tensor(transition.action)) 73 | next_state_batch.append(torch.Tensor(transition.next_state)) 74 | reward_batch.append(torch.Tensor(transition.reward)) 75 | done_batch.append(torch.Tensor(transition.done)) 76 | 77 | state_batch = 
torch.stack(state_list, dim=0) 78 | action_batch = torch.stack(action_batch, dim=0) 79 | next_state_batch = torch.stack(next_state_batch, dim=0) 80 | reward_batch = torch.stack(reward_batch, dim=0) 81 | done_batch = torch.stack(done_batch, dim=0) 82 | 83 | state = state_batch.to(device) 84 | action = action_batch.to(device) 85 | reward = reward_batch.to(device) 86 | done = 1 - done_batch.to(device) 87 | next_state = next_state_batch.to(device) 88 | 89 | with torch.no_grad(): 90 | noise = (torch.randn_like(action) * args.td3_policy_noise).clamp(-args.td3_noise_clip, 91 | args.td3_noise_clip) 92 | next_action = (actor_target(next_state) + noise).clamp(-1, 1) 93 | target_Q1 = critic_1_target(torch.cat([next_state, next_action], 1)) 94 | target_Q2 = critic_2_target(torch.cat([next_state, next_action], 1)) 95 | target_Q = torch.min(target_Q1, target_Q2) 96 | target_Q = reward + (done * gamma * target_Q) 97 | 98 | current_Q1 = critic_1(torch.cat([state, action], 1)) 99 | current_Q2 = critic_2(torch.cat([state, action], 1)) 100 | 101 | critic_loss_1 = F.mse_loss(current_Q1, target_Q) 102 | critic_1_optim.zero_grad() 103 | critic_loss_1.backward() 104 | for p in critic_1.parameters(): 105 | p.grad.data.clamp_(max=args.clip_grad_norm) 106 | critic_1_optim.step() 107 | 108 | critic_loss_2 = F.mse_loss(current_Q2, target_Q) 109 | critic_2_optim.zero_grad() 110 | critic_loss_2.backward() 111 | for p in critic_2.parameters(): 112 | p.grad.data.clamp_(max=args.clip_grad_norm) 113 | critic_2_optim.step() 114 | 115 | if it % args.td3_update_freq == 0: 116 | actor_loss = -critic_1(torch.cat([state, actor(state)], 1)) 117 | actor_loss = torch.mean(actor_loss) 118 | 119 | actor_optim.zero_grad() 120 | actor_loss.backward() 121 | for p in actor.parameters(): 122 | p.grad.data.clamp_(max=args.clip_grad_norm) 123 | actor_optim.step() 124 | 125 | for param, target_param in zip(actor.parameters(), actor_target.parameters()): 126 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) 127 | 128 | for param, target_param in zip(critic_1.parameters(), critic_1_target.parameters()): 129 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) 130 | 131 | for param, target_param in zip(critic_2.parameters(), critic_2_target.parameters()): 132 | target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data) 133 | 134 | actor_optim.zero_grad() 135 | critic_1_optim.zero_grad() 136 | if indi.td3_double_q: 137 | critic_2_optim.zero_grad() 138 | 139 | indi.actor = actor.cpu().clone() 140 | indi.critic_1 = critic_1.cpu().clone() 141 | if indi.td3_double_q: 142 | indi.critic_2 = critic_2.cpu().clone() 143 | indi.train_log['train_iterations'] = iterations 144 | indi.train_log.update(args.get_dict) 145 | 146 | return indi 147 | 148 | def train(self, population, eval_frames, pool=None): 149 | pop_id_lookup = [ind.index for ind in population] 150 | iterations = max(self.cfg.train.min_train_steps, int(self.cfg.rl.train_frames_fraction * eval_frames)) 151 | 152 | if self.cfg.nevo.ind_memory: 153 | args_list = [(indi, indi.replay_memory, iterations) for indi in population] 154 | else: 155 | args_list = [(indi, self.replay_sample_queue, iterations) for indi in population] 156 | 157 | result_dicts = [pool.apply_async(self.update_parameters, args) for args in args_list] 158 | trained_pop = [res.get() for res in result_dicts] 159 | trained_pop = sorted(trained_pop, key=lambda i: pop_id_lookup.index(i.index)) 160 | 161 | return trained_pop 162 | 
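For orientation, the following is a minimal usage sketch (not part of the repository) of the replay interface that TD3Training.update_parameters consumes: any object whose get() returns a list of batch_size Transition tuples will do, which is what both the per-individual ReplayMemory and the MPReplayMemory sample queue provide. The toy dimensions, capacity, and batch size below are assumptions for illustration only.

# usage sketch (not repository code)
import numpy as np

from searl.neuroevolution.components.replay_memory import ReplayMemory
from searl.neuroevolution.components.utils import Transition

state_dim, action_dim = 3, 1                        # toy sizes (assumption)
memory = ReplayMemory(capacity=1000, batch_size=8)  # per-individual memory, as in initial_population()

for _ in range(64):                                 # fill with dummy transitions
    s = np.random.randn(state_dim).astype(np.float32)
    a = np.random.uniform(-1, 1, action_dim).astype(np.float32)
    memory.add(Transition(s, a, s, np.array([0.0], dtype=np.float32),
                          np.array([0], dtype='uint8')))

batch = memory.get()                                # list of batch_size Transition tuples
states = np.stack([t.state for t in batch])         # same stacking update_parameters performs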
-------------------------------------------------------------------------------- /searl/rl_algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/rl_algorithms/__init__.py -------------------------------------------------------------------------------- /searl/rl_algorithms/components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/rl_algorithms/components/__init__.py -------------------------------------------------------------------------------- /searl/rl_algorithms/components/replay_memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | def __init__(self, max_size=1e6): 5 | self.storage = [] 6 | self.max_size = max_size 7 | self.ptr = 0 8 | 9 | def add(self, transistion): 10 | if len(self.storage) == self.max_size: 11 | self.storage[int(self.ptr)] = transistion 12 | self.ptr = (self.ptr + 1) % self.max_size 13 | else: 14 | self.storage.append(transistion) 15 | 16 | def sample(self, batch_size): 17 | ind = np.random.randint(0, len(self.storage), size=batch_size) 18 | 19 | transition_list = [] 20 | for i in ind: 21 | transition_list.append(self.storage[i]) 22 | 23 | return transition_list 24 | -------------------------------------------------------------------------------- /searl/rl_algorithms/components/wrappers.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | 3 | import cv2 4 | import gym 5 | import numpy as np 6 | from gym import spaces 7 | 8 | cv2.ocl.setUseOpenCL(False) 9 | 10 | 11 | class NoopResetEnv(gym.Wrapper): 12 | def __init__(self, env, noop_max=30): 13 | """Sample initial states by taking random number of no-ops on reset. 14 | No-op is assumed to be action 0. 
15 | """ 16 | gym.Wrapper.__init__(self, env) 17 | self.noop_max = noop_max 18 | self.override_num_noops = None 19 | self.noop_action = 0 20 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 21 | 22 | def reset(self, **kwargs): 23 | """ Do no-op action for a number of steps in [1, noop_max].""" 24 | self.env.reset(**kwargs) 25 | if self.override_num_noops is not None: 26 | noops = self.override_num_noops 27 | else: 28 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) # pylint: disable=E1101 29 | assert noops > 0 30 | obs = None 31 | for _ in range(noops): 32 | obs, _, done, _ = self.env.step(self.noop_action) 33 | if done: 34 | obs = self.env.reset(**kwargs) 35 | return obs 36 | 37 | def step(self, ac): 38 | return self.env.step(ac) 39 | 40 | 41 | class FireResetEnv(gym.Wrapper): 42 | def __init__(self, env): 43 | """Take action on reset for environments that are fixed until firing.""" 44 | gym.Wrapper.__init__(self, env) 45 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 46 | assert len(env.unwrapped.get_action_meanings()) >= 3 47 | 48 | def reset(self, **kwargs): 49 | self.env.reset(**kwargs) 50 | obs, _, done, _ = self.env.step(1) 51 | if done: 52 | self.env.reset(**kwargs) 53 | obs, _, done, _ = self.env.step(2) 54 | if done: 55 | self.env.reset(**kwargs) 56 | return obs 57 | 58 | def step(self, ac): 59 | return self.env.step(ac) 60 | 61 | 62 | class EpisodicLifeEnv(gym.Wrapper): 63 | def __init__(self, env): 64 | """Make end-of-life == end-of-episode, but only reset on true game over. 65 | Done by DeepMind for the DQN and co. since it helps value estimation. 66 | """ 67 | gym.Wrapper.__init__(self, env) 68 | self.lives = 0 69 | self.was_real_done = True 70 | 71 | def step(self, action): 72 | obs, reward, done, info = self.env.step(action) 73 | self.was_real_done = done 74 | # check current lives, make loss of life terminal, 75 | # then update lives to handle bonus lives 76 | lives = self.env.unwrapped.ale.lives() 77 | if lives < self.lives and lives > 0: 78 | # for Qbert sometimes we stay in lives == 0 condtion for a few frames 79 | # so its important to keep lives > 0, so that we only reset once 80 | # the environment advertises done. 81 | done = True 82 | self.lives = lives 83 | return obs, reward, done, info 84 | 85 | def reset(self, **kwargs): 86 | """Reset only when lives are exhausted. 87 | This way all states are still reachable even though lives are episodic, 88 | and the learner need not know about any of this behind-the-scenes. 
89 | """ 90 | if self.was_real_done: 91 | obs = self.env.reset(**kwargs) 92 | else: 93 | # no-op step to advance from terminal/lost life state 94 | obs, _, _, _ = self.env.step(0) 95 | self.lives = self.env.unwrapped.ale.lives() 96 | return obs 97 | 98 | 99 | class MaxAndSkipEnv(gym.Wrapper): 100 | def __init__(self, env, skip=4): 101 | """Return only every `skip`-th frame""" 102 | gym.Wrapper.__init__(self, env) 103 | # most recent raw observations (for max pooling across time steps) 104 | self._obs_buffer = np.zeros((2,) + env.observation_space.shape, dtype=np.uint8) 105 | self._skip = skip 106 | 107 | def reset(self): 108 | return self.env.reset() 109 | 110 | def step(self, action): 111 | """Repeat action, sum reward, and max over last observations.""" 112 | total_reward = 0.0 113 | done = None 114 | for i in range(self._skip): 115 | obs, reward, done, info = self.env.step(action) 116 | if i == self._skip - 2: self._obs_buffer[0] = obs 117 | if i == self._skip - 1: self._obs_buffer[1] = obs 118 | total_reward += reward 119 | if done: 120 | break 121 | # Note that the observation on the done=True frame 122 | # doesn't matter 123 | max_frame = self._obs_buffer.max(axis=0) 124 | 125 | return max_frame, total_reward, done, info 126 | 127 | def reset(self, **kwargs): 128 | return self.env.reset(**kwargs) 129 | 130 | 131 | class ClipRewardEnv(gym.RewardWrapper): 132 | def __init__(self, env): 133 | gym.RewardWrapper.__init__(self, env) 134 | 135 | def reward(self, reward): 136 | """Bin reward to {+1, 0, -1} by its sign.""" 137 | return np.sign(reward) 138 | 139 | 140 | class WarpFrame(gym.ObservationWrapper): 141 | def __init__(self, env): 142 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 143 | gym.ObservationWrapper.__init__(self, env) 144 | self.width = 84 145 | self.height = 84 146 | self.observation_space = spaces.Box(low=0, high=255, 147 | shape=(self.height, self.width, 1), dtype=np.uint8) 148 | 149 | def observation(self, frame): 150 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 151 | frame = cv2.resize(frame, (self.width, self.height), interpolation=cv2.INTER_AREA) 152 | return frame[:, :, None] 153 | 154 | 155 | class FrameStack(gym.Wrapper): 156 | def __init__(self, env, k): 157 | """Stack k last frames. 158 | Returns lazy array, which is much more memory efficient. 159 | See Also 160 | -------- 161 | baselines.common.atari_wrappers.LazyFrames 162 | """ 163 | gym.Wrapper.__init__(self, env) 164 | self.k = k 165 | self.frames = deque([], maxlen=k) 166 | shp = env.observation_space.shape 167 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), dtype=np.uint8) 168 | 169 | def reset(self): 170 | ob = self.env.reset() 171 | for _ in range(self.k): 172 | self.frames.append(ob) 173 | return self._get_ob() 174 | 175 | def step(self, action): 176 | ob, reward, done, info = self.env.step(action) 177 | self.frames.append(ob) 178 | return self._get_ob(), reward, done, info 179 | 180 | def _get_ob(self): 181 | assert len(self.frames) == self.k 182 | return LazyFrames(list(self.frames)) 183 | 184 | 185 | class ScaledFloatFrame(gym.ObservationWrapper): 186 | def __init__(self, env): 187 | gym.ObservationWrapper.__init__(self, env) 188 | 189 | def observation(self, observation): 190 | # careful! This undoes the memory optimization, use 191 | # with smaller replay buffers only. 
192 | return np.array(observation).astype(np.float32) / 255.0 193 | 194 | 195 | class LazyFrames(object): 196 | def __init__(self, frames): 197 | """This object ensures that common frames between the observations are only stored once. 198 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 199 | buffers. 200 | This object should only be converted to numpy array before being passed to the model. 201 | You'd not believe how complex the previous solution was.""" 202 | self._frames = frames 203 | self._out = None 204 | 205 | def _force(self): 206 | if self._out is None: 207 | self._out = np.concatenate(self._frames, axis=2) 208 | self._frames = None 209 | return self._out 210 | 211 | def __array__(self, dtype=None): 212 | out = self._force() 213 | if dtype is not None: 214 | out = out.astype(dtype) 215 | return out 216 | 217 | def __len__(self): 218 | return len(self._force()) 219 | 220 | def __getitem__(self, i): 221 | return self._force()[i] 222 | 223 | 224 | def make_atari(env_id): 225 | env = gym.make(env_id) 226 | assert 'NoFrameskip' in env.spec.id 227 | env = NoopResetEnv(env, noop_max=30) 228 | env = MaxAndSkipEnv(env, skip=4) 229 | return env 230 | 231 | 232 | def wrap_deepmind(env, episode_life=True, clip_rewards=True, frame_stack=False, scale=False): 233 | """Configure environment for DeepMind-style Atari. 234 | """ 235 | if episode_life: 236 | env = EpisodicLifeEnv(env) 237 | if 'FIRE' in env.unwrapped.get_action_meanings(): 238 | env = FireResetEnv(env) 239 | env = WarpFrame(env) 240 | if scale: 241 | env = ScaledFloatFrame(env) 242 | if clip_rewards: 243 | env = ClipRewardEnv(env) 244 | if frame_stack: 245 | env = FrameStack(env, 4) 246 | return env 247 | 248 | 249 | class ImageToPyTorch(gym.ObservationWrapper): 250 | """ 251 | Image shape to num_channels x weight x height 252 | """ 253 | 254 | def __init__(self, env): 255 | super(ImageToPyTorch, self).__init__(env) 256 | old_shape = self.observation_space.shape 257 | self.observation_space = gym.spaces.Box(low=0.0, high=1.0, shape=(old_shape[-1], old_shape[0], old_shape[1]), dtype=np.uint8) 258 | 259 | def observation(self, observation): 260 | return np.swapaxes(observation, 2, 0) 261 | 262 | 263 | def wrap_pytorch(env): 264 | return ImageToPyTorch(env) 265 | -------------------------------------------------------------------------------- /searl/rl_algorithms/dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from searl.neuroevolution.training_td3 import get_optimizer 5 | from searl.neuroevolution.components.utils import Transition 6 | from searl.neuroevolution.components.envolvable_cnn import EvolvableCnnDQN 7 | from searl.rl_algorithms.components.wrappers import make_atari, wrap_deepmind, wrap_pytorch 8 | from searl.rl_algorithms.components.replay_memory import ReplayBuffer 9 | from searl.utils.supporter import Supporter 10 | 11 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 12 | print("CUDA", device == torch.device("cuda"), device) 13 | 14 | 15 | class DQN(object): 16 | 17 | def __init__(self, config, logger, checkpoint): 18 | self.cfg = config 19 | self.log = logger 20 | self.ckp = checkpoint 21 | 22 | env = make_atari(self.cfg.env.name) 23 | env = wrap_deepmind(env) 24 | env = wrap_pytorch(env) 25 | self.env = env 26 | 27 | self.cfg.set_attr("action_dim", self.env.action_space.n) 28 | self.cfg.set_attr("state_dim", self.env.observation_space.shape) 29 | 30 | # Set seeds 31 | 
self.env.seed(seed=self.cfg.seed.env) 32 | torch.manual_seed(self.cfg.seed.torch) 33 | np.random.seed(self.cfg.seed.numpy) 34 | 35 | self.Vmin = self.cfg.actor.Vmin 36 | self.Vmax = self.cfg.actor.Vmax 37 | self.num_atoms = self.cfg.actor.num_atoms 38 | self.batch_size = self.cfg.dqn.batch_size 39 | 40 | self.tau = 0.005 41 | 42 | self.actor = EvolvableCnnDQN(input_shape=self.cfg.state_dim, num_actions=self.cfg.action_dim, device=device, 43 | **self.cfg.actor.get_dict).to(device) 44 | 45 | Opti = get_optimizer(self.cfg.dqn.optimizer) 46 | self.actor_optim = Opti(self.actor.parameters(), lr=self.cfg.dqn.lr_actor) 47 | 48 | self.actor_target = type(self.actor)(**self.actor.init_dict).to(device) 49 | self.actor_target.load_state_dict(self.actor.state_dict()) 50 | 51 | self.replay_memory = ReplayBuffer(self.cfg.dqn.rm_capacity) 52 | 53 | self.log.print_config(self.cfg) 54 | 55 | def evaluate_policy(self, eval_episodes): 56 | episode_reward_list = [] 57 | for _ in range(eval_episodes): 58 | state = self.env.reset() 59 | done = False 60 | episode_reward = 0 61 | while not done: 62 | action = self.actor.act(state) 63 | next_state, reward, done, info = self.env.step(action) # Simulate one step in environment 64 | state = next_state 65 | episode_reward += reward 66 | 67 | episode_reward_list.append(episode_reward) 68 | 69 | avg_reward = np.mean(episode_reward_list) 70 | 71 | self.log("---------------------------------------") 72 | self.log("Evaluation over %d episodes: %f" % (eval_episodes, avg_reward)) 73 | self.log("---------------------------------------") 74 | 75 | return avg_reward 76 | 77 | def perform_learning(self): 78 | 79 | self.log("START LEARNING") 80 | 81 | total_timesteps = 0 82 | timesteps_since_eval = 0 83 | episode_reward = 0 84 | episode_timesteps = 0 85 | reset_timesteps = 0 86 | episode_num = 0 87 | done = True 88 | 89 | while total_timesteps < self.cfg.dqn.num_frames: 90 | 91 | if done: 92 | if total_timesteps != 0 and self.replay_memory.storage.__len__() > self.cfg.dqn.replay_initial: 93 | if ( 94 | self.cfg.dqn.reset_target or self.cfg.dqn.recreate_optim) and reset_timesteps >= self.cfg.dqn.min_eval_steps: 95 | self.train(episode_timesteps, reinit_optim=self.cfg.dqn.recreate_optim, 96 | reinit_target=self.cfg.dqn.reset_target) 97 | reset_timesteps = 0 98 | else: 99 | self.train(episode_timesteps) 100 | 101 | # Evaluate episode 102 | if timesteps_since_eval >= self.cfg.dqn.eval_freq: 103 | timesteps_since_eval = 0 104 | test_mean_reward = self.evaluate_policy(eval_episodes=self.cfg.dqn.eval_episodes) 105 | self.log("test_mean_reward", test_mean_reward, time_step=total_timesteps) 106 | self.log("test_episode_num", episode_num, time_step=total_timesteps) 107 | 108 | if self.cfg.support.save_models: 109 | self.ckp.save_object(self.actor.state_dict(), name="actor_state_dict") 110 | 111 | # Reset environment 112 | state = self.env.reset() 113 | episode_reward = 0 114 | episode_timesteps = 0 115 | episode_num += 1 116 | 117 | # Select action randomly or according to policy 118 | if total_timesteps < self.cfg.dqn.start_timesteps: 119 | action = self.env.action_space.sample() 120 | else: 121 | action = self.actor.act(state) 122 | 123 | next_state, reward, done, info = self.env.step(action) # Simulate one step in environment 124 | 125 | transition = Transition(torch.FloatTensor(state), torch.LongTensor([action]), 126 | torch.FloatTensor(next_state), torch.FloatTensor(np.array([reward])), 127 | torch.FloatTensor(np.array([done]).astype('uint8')) 128 | ) 129 | 
self.replay_memory.add(transition) 130 | 131 | state = next_state 132 | 133 | episode_reward += reward 134 | episode_timesteps += 1 135 | reset_timesteps += 1 136 | total_timesteps += 1 137 | timesteps_since_eval += 1 138 | 139 | # Final evaluation 140 | self.log("training end", time_step=total_timesteps) 141 | test_mean_reward = self.evaluate_policy(eval_episodes=self.cfg.dqn.eval_episodes) 142 | self.log("test_mean_reward", test_mean_reward, time_step=total_timesteps) 143 | if self.cfg.support.save_models: 144 | self.ckp.save_state_dict(self.actor.state_dict(), number=1) 145 | self.ckp.save_object(self.replay_memory.storage, name="replay_memory") 146 | 147 | def projection_distribution(self, next_state, rewards, dones): 148 | batch_size = next_state.size(0) 149 | 150 | delta_z = float(self.Vmax - self.Vmin) / (self.num_atoms - 1) 151 | support = torch.linspace(self.Vmin, self.Vmax, self.num_atoms).to(device) 152 | 153 | next_dist = self.actor_target(next_state) * support 154 | next_action = next_dist.sum(2).max(1)[1] 155 | next_action = next_action.unsqueeze(1).unsqueeze(1).expand(next_dist.size(0), 1, next_dist.size(2)) 156 | next_dist = next_dist.gather(1, next_action).squeeze(1) 157 | 158 | rewards = rewards.unsqueeze(1).expand_as(next_dist) 159 | dones = dones.unsqueeze(1).expand_as(next_dist) 160 | support = support.unsqueeze(0).expand_as(next_dist) 161 | 162 | Tz = rewards + (1 - dones) * 0.99 * support 163 | Tz = Tz.clamp(min=self.Vmin, max=self.Vmax) 164 | b = (Tz - self.Vmin) / delta_z 165 | l = b.floor().long() 166 | u = b.ceil().long() 167 | 168 | offset = torch.linspace(0, (batch_size - 1) * self.num_atoms, batch_size).long() \ 169 | .unsqueeze(1).expand(batch_size, self.num_atoms).to(device) 170 | 171 | proj_dist = torch.zeros(next_dist.size()).to(device) 172 | proj_dist.view(-1).index_add_(0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)) 173 | proj_dist.view(-1).index_add_(0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)) 174 | 175 | return proj_dist 176 | 177 | def train(self, iterations, reinit_target=False, reinit_optim=False): 178 | 179 | iterations = min(iterations, 10000) 180 | 181 | if reinit_target: 182 | self.actor_target.load_state_dict(self.actor.state_dict()) 183 | 184 | if reinit_optim: 185 | self.actor_optim = torch.optim.Adam(self.actor.parameters()) 186 | 187 | losses = [] 188 | 189 | for it in range(iterations): 190 | 191 | transition_list = self.replay_memory.sample(self.cfg.dqn.batch_size) 192 | 193 | state_list = [] 194 | action_batch = [] 195 | next_state_batch = [] 196 | reward_batch = [] 197 | done_batch = [] 198 | indexes = [] 199 | for transition in transition_list: 200 | state_list.append(transition.state) 201 | action_batch.append(transition.action) 202 | next_state_batch.append(transition.next_state) 203 | reward_batch.append(transition.reward) 204 | done_batch.append(transition.done) 205 | indexes.append(transition.index) 206 | 207 | state = torch.stack(state_list, dim=0).to(device) 208 | action = torch.stack(action_batch, dim=0).squeeze().to(device) 209 | next_state = torch.stack(next_state_batch, dim=0).to(device) 210 | reward = torch.stack(reward_batch, dim=0).squeeze().to(device) 211 | done = torch.stack(done_batch, dim=0).squeeze().to(device) 212 | 213 | with torch.no_grad(): 214 | proj_dist = self.projection_distribution(next_state, reward, done) 215 | 216 | dist = self.actor(state) 217 | action = action.unsqueeze(1).unsqueeze(1).expand(self.batch_size, 1, self.num_atoms) 218 | dist = dist.gather(1,
action).squeeze(1) 219 | dist.data.clamp_(0.01, 0.99) 220 | loss = -(proj_dist * dist.log()).sum(1) 221 | loss = loss.mean() 222 | 223 | self.actor_optim.zero_grad() 224 | loss.backward() 225 | self.actor_optim.step() 226 | 227 | if it % 5 == 0: 228 | self.actor.reset_noise() 229 | self.actor_target.reset_noise() 230 | 231 | losses.append(loss.detach().cpu().numpy()) 232 | 233 | if self.cfg.dqn.soft_update: 234 | if it % 2 == 0 and it != 0: 235 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 236 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) 237 | else: 238 | if (it % 1000 == 0 and it != 0) or it == (iterations - 1): 239 | self.actor_target.load_state_dict(self.actor.state_dict()) 240 | return np.mean(losses).tolist() 241 | 242 | 243 | def start_DQN_training(config, expt_dir): 244 | with Supporter(experiments_dir=expt_dir, config_dict=config, count_expt=True) as sup: 245 | cfg = sup.get_config() 246 | log = sup.get_logger() 247 | 248 | dqn = DQN(config=cfg, logger=log, checkpoint=sup.ckp) 249 | dqn.perform_learning() 250 | -------------------------------------------------------------------------------- /searl/rl_algorithms/td3.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import gym 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | from searl.neuroevolution.components.envolvable_mlp import EvolvableMLP 9 | from searl.neuroevolution.training_td3 import get_optimizer 10 | from searl.neuroevolution.components.utils import to_tensor, Transition 11 | from searl.rl_algorithms.components.replay_memory import ReplayBuffer 12 | from searl.utils.supporter import Supporter 13 | 14 | 15 | class TD3(object): 16 | 17 | def __init__(self, config, logger, checkpoint): 18 | 19 | self.cfg = config 20 | self.log = logger 21 | self.ckp = checkpoint 22 | 23 | self.lr_rate = 0.001 24 | 25 | self.env = gym.make(self.cfg.env.name) 26 | self.cfg.set_attr("action_dim", self.env.action_space.shape[0]) 27 | self.cfg.set_attr("state_dim", self.env.observation_space.shape[0]) 28 | 29 | # Set seeds 30 | self.env.seed(seed=self.cfg.seed.env) 31 | torch.manual_seed(self.cfg.seed.torch) 32 | np.random.seed(self.cfg.seed.numpy) 33 | 34 | self.actor = EvolvableMLP(num_inputs=self.cfg.state_dim, num_outputs=self.cfg.action_dim, 35 | **self.cfg.actor.get_dict) 36 | 37 | self.actor_target = type(self.actor)(**self.actor.init_dict) 38 | self.actor_target.load_state_dict(self.actor.state_dict()) 39 | 40 | critic_1_config = copy.deepcopy(self.cfg.critic.get_dict) 41 | self.critic_1 = EvolvableMLP(num_inputs=self.cfg.state_dim + self.cfg.action_dim, num_outputs=1, 42 | **critic_1_config) 43 | self.critic_1_target = type(self.critic_1)(**self.critic_1.init_dict) 44 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 45 | if self.cfg.td3.double_q: 46 | critic_2_config = copy.deepcopy(self.cfg.critic.get_dict) 47 | self.critic_2 = EvolvableMLP(num_inputs=self.cfg.state_dim + self.cfg.action_dim, num_outputs=1, 48 | **critic_2_config) 49 | self.critic_2_target = type(self.critic_2)(**self.critic_2.init_dict) 50 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 51 | 52 | Opti = get_optimizer(self.cfg.td3.optimizer) 53 | self.actor_optim = Opti(self.actor.parameters(), lr=self.cfg.td3.lr_actor) 54 | self.critic_1_optim = Opti(self.critic_1.parameters(), lr=self.cfg.td3.lr_critic) 55 | if self.cfg.td3.double_q: 56 | 
self.critic_2_optim = Opti(self.critic_2.parameters(), lr=self.cfg.td3.lr_critic) 57 | 58 | self.replay_memory = ReplayBuffer(self.cfg.td3.rm_capacity) 59 | 60 | self.log.print_config(self.cfg) 61 | 62 | def evaluate_policy(self, eval_episodes): 63 | episode_reward_list = [] 64 | for _ in range(eval_episodes): 65 | 66 | state = self.env.reset() 67 | t_state = to_tensor(state).unsqueeze(0) 68 | done = False 69 | episode_reward = 0 70 | while not done: 71 | # Reset environment 72 | action = self.actor(t_state) 73 | action.clamp(-1, 1) # only for MuJoCo 74 | action = action.data.numpy() 75 | action = action.flatten() 76 | 77 | step_action = (action + 1) / 2 # [-1, 1] => [0, 1] 78 | step_action *= (self.env.action_space.high - self.env.action_space.low) 79 | step_action += self.env.action_space.low 80 | 81 | next_state, reward, done, info = self.env.step(step_action) # Simulate one step in environment 82 | t_state = to_tensor(next_state).unsqueeze(0) 83 | episode_reward += reward 84 | 85 | episode_reward_list.append(episode_reward) 86 | 87 | avg_reward = np.mean(episode_reward_list) 88 | 89 | self.log("---------------------------------------") 90 | self.log("Evaluation over %d episodes: %f" % (eval_episodes, avg_reward)) 91 | self.log("---------------------------------------") 92 | 93 | return avg_reward 94 | 95 | def select_action(self, state): 96 | state = torch.FloatTensor(state.reshape(1, -1)) 97 | return self.actor(state).cpu().data.numpy().flatten() 98 | 99 | def perform_learning(self): 100 | 101 | self.log("START LEARNING") 102 | 103 | total_timesteps = 0 104 | timesteps_since_eval = 0 105 | episode_num = 0 106 | done = True 107 | 108 | while total_timesteps < self.cfg.td3.max_timesteps: 109 | 110 | if done: 111 | 112 | if total_timesteps != 0: 113 | self.log("Start Training: Total timesteps: %d Episode Num: %d Episode T: %d Reward: %f" % ( 114 | total_timesteps, episode_num, episode_timesteps, episode_reward), time_step=total_timesteps) 115 | self.log("episode_reward", episode_reward, time_step=total_timesteps) 116 | self.train(episode_timesteps, reinit_optim=self.cfg.td3.recreate_optim, 117 | reinit_target=self.cfg.td3.reset_target, lr_rate=self.lr_rate) 118 | # Evaluate episode 119 | if timesteps_since_eval >= self.cfg.td3.eval_freq: 120 | timesteps_since_eval %= self.cfg.td3.eval_freq 121 | test_mean_reward = self.evaluate_policy(eval_episodes=self.cfg.td3.eval_episodes) 122 | self.log("test_mean_reward", test_mean_reward, time_step=total_timesteps) 123 | 124 | if self.cfg.support.save_models: 125 | self.ckp.save_object(self.actor.state_dict(), name="actor_state_dict") 126 | self.ckp.save_object(self.critic_1.state_dict(), name="critic_1_state_dict") 127 | 128 | # Reset environment 129 | state = self.env.reset() 130 | t_state = to_tensor(state).unsqueeze(0) 131 | done = False 132 | episode_reward = 0 133 | episode_timesteps = 0 134 | episode_num += 1 135 | 136 | # Select action randomly or according to policy 137 | if total_timesteps < self.cfg.td3.start_timesteps: 138 | action = self.env.action_space.sample() 139 | action = to_tensor(action) 140 | else: 141 | action = self.actor(t_state) 142 | action.clamp(-1, 1) # only for MuJoCo 143 | action = action.data.numpy() 144 | if self.cfg.td3.exploration_noise is not False: 145 | action += self.cfg.td3.exploration_noise * np.random.randn(self.cfg.action_dim) 146 | action = np.clip(action, -1, 1) 147 | action = action.flatten() 148 | 149 | step_action = (action + 1) / 2 # [-1, 1] => [0, 1] 150 | step_action *= (self.env.action_space.high 
- self.env.action_space.low) 151 | step_action += self.env.action_space.low 152 | 153 | next_state, reward, done, info = self.env.step(step_action) # Simulate one step in environment 154 | 155 | done_bool = 0 if episode_timesteps + 1 == self.env._max_episode_steps else float(done) 156 | 157 | t_next_state = to_tensor(next_state).unsqueeze(0) 158 | 159 | transition = Transition(state, action, next_state, np.array([reward]), 160 | np.array([done_bool]).astype('uint8')) 161 | self.replay_memory.add(transition) 162 | 163 | t_state = t_next_state 164 | state = next_state 165 | 166 | episode_reward += reward 167 | episode_timesteps += 1 168 | total_timesteps += 1 169 | timesteps_since_eval += 1 170 | 171 | # Final evaluation 172 | self.log("training end", time_step=total_timesteps) 173 | test_mean_reward = self.evaluate_policy(eval_episodes=self.cfg.td3.eval_episodes) 174 | self.log("test_mean_reward", test_mean_reward, time_step=total_timesteps) 175 | if self.cfg.support.save_models: 176 | self.ckp.save_state_dict(self.actor.state_dict(), number=1) 177 | self.ckp.save_state_dict(self.critic_1.state_dict(), number=2) 178 | if self.cfg.td3.double_q: 179 | self.ckp.save_state_dict(self.critic_2.state_dict(), number=3) 180 | self.ckp.save_object(self.replay_memory.storage, name="er_memory") 181 | 182 | def train(self, iterations, reinit_target=False, reinit_optim=False, lr_rate=0.001): 183 | 184 | if reinit_target: 185 | self.actor_target = type(self.actor)(**self.actor.init_dict) 186 | self.actor_target.load_state_dict(self.actor.state_dict()) 187 | 188 | self.critic_1_target = type(self.critic_1)(**self.critic_1.init_dict) 189 | self.critic_1_target.load_state_dict(self.critic_1.state_dict()) 190 | 191 | self.critic_2_target = type(self.critic_2)(**self.critic_2.init_dict) 192 | self.critic_2_target.load_state_dict(self.critic_2.state_dict()) 193 | 194 | if reinit_optim: 195 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr_rate) 196 | self.critic_1_optim = torch.optim.Adam(self.critic_1.parameters(), lr=lr_rate) 197 | self.critic_2_optim = torch.optim.Adam(self.critic_2.parameters(), lr=lr_rate) 198 | 199 | for it in range(iterations): 200 | 201 | transition_list = self.replay_memory.sample(self.cfg.td3.batch_size) 202 | 203 | state_list = [] 204 | action_batch = [] 205 | next_state_batch = [] 206 | reward_batch = [] 207 | done_batch = [] 208 | indexes = [] 209 | for transition in transition_list: 210 | state_list.append(torch.Tensor(transition.state)) 211 | action_batch.append(torch.Tensor(transition.action)) 212 | next_state_batch.append(torch.Tensor(transition.next_state)) 213 | reward_batch.append(torch.Tensor(transition.reward)) 214 | done_batch.append(torch.Tensor(transition.done)) 215 | indexes.append(transition.index) 216 | 217 | state = torch.stack(state_list, dim=0) 218 | action = torch.stack(action_batch, dim=0) 219 | next_state = torch.stack(next_state_batch, dim=0) 220 | reward = torch.stack(reward_batch, dim=0) 221 | done = 1 - torch.stack(done_batch, dim=0) 222 | 223 | with torch.no_grad(): 224 | noise = (torch.randn_like(action) * self.cfg.td3.td3_policy_noise).clamp(-self.cfg.td3.td3_noise_clip, 225 | self.cfg.td3.td3_noise_clip) 226 | next_action = (self.actor_target(next_state) + noise).clamp(-1, 1) 227 | target_Q1 = self.critic_1_target(torch.cat([next_state, next_action], 1)) 228 | target_Q2 = self.critic_2_target(torch.cat([next_state, next_action], 1)) 229 | target_Q = torch.min(target_Q1, target_Q2) 230 | target_Q = reward + (done * 
self.cfg.td3.gamma * target_Q) 231 | 232 | current_Q1 = self.critic_1(torch.cat([state, action], 1)) 233 | current_Q2 = self.critic_2(torch.cat([state, action], 1)) 234 | 235 | critic_loss_1 = F.mse_loss(current_Q1, target_Q) 236 | self.critic_1_optim.zero_grad() 237 | critic_loss_1.backward() 238 | for p in self.critic_1.parameters(): 239 | p.grad.data.clamp_(max=self.cfg.td3.clip_grad_norm) 240 | self.critic_1_optim.step() 241 | 242 | critic_loss_2 = F.mse_loss(current_Q2, target_Q) 243 | self.critic_2_optim.zero_grad() 244 | critic_loss_2.backward() 245 | for p in self.critic_2.parameters(): 246 | p.grad.data.clamp_(max=self.cfg.td3.clip_grad_norm) 247 | self.critic_2_optim.step() 248 | 249 | if it % self.cfg.td3.td3_update_freq == 0: 250 | 251 | actor_loss = -self.critic_1(torch.cat([state, self.actor(state)], 1)) 252 | 253 | actor_loss = torch.mean(actor_loss) 254 | 255 | self.actor_optim.zero_grad() 256 | actor_loss.backward() 257 | for p in self.actor.parameters(): 258 | p.grad.data.clamp_(max=self.cfg.td3.clip_grad_norm) 259 | self.actor_optim.step() 260 | 261 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 262 | target_param.data.copy_(self.cfg.td3.tau * param.data + (1 - self.cfg.td3.tau) * target_param.data) 263 | 264 | for param, target_param in zip(self.critic_1.parameters(), self.critic_1_target.parameters()): 265 | target_param.data.copy_(self.cfg.td3.tau * param.data + (1 - self.cfg.td3.tau) * target_param.data) 266 | 267 | for param, target_param in zip(self.critic_2.parameters(), self.critic_2_target.parameters()): 268 | target_param.data.copy_(self.cfg.td3.tau * param.data + (1 - self.cfg.td3.tau) * target_param.data) 269 | 270 | 271 | def start_TD3_training(config, expt_dir): 272 | with Supporter(experiments_dir=expt_dir, config_dict=config, count_expt=True) as sup: 273 | cfg = sup.get_config() 274 | log = sup.get_logger() 275 | 276 | env = gym.make(cfg.env.name) 277 | cfg.set_attr("action_dim", env.action_space.shape[0]) 278 | cfg.set_attr("state_dim", env.observation_space.shape[0]) 279 | 280 | td3 = TD3(config=cfg, logger=log, checkpoint=sup.ckp) 281 | 282 | td3.perform_learning() 283 | -------------------------------------------------------------------------------- /searl/utils/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .handler.config import ConfigHandler 3 | from .handler.folder import FolderHandler -------------------------------------------------------------------------------- /searl/utils/handler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/utils/handler/__init__.py -------------------------------------------------------------------------------- /searl/utils/handler/base_handler.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pathlib 3 | from datetime import datetime 4 | 5 | 6 | class Handler(): 7 | 8 | def __init__(self): 9 | pass 10 | 11 | def time_stamp(self) -> str: 12 | return datetime.utcnow().strftime('%Y-%m-%d_%H:%M:%S.%f')[:-4] 13 | 14 | def save_mkdir(self, dir): 15 | while not os.path.isdir(dir): 16 | try: 17 | os.mkdir(dir) 18 | except FileExistsError: 19 | pass 20 | 21 | def counting_name(self, dir, file_name, suffix=False): 22 | dir = pathlib.Path(dir) 23 | counter = 0 24 | split_file_name = file_name.split('.') 25 | if suffix: 26 | 
counting_file_name = '.'.join(split_file_name[:-1]) + f"-{counter}." + split_file_name[-1] 27 | else: 28 | counting_file_name = file_name + f"-{counter}" 29 | 30 | while os.path.isfile(dir / counting_file_name) or os.path.isdir(dir / counting_file_name): 31 | if suffix: 32 | counting_file_name = '.'.join(split_file_name[:-1]) + f"-{counter}." + split_file_name[-1] 33 | else: 34 | counting_file_name = file_name + f"-{counter}" 35 | counter += 1 36 | 37 | return counting_file_name 38 | -------------------------------------------------------------------------------- /searl/utils/handler/checkpoint.py: -------------------------------------------------------------------------------- 1 | """ 2 | save and restore checkpoints including parameters, rng states and env/data states 3 | """ 4 | import pathlib 5 | import numpy as np 6 | import torch 7 | 8 | 9 | class CheckpointHandler(): 10 | 11 | def __init__(self, checkpoint_dir, ): 12 | self.dir = pathlib.Path(checkpoint_dir) 13 | 14 | def save_training(self, mode_state_dict, optimizer_state_dict, epoch=None, loss=None, number=0): 15 | torch.save({ 16 | 'epoch': epoch, 17 | 'model_state_dict': mode_state_dict, 18 | 'optimizer_state_dict': optimizer_state_dict, 19 | 'loss': loss, 20 | }, self.dir / f"training_{number}.tar") 21 | 22 | def load_training(self, number=0): 23 | checkpoint = torch.load(self.dir / f"training_{number}.tar") 24 | mode_state_dict = checkpoint['model_state_dict'] 25 | optimizer_state_dict = checkpoint['optimizer_state_dict'] 26 | epoch = checkpoint['epoch'] 27 | loss = checkpoint['loss'] 28 | return mode_state_dict, optimizer_state_dict, epoch, loss 29 | 30 | def save_model(self, model, number=0): 31 | torch.save(model, self.dir / f"model_{number}.pth") 32 | 33 | def load_model(self, number=0): 34 | model = torch.load(self.dir / f"model_{number}.pth") 35 | return model 36 | 37 | def save_state_dict(self, state_dict, number=0): 38 | torch.save(state_dict, self.dir / f"state_dict_{number}.pth") 39 | 40 | def load_state_dict(self, number=0, cpu=True): 41 | if cpu: 42 | state_dict = torch.load(self.dir / f"state_dict_{number}.pth", map_location=torch.device('cpu')) 43 | else: 44 | state_dict = torch.load(self.dir / f"state_dict_{number}.pth") 45 | return state_dict 46 | 47 | def save_object(self, object, name="object_0"): 48 | np.save(self.dir / f"{name}.npy", object, allow_pickle=True) 49 | 50 | def load_object(self, name="object_0"): 51 | return np.load(self.dir / f"{name}.npy", allow_pickle=True) 52 | -------------------------------------------------------------------------------- /searl/utils/handler/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | reads a yml config or a dict and safes it into experiment folder 3 | 4 | """ 5 | import os 6 | import pathlib 7 | import yaml 8 | 9 | from searl.utils.handler.base_handler import Handler 10 | 11 | 12 | class AttributeDict(Handler): 13 | def __init__(self, dictionary, name): 14 | super().__init__() 15 | 16 | for key in dictionary: 17 | if isinstance(dictionary[key], dict): 18 | if not hasattr(self, "sub_config"): 19 | self.sub_config = [] 20 | self.sub_config.append(key) 21 | setattr(self, key, AttributeDict(dictionary[key], key)) 22 | else: 23 | setattr(self, key, dictionary[key]) 24 | 25 | def __repr__(self): 26 | return str(self.__dict__) 27 | 28 | def __str__(self): 29 | return str(self.__dict__) 30 | 31 | @property 32 | def get_dict(self): 33 | return self.__dict__ 34 | 35 | def set_attr(self, name, value): 36 | if 
isinstance(value, pathlib.Path): 37 | value = value.as_posix() 38 | self.__setattr__(name, value) 39 | 40 | 41 | class ConfigHandler(AttributeDict): 42 | 43 | def __init__(self, config_dir=None, config_dict=None): 44 | 45 | if config_dir is None and config_dict is None: 46 | raise UserWarning("ConfigHandler: config_dir and config_dict is None") 47 | 48 | elif config_dir is not None and config_dict is None: 49 | with open(config_dir, 'r') as f: 50 | config_dict = yaml.load(f, Loader=yaml.Loader) 51 | 52 | super().__init__(config_dict, "main") 53 | 54 | self.check_experiment_config() 55 | 56 | def check_experiment_config(self): 57 | if not hasattr(self, "expt"): 58 | raise UserWarning(f"ConfigHandler: 'expt' config section is missing") 59 | else: 60 | for attr_name in ['project_name', 'session_name', 'experiment_name']: 61 | if not hasattr(self.expt, attr_name): 62 | raise UserWarning(f"ConfigHandler: {attr_name} is missing") 63 | elif isinstance(self.expt.__getattribute__(attr_name), str): 64 | self.expt.__setattr__(attr_name, str(self.expt.__getattribute__(attr_name))) 65 | 66 | def save_config(self, dir, file_name="config.yml"): 67 | dir = pathlib.Path(dir) 68 | self.save_mkdir(dir) 69 | if os.path.isfile(dir / file_name): 70 | file_name = self.counting_name(dir, file_name, suffix=True) 71 | with open(dir / file_name, 'w+') as f: 72 | config_dict = self.get_dict 73 | yaml.dump(config_dict, f, default_flow_style=False, encoding='utf-8') 74 | return dir / file_name 75 | -------------------------------------------------------------------------------- /searl/utils/handler/folder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handle the location, new folders and experiments sub-folder structure. 3 | 4 | base_dir / project / session / experiment 5 | 6 | experiment will be increased 7 | 8 | """ 9 | import pathlib 10 | 11 | from searl.utils.handler.base_handler import Handler 12 | 13 | 14 | class FolderHandler(Handler): 15 | 16 | def __init__(self, experiments_dir, project_name=None, session_name=None, experiment_name=None, count_expt=False): 17 | super().__init__() 18 | 19 | self.experiments_dir = pathlib.Path(experiments_dir) 20 | 21 | self.subfolder = ["log", "checkpoint", "config", "profile"] 22 | 23 | if project_name is not None: 24 | self.project_name = project_name 25 | self.session_name = session_name 26 | self.experiment_name = experiment_name 27 | self.count_expt = count_expt 28 | 29 | self.expt_dir = self.create_folders() 30 | else: 31 | self.expt_dir = self.experiments_dir 32 | 33 | def create_folders(self): 34 | 35 | dir = self.experiments_dir 36 | self.save_mkdir(dir) 37 | 38 | for folder in [self.project_name, self.session_name]: 39 | dir = dir / folder 40 | self.save_mkdir(dir) 41 | 42 | if self.count_expt: 43 | self.experiment_name = self.counting_name(dir, self.experiment_name) 44 | 45 | dir = dir / self.experiment_name 46 | self.save_mkdir(dir) 47 | 48 | for folder in self.subfolder: 49 | self.save_mkdir(dir / folder) 50 | 51 | return dir 52 | 53 | @property 54 | def dir(self): 55 | return self.expt_dir 56 | 57 | @property 58 | def config_dir(self): 59 | return self.expt_dir / "config" 60 | 61 | @property 62 | def log_dir(self): 63 | return self.expt_dir / "log" 64 | 65 | @property 66 | def profile_dir(self): 67 | return self.expt_dir / "profile" 68 | 69 | @property 70 | def checkpoint_dir(self): 71 | return self.expt_dir / "checkpoint" 72 | -------------------------------------------------------------------------------- 
/searl/utils/log/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automl/SEARL/bac75d8c9540ff4f0b5b340c612ec384b189bd84/searl/utils/log/__init__.py -------------------------------------------------------------------------------- /searl/utils/log/csv.py: -------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import csv 6 | from pathlib import Path 7 | from typing import Dict, List 8 | 9 | from searl.utils.handler.base_handler import Handler 10 | 11 | 12 | class LogCSV(Handler): 13 | 14 | def __init__(self, log_dir, file_name="train_log.csv"): 15 | super().__init__() 16 | 17 | self.log_dir = Path(log_dir) 18 | self.log_file = file_name 19 | 20 | def fieldnames(self, fieldnames_list: List): 21 | self.csv_columns = fieldnames_list 22 | 23 | with open(self.log_dir / self.log_file, 'a') as csvfile: 24 | writer = csv.DictWriter(csvfile, fieldnames=self.csv_columns) 25 | writer.writeheader() 26 | 27 | def log_csv(self, dict_data: Dict): 28 | 29 | dict_data["time_string"] = f"{self.time_stamp()}" 30 | 31 | for key in self.csv_columns: 32 | if key not in dict_data.keys(): 33 | dict_data[key] = None 34 | with open(self.log_dir / self.log_file, 'a') as csvfile: 35 | writer = csv.DictWriter(csvfile, fieldnames=self.csv_columns) 36 | writer.writerow(dict_data) 37 | -------------------------------------------------------------------------------- /searl/utils/log/json.py: -------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import json 6 | import os 7 | import time 8 | from pathlib import Path 9 | 10 | from searl.utils.handler.base_handler import Handler 11 | 12 | 13 | class LogJSON(Handler): 14 | 15 | def __init__(self, log_dir, file_name="json_log.json"): 16 | super().__init__() 17 | 18 | self.log_dir = Path(log_dir) 19 | 20 | self.file_name = file_name 21 | self.json_file = file_name 22 | 23 | def __enter__(self): 24 | self.open() 25 | return self 26 | 27 | def open(self): 28 | self.json_file = self.counting_name(self.log_dir, self.json_file, suffix=True) 29 | 30 | data = {"start": {'value': 0, 'time_step': None, 'time_stamp': self.time_stamp(), 'time': time.time()}} 31 | with open(self.log_dir / self.json_file, 'w+') as file: 32 | file.write(f"[ \n") 33 | file.write(json.dumps(data)) 34 | 35 | def __exit__(self, exc_type, exc_val, exc_tb): 36 | self.close() 37 | 38 | def close(self): 39 | with open(self.log_dir / self.json_file, 'a') as file: 40 | file.write(f"\n]") 41 | 42 | def jlog(self, key: str, value, time_step=None): 43 | data = {key: {'value': value, 'time_step': time_step, 'time_stamp': self.time_stamp(), 'time': time.time()}} 44 | with open(self.log_dir / self.json_file, 'a') as file: 45 | file.write(", \n") 46 | file.write(json.dumps(data)) 47 | 48 | def load_json(self): 49 | data_list = [] 50 | counter = 0 51 | counting_file_name = self.file_name.split('.')[0] + f"-{counter}" + self.file_name.split('.')[1] 52 | while os.path.isfile(self.log_dir / counting_file_name): 53 | with open(self.log_dir / counting_file_name, 'r') as file: 54 | data = json.load(file) 55 | data_list.append(data) 56 | 57 | counter += 1 58 | counting_file_name = self.file_name.split('.')[0] + f"-{counter}" + self.file_name.split('.')[1] 59 | 60 | return data_list 61 | -------------------------------------------------------------------------------- /searl/utils/log/logger.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import time 6 | 7 | import numpy as np 8 | 9 | from searl.utils.handler.base_handler import Handler 10 | from searl.utils.log.csv import LogCSV 11 | from searl.utils.log.json import LogJSON 12 | from searl.utils.log.pkl import LogPKL 13 | from searl.utils.log.txt import LogTXT 14 | 15 | 16 | class Logger(Handler): 17 | 18 | def __init__(self, log_dir): 19 | super().__init__() 20 | 21 | self.txt_logger = LogTXT(log_dir) 22 | self.pkl_logger = LogPKL(log_dir) 23 | self.json_logger = LogJSON(log_dir) 24 | self.csv = LogCSV(log_dir) 25 | 26 | self.timer = {} 27 | 28 | def __enter__(self): 29 | self.open() 30 | return self 31 | 32 | def open(self): 33 | self.json_logger.open() 34 | 35 | def __exit__(self, exc_type, exc_val, exc_tb): 36 | self.close() 37 | 38 | def __call__(self, key, value=None, time_step=None, print_log=True): 39 | self.log(key, value, time_step, print_log) 40 | 41 | def close(self): 42 | self.json_logger.close() 43 | 44 | def log(self, key, value=None, time_step=None, print_log=True): 45 | if value is None: 46 | if print_log: 47 | if time_step is None: 48 | self.txt_logger.log(key) 49 | else: 50 | self.txt_logger.log(f"{key}-step:{time_step}") 51 | self.json_logger.jlog(key="MSG", value=key, time_step=time_step) 52 | 53 | else: 54 | if time_step is None: 55 | self.txt_logger.log(f"{key}: {value}") 56 | else: 57 | self.txt_logger.log(f"{key}: {value} step:{time_step}") 58 | self.json_logger.jlog(key=key, value=value, time_step=time_step) 59 | 60 | def dump(self, key, value, time_step=None): 61 | self.pkl_logger.dump(key, value, time_step) 62 | 63 | def print_config(self, config, name="main"): 64 | if name == "main": 65 | self.log("#" * 20 + " CONFIG:") 66 | else: 67 | self.log(f"sub config {name:8}", 68 | np.unique([f"{attr} : {str(value)} " for attr, value in config.get_dict.items()]).tolist()) 69 | 70 | if hasattr(config, "sub_config"): 71 | for cfg in config.sub_config: 72 | self.print_config(getattr(config, cfg), cfg) 73 | 74 | def start_timer(self, name): 75 | self.log(f"##### {name}") 76 | self.timer[name] = time.time() 77 | 78 | def log_time(self, name): 79 | self.log(f"timer {name:8}", f"{time.time() - self.timer[name]:3.1f}s") 80 | 81 | def log_func(self, function, *args, **kwargs): 82 | self.start_timer(function.__name__) 83 | rslt = function(*args, **kwargs) 84 | self.log_time(function.__name__) 85 | return rslt 86 | 87 | def population_info(self, population_mean_fitness, population_var_fitness, population, num_frames, epoch): 88 | 89 | best_idx = np.argmax(population_mean_fitness) 90 | self.log("#### POPULATION INFO", epoch, time_step=num_frames) 91 | self.log('Population fitness', [ind.fitness[-1] for ind in population], time_step=num_frames) 92 | self.log('Population improve', [ind.improvement for ind in population], time_step=num_frames) 93 | self.log('Population var fit', [float(var) for var in population_var_fitness], time_step=num_frames) 94 | self.log('Actors hidden size ', [[int(s) for s in ind.actor_config['hidden_size']] for ind in population], 95 | time_step=num_frames) 96 | self.log('Mutation: ', [ind.train_log["mutation"] for ind in population], time_step=num_frames) 97 | self.log('mean_fitness', np.mean(population_mean_fitness), time_step=num_frames) 98 | self.log('best_fitness', population[best_idx].fitness[-1], time_step=num_frames) 99 | self.log('best_improve', population[best_idx].improvement, time_step=num_frames) 100 
| self.log('best rl config', population[best_idx].rl_config.__str__(), time_step=num_frames) 101 | self.log('Best Actors hidden size', [int(s) for s in population[best_idx].actor_config['hidden_size']], 102 | time_step=num_frames) 103 | -------------------------------------------------------------------------------- /searl/utils/log/pkl.py: -------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import os 6 | import pickle 7 | import time 8 | from pathlib import Path 9 | 10 | from searl.utils.handler.base_handler import Handler 11 | 12 | 13 | class LogPKL(Handler): 14 | 15 | def __init__(self, log_dir, file_name="value_dump.pkl"): 16 | super().__init__() 17 | 18 | self.log_dir = Path(log_dir) 19 | 20 | self.pickle_file = file_name 21 | 22 | def dump(self, key: str, value=None, time_step=None): 23 | if value: 24 | data = {"key": key, 'value': value, 'time_step': time_step, 'time_stamp': self.time_stamp(), 25 | 'time': time.time()} 26 | else: 27 | data = key 28 | with open(self.log_dir / self.pickle_file, 'ab') as f: 29 | pickle.dump(data, f) 30 | 31 | def check_dump(self): 32 | return os.path.isfile(self.log_dir / self.pickle_file) 33 | 34 | def load_pickle(self): 35 | data = [] 36 | with open(self.log_dir / self.pickle_file, 'rb') as f: 37 | while True: 38 | try: 39 | data.append(pickle.load(f)) 40 | except EOFError: 41 | break 42 | return data 43 | -------------------------------------------------------------------------------- /searl/utils/log/txt.py: -------------------------------------------------------------------------------- 1 | """ 2 | save log and show log 3 | 4 | """ 5 | import os 6 | import sys 7 | from pathlib import Path 8 | 9 | from searl.utils.handler.base_handler import Handler 10 | 11 | 12 | class LogTXT(Handler): 13 | 14 | def __init__(self, log_dir, file_name="log_file.txt"): 15 | super().__init__() 16 | 17 | self.log_dir = Path(log_dir) 18 | self.log_file = file_name 19 | 20 | self.start_log() 21 | 22 | def start_log(self): 23 | if os.path.isfile(self.log_dir / self.log_file) and os.access(self.log_dir / self.log_file, os.R_OK): 24 | self.log("LOGGER: continue logging") 25 | else: 26 | with open(self.log_dir / self.log_file, 'w+') as file: 27 | file.write( 28 | f"{self.time_stamp()} LOGGER: start logging with Python version: {str(sys.version).split('(')[0]} \n") 29 | 30 | def log(self, string: str): 31 | timed_string = f"{self.time_stamp()} {string}" 32 | print(timed_string) 33 | with open(self.log_dir / self.log_file, 'a') as file: 34 | file.write(f"{timed_string} \n") 35 | -------------------------------------------------------------------------------- /searl/utils/supporter.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | 3 | from searl.utils.handler.checkpoint import CheckpointHandler 4 | from searl.utils.handler.config import ConfigHandler 5 | from searl.utils.handler.folder import FolderHandler 6 | from searl.utils.log.logger import Logger 7 | 8 | 9 | class Supporter(): 10 | 11 | def __init__(self, experiments_dir=None, config_dir=None, config_dict=None, count_expt=False, reload_expt=False): 12 | 13 | if reload_expt: 14 | experiments_dir = pathlib.Path(experiments_dir) 15 | self.cfg = ConfigHandler(config_dir=experiments_dir / "config" / "config.yml", config_dict=None) 16 | self.folder = FolderHandler(experiments_dir) 17 | else: 18 | 19 | self.cfg = ConfigHandler(config_dir, config_dict) 20 | 21 | if experiments_dir is None and 
self.cfg.expt.experiments_dir is None: 22 | raise UserWarning("ConfigHandler: experiment_dir and config.expt.experiment_dir is None") 23 | elif experiments_dir is not None: 24 | self.cfg.expt.set_attr("experiments_dir", experiments_dir) 25 | else: 26 | experiments_dir = pathlib.Path(self.cfg.expt.experiments_dir) 27 | 28 | self.folder = FolderHandler(experiments_dir, self.cfg.expt.project_name, self.cfg.expt.session_name, 29 | self.cfg.expt.experiment_name, count_expt) 30 | self.cfg.save_config(self.folder.config_dir) 31 | 32 | self.logger = Logger(self.folder.log_dir) 33 | self.ckp = CheckpointHandler(self.folder.checkpoint_dir) 34 | 35 | self.logger.log("project_name", self.cfg.expt.project_name) 36 | self.logger.log("session_name", self.cfg.expt.session_name) 37 | self.logger.log("experiment_name", self.cfg.expt.experiment_name) 38 | 39 | def __enter__(self): 40 | self.logger.open() 41 | return self 42 | 43 | def __exit__(self, exc_type, exc_val, exc_tb): 44 | self.logger.close() 45 | 46 | def get_logger(self): 47 | return self.logger 48 | 49 | def get_config(self): 50 | return self.cfg 51 | 52 | def get_checkpoint_handler(self): 53 | return self.ckp 54 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from distutils.core import setup 4 | 5 | req_file = "requirements.txt" 6 | 7 | def parse_requirements(filename): 8 | lineiter = (line.strip() for line in open(filename)) 9 | return [line for line in lineiter if line and not line.startswith("#")] 10 | 11 | install_reqs = parse_requirements(req_file) 12 | 13 | setup(name='searl', 14 | version='latest', 15 | install_requires=install_reqs, 16 | dependency_links=[], 17 | ) 18 | --------------------------------------------------------------------------------