├── .gitignore ├── 156_supplement.pdf ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── local_mnf_toy_regression.png ├── qlearn ├── atari │ ├── bayes_backprop_agent.py │ ├── bootstrapped_agent.py │ ├── dqn_agent.py │ ├── local_mnf_agent.py │ ├── mnf_agent.py │ ├── noisy_agent.py │ ├── prior_bootstrapped_agent.py │ ├── train_bayes_backprop_agent.py │ ├── train_bootstrapped_agent.py │ ├── train_dqn.py │ ├── train_mnf_agent.py │ ├── train_noisy_agent.py │ └── train_prior_bootstrapped_agent.py ├── commun │ ├── bayes_backprop_layer.py │ ├── dropout_toy_regression.py │ ├── local_mnf_layer.py │ ├── local_mnf_toy_regression.py │ ├── mnf_layer.py │ ├── mnf_toy_regression.py │ ├── noisy_layer.py │ ├── norm_flows.py │ ├── toy_regression.py │ ├── utils.py │ └── variational_dropout_layer.py ├── envs │ └── nchain.py └── toys │ ├── agent.py │ ├── bayes_backprop_agent.py │ ├── bootstrapped_agent.py │ ├── main_nchain.py │ ├── mnf_agent.py │ ├── model.py │ ├── noisy_agent.py │ └── test.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | __pycache__ 8 | .idea 9 | -------------------------------------------------------------------------------- /156_supplement.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/RandomizedValueFunctions/e4bbcf33b489b8bc78b405df2eec45e1ebeb52ed/156_supplement.pdf -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | The aim of this repo is mainly for reproducibility purposes, thus, we won't be allowing contributions. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 
14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More_considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial 4.0 International Public 58 | License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial 4.0 International Public License ("Public 63 | License"). To the extent this Public License may be interpreted as a 64 | contract, You are granted the Licensed Rights in consideration of Your 65 | acceptance of these terms and conditions, and the Licensor grants You 66 | such rights in consideration of benefits the Licensor receives from 67 | making the Licensed Material available under these terms and 68 | conditions. 69 | 70 | Section 1 -- Definitions. 71 | 72 | a. Adapted Material means material subject to Copyright and Similar 73 | Rights that is derived from or based upon the Licensed Material 74 | and in which the Licensed Material is translated, altered, 75 | arranged, transformed, or otherwise modified in a manner requiring 76 | permission under the Copyright and Similar Rights held by the 77 | Licensor. 
For purposes of this Public License, where the Licensed 78 | Material is a musical work, performance, or sound recording, 79 | Adapted Material is always produced where the Licensed Material is 80 | synched in timed relation with a moving image. 81 | 82 | b. Adapter's License means the license You apply to Your Copyright 83 | and Similar Rights in Your contributions to Adapted Material in 84 | accordance with the terms and conditions of this Public License. 85 | 86 | c. Copyright and Similar Rights means copyright and/or similar rights 87 | closely related to copyright including, without limitation, 88 | performance, broadcast, sound recording, and Sui Generis Database 89 | Rights, without regard to how the rights are labeled or 90 | categorized. For purposes of this Public License, the rights 91 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 92 | Rights. 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. NonCommercial means not primarily intended for or directed towards 116 | commercial advantage or monetary compensation. For purposes of 117 | this Public License, the exchange of the Licensed Material for 118 | other material subject to Copyright and Similar Rights by digital 119 | file-sharing or similar means is NonCommercial provided there is 120 | no payment of monetary compensation in connection with the 121 | exchange. 122 | 123 | j. Share means to provide material to the public by any means or 124 | process that requires permission under the Licensed Rights, such 125 | as reproduction, public display, public performance, distribution, 126 | dissemination, communication, or importation, and to make material 127 | available to the public including in ways that members of the 128 | public may access the material from a place and at a time 129 | individually chosen by them. 130 | 131 | k. Sui Generis Database Rights means rights other than copyright 132 | resulting from Directive 96/9/EC of the European Parliament and of 133 | the Council of 11 March 1996 on the legal protection of databases, 134 | as amended and/or succeeded, as well as other essentially 135 | equivalent rights anywhere in the world. 136 | 137 | l. You means the individual or entity exercising the Licensed Rights 138 | under this Public License. Your has a corresponding meaning. 139 | 140 | Section 2 -- Scope. 141 | 142 | a. License grant. 143 | 144 | 1. 
Subject to the terms and conditions of this Public License, 145 | the Licensor hereby grants You a worldwide, royalty-free, 146 | non-sublicensable, non-exclusive, irrevocable license to 147 | exercise the Licensed Rights in the Licensed Material to: 148 | 149 | a. reproduce and Share the Licensed Material, in whole or 150 | in part, for NonCommercial purposes only; and 151 | 152 | b. produce, reproduce, and Share Adapted Material for 153 | NonCommercial purposes only. 154 | 155 | 2. Exceptions and Limitations. For the avoidance of doubt, where 156 | Exceptions and Limitations apply to Your use, this Public 157 | License does not apply, and You do not need to comply with 158 | its terms and conditions. 159 | 160 | 3. Term. The term of this Public License is specified in Section 161 | 6(a). 162 | 163 | 4. Media and formats; technical modifications allowed. The 164 | Licensor authorizes You to exercise the Licensed Rights in 165 | all media and formats whether now known or hereafter created, 166 | and to make technical modifications necessary to do so. The 167 | Licensor waives and/or agrees not to assert any right or 168 | authority to forbid You from making technical modifications 169 | necessary to exercise the Licensed Rights, including 170 | technical modifications necessary to circumvent Effective 171 | Technological Measures. For purposes of this Public License, 172 | simply making modifications authorized by this Section 2(a) 173 | (4) never produces Adapted Material. 174 | 175 | 5. Downstream recipients. 176 | 177 | a. Offer from the Licensor -- Licensed Material. Every 178 | recipient of the Licensed Material automatically 179 | receives an offer from the Licensor to exercise the 180 | Licensed Rights under the terms and conditions of this 181 | Public License. 182 | 183 | b. No downstream restrictions. You may not offer or impose 184 | any additional or different terms or conditions on, or 185 | apply any Effective Technological Measures to, the 186 | Licensed Material if doing so restricts exercise of the 187 | Licensed Rights by any recipient of the Licensed 188 | Material. 189 | 190 | 6. No endorsement. Nothing in this Public License constitutes or 191 | may be construed as permission to assert or imply that You 192 | are, or that Your use of the Licensed Material is, connected 193 | with, or sponsored, endorsed, or granted official status by, 194 | the Licensor or others designated to receive attribution as 195 | provided in Section 3(a)(1)(A)(i). 196 | 197 | b. Other rights. 198 | 199 | 1. Moral rights, such as the right of integrity, are not 200 | licensed under this Public License, nor are publicity, 201 | privacy, and/or other similar personality rights; however, to 202 | the extent possible, the Licensor waives and/or agrees not to 203 | assert any such rights held by the Licensor to the limited 204 | extent necessary to allow You to exercise the Licensed 205 | Rights, but not otherwise. 206 | 207 | 2. Patent and trademark rights are not licensed under this 208 | Public License. 209 | 210 | 3. To the extent possible, the Licensor waives any right to 211 | collect royalties from You for the exercise of the Licensed 212 | Rights, whether directly or through a collecting society 213 | under any voluntary or waivable statutory or compulsory 214 | licensing scheme. In all other cases the Licensor expressly 215 | reserves any right to collect such royalties, including when 216 | the Licensed Material is used other than for NonCommercial 217 | purposes. 
218 | 219 | Section 3 -- License Conditions. 220 | 221 | Your exercise of the Licensed Rights is expressly made subject to the 222 | following conditions. 223 | 224 | a. Attribution. 225 | 226 | 1. If You Share the Licensed Material (including in modified 227 | form), You must: 228 | 229 | a. retain the following if it is supplied by the Licensor 230 | with the Licensed Material: 231 | 232 | i. identification of the creator(s) of the Licensed 233 | Material and any others designated to receive 234 | attribution, in any reasonable manner requested by 235 | the Licensor (including by pseudonym if 236 | designated); 237 | 238 | ii. a copyright notice; 239 | 240 | iii. a notice that refers to this Public License; 241 | 242 | iv. a notice that refers to the disclaimer of 243 | warranties; 244 | 245 | v. a URI or hyperlink to the Licensed Material to the 246 | extent reasonably practicable; 247 | 248 | b. indicate if You modified the Licensed Material and 249 | retain an indication of any previous modifications; and 250 | 251 | c. indicate the Licensed Material is licensed under this 252 | Public License, and include the text of, or the URI or 253 | hyperlink to, this Public License. 254 | 255 | 2. You may satisfy the conditions in Section 3(a)(1) in any 256 | reasonable manner based on the medium, means, and context in 257 | which You Share the Licensed Material. For example, it may be 258 | reasonable to satisfy the conditions by providing a URI or 259 | hyperlink to a resource that includes the required 260 | information. 261 | 262 | 3. If requested by the Licensor, You must remove any of the 263 | information required by Section 3(a)(1)(A) to the extent 264 | reasonably practicable. 265 | 266 | 4. If You Share Adapted Material You produce, the Adapter's 267 | License You apply must not prevent recipients of the Adapted 268 | Material from complying with this Public License. 269 | 270 | Section 4 -- Sui Generis Database Rights. 271 | 272 | Where the Licensed Rights include Sui Generis Database Rights that 273 | apply to Your use of the Licensed Material: 274 | 275 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 276 | to extract, reuse, reproduce, and Share all or a substantial 277 | portion of the contents of the database for NonCommercial purposes 278 | only; 279 | 280 | b. if You include all or a substantial portion of the database 281 | contents in a database in which You have Sui Generis Database 282 | Rights, then the database in which You have Sui Generis Database 283 | Rights (but not its individual contents) is Adapted Material; and 284 | 285 | c. You must comply with the conditions in Section 3(a) if You Share 286 | all or a substantial portion of the contents of the database. 287 | 288 | For the avoidance of doubt, this Section 4 supplements and does not 289 | replace Your obligations under this Public License where the Licensed 290 | Rights include other Copyright and Similar Rights. 291 | 292 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 293 | 294 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 295 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 296 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 297 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 298 | IMPLIED, STATUTORY, OR OTHER. 
THIS INCLUDES, WITHOUT LIMITATION, 299 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 300 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 301 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 302 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 303 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 304 | 305 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 306 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 307 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 308 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 309 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 310 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 311 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 312 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 313 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 314 | 315 | c. The disclaimer of warranties and limitation of liability provided 316 | above shall be interpreted in a manner that, to the extent 317 | possible, most closely approximates an absolute disclaimer and 318 | waiver of all liability. 319 | 320 | Section 6 -- Term and Termination. 321 | 322 | a. This Public License applies for the term of the Copyright and 323 | Similar Rights licensed here. However, if You fail to comply with 324 | this Public License, then Your rights under this Public License 325 | terminate automatically. 326 | 327 | b. Where Your right to use the Licensed Material has terminated under 328 | Section 6(a), it reinstates: 329 | 330 | 1. automatically as of the date the violation is cured, provided 331 | it is cured within 30 days of Your discovery of the 332 | violation; or 333 | 334 | 2. upon express reinstatement by the Licensor. 335 | 336 | For the avoidance of doubt, this Section 6(b) does not affect any 337 | right the Licensor may have to seek remedies for Your violations 338 | of this Public License. 339 | 340 | c. For the avoidance of doubt, the Licensor may also offer the 341 | Licensed Material under separate terms or conditions or stop 342 | distributing the Licensed Material at any time; however, doing so 343 | will not terminate this Public License. 344 | 345 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 346 | License. 347 | 348 | Section 7 -- Other Terms and Conditions. 349 | 350 | a. The Licensor shall not be bound by any additional or different 351 | terms or conditions communicated by You unless expressly agreed. 352 | 353 | b. Any arrangements, understandings, or agreements regarding the 354 | Licensed Material not stated herein are separate from and 355 | independent of the terms and conditions of this Public License. 356 | 357 | Section 8 -- Interpretation. 358 | 359 | a. For the avoidance of doubt, this Public License does not, and 360 | shall not be interpreted to, reduce, limit, restrict, or impose 361 | conditions on any use of the Licensed Material that could lawfully 362 | be made without permission under this Public License. 363 | 364 | b. To the extent possible, if any provision of this Public License is 365 | deemed unenforceable, it shall be automatically reformed to the 366 | minimum extent necessary to make it enforceable. If the provision 367 | cannot be reformed, it shall be severed from this Public License 368 | without affecting the enforceability of the remaining terms and 369 | conditions. 370 | 371 | c. 
No term or condition of this Public License will be waived and no 372 | failure to comply consented to unless expressly agreed to by the 373 | Licensor. 374 | 375 | d. Nothing in this Public License constitutes or may be interpreted 376 | as a limitation upon, or waiver of, any privileges and immunities 377 | that apply to the Licensor or You, including from the legal 378 | processes of any jurisdiction or authority. 379 | 380 | ======================================================================= 381 | 382 | Creative Commons is not a party to its public 383 | licenses. Notwithstanding, Creative Commons may elect to apply one of 384 | its public licenses to material it publishes and in those instances 385 | will be considered the “Licensor.” The text of the Creative Commons 386 | public licenses is dedicated to the public domain under the CC0 Public 387 | Domain Dedication. Except for the limited purpose of indicating that 388 | material is shared under a Creative Commons public license or as 389 | otherwise permitted by the Creative Commons policies published at 390 | creativecommons.org/policies, Creative Commons does not authorize the 391 | use of the trademark "Creative Commons" or any other trademark or logo 392 | of Creative Commons without its prior written consent including, 393 | without limitation, in connection with any unauthorized modifications 394 | to any of its public licenses or any other arrangements, 395 | understandings, or agreements concerning use of licensed material. For 396 | the avoidance of doubt, this paragraph does not form part of the 397 | public licenses. 398 | 399 | Creative Commons may be contacted at creativecommons.org. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Randomized Value Functions via Multiplicative Normalizing Flows 2 | This repo contains code for the paper 3 | 4 | [Randomized Value Functions via Multiplicative Normalizing Flows. 5 | Ahmed Touati, Harsh Satija, Joshua Romoff, Joelle Pineau, Pascal Vincent. UAI 2019](https://arxiv.org/abs/1806.02315) 6 | ```@article{touati2018randomized, 7 | title={Randomized value functions via multiplicative normalizing flows}, 8 | author={Touati, Ahmed and Satija, Harsh and Romoff, Joshua and Pineau, Joelle and Vincent, Pascal}, 9 | journal={arXiv preprint arXiv:1806.02315}, 10 | year={2018} 11 | } 12 | ``` 13 | 14 | ## Installation 15 | 16 | ### PyTorch 17 | 18 | without CUDA: 19 | 20 | ```conda install pytorch=0.4.0 -c pytorch ``` 21 | 22 | with CUDA: 23 | 24 | ```conda install pytorch=0.4.1 cuda90 -c pytorch ``` 25 | 26 | (or cuda92, cuda80, or cuda75, depending on which CUDA version you have installed) 27 | 28 | ### Baselines for Atari preprocessing 29 | ``` git clone https://github.com/openai/baselines.git ``` 30 | 31 | ``` cd baselines ``` 32 | 33 | ``` pip install -e . ``` 34 | 35 | 36 | ## Simple regression as a sanity check 37 | ```python -m qlearn.commun.local_mnf_toy_regression``` 38 | 39 | 40 | 41 | ## Chain env experiments 42 | 43 | ### DQN 44 | ```python -m qlearn.toys.main_nchain --agent DQN --cuda 0 --input-dim 100``` 45 | 46 | Example output: 47 | ``` 48 | episode: 5, Avg. reward: 0.107 49 | episode: 6, Avg. reward: 0.107 50 | ... 51 | episode: 21, Avg. reward: 0.107 52 | episode: 22, Avg. reward: 0.107 53 | episode: 23, Avg. reward: 0.107 54 | episode: 24, Avg. reward: 0.107 55 | episode: 25, Avg. reward: 0.107 56 | episode: 26, Avg. reward: 0.107 57 | episode: 27, Avg.
reward: 0.107 58 | episode: 28, Avg. reward: 0.107 59 | episode: 29, Avg. reward: 0.107 60 | episode: 30, Avg. reward: 0.107 61 | ... 62 | ``` 63 | ### MNF DQN 64 | 65 | ```python -m qlearn.toys.main_nchain --agent MNFDQN --cuda 0 --input-dim 100``` 66 | 67 | Example output: 68 | 69 | 70 | ``` 71 | episode: 5, Avg. reward: 0.0 72 | episode: 6, Avg. reward: 0.0 73 | ... 74 | episode: 21, Avg. reward: 0.0 75 | episode: 22, Avg. reward: 0.0 76 | episode: 23, Avg. reward: 0.0 77 | episode: 24, Avg. reward: 10.0 78 | episode: 25, Avg. reward: 10.0 79 | episode: 26, Avg. reward: 10.0 80 | episode: 27, Avg. reward: 10.0 81 | episode: 28, Avg. reward: 10.0 82 | episode: 29, Avg. reward: 10.0 83 | episode: 30, Avg. reward: 10.0 84 | ... 85 | ``` 86 | 87 | ## Atari experiments 88 | ### DQN 89 | 90 | ```python -m qlearn.atari.train_dqn --env BreakoutNoFrameskip-v4 --log-dir log_dir --save-dir save_dir --print-freq 10 --cuda 0``` 91 | 92 | Example output: 93 | 94 | ``` 95 | Options 96 | env: BreakoutNoFrameskip-v4 97 | seed: 42 98 | replay_buffer_size: 1000000 99 | lr: 0.0001 100 | num_steps: 10000000 101 | batch_size: 32 102 | learning_freq: 4 103 | target_update_freq: 10000 104 | learning_starts: 50000 105 | double_q: True 106 | log_dir: log_dir 107 | save_dir: save_dir 108 | save_freq: 1000000 109 | final_exploration: 0.1 110 | final_exploration_frame: 1000000 111 | print_freq: 10 112 | run_index: None 113 | cuda: 0 114 | agent: DQN 115 | discount: 0.99 116 | model: None 117 | WARNING:root:This caffe2 python run does not have GPU support. Will run in CPU only mode. 118 | Writing logs to log_dir/BreakoutNoFrameskip-v4-DQN-seed-42-2019-06-26.1725 119 | Logging to /var/folders/y8/gcb_hv6d7nd6t3ctvrmhf8t9s1xjd_/T/openai-2019-06-26-17-27-50-458572 120 | ------------------------------------ 121 | | % completion | 0 | 122 | | episodes | 290 | 123 | | exploration | 0.953 | 124 | | FPS | 34.4 | 125 | | reward (100 epi mean) | 1.3 | 126 | | total steps | 51824 | 127 | ------------------------------------ 128 | 129 | ETA: 3 days and 8 hours 130 | 131 | Saving model due to mean reward increase: None -> 1.3 132 | ------------------------------------ 133 | | % completion | 0 | 134 | | episodes | 300 | 135 | | exploration | 0.952 | 136 | | FPS | 29.6 | 137 | | reward (100 epi mean) | 1.3 | 138 | | total steps | 53525 | 139 | ------------------------------------ 140 | 141 | ETA: 3 days and 21 hours 142 | ``` 143 | 144 | ### MNF DQN 145 | 146 | ```python -m qlearn.atari.train_mnf_agent --env BreakoutNoFrameskip-v4 --alpha 0.01 --log-dir log_dir --save-dir save_dir --print-freq 10 --cuda 0``` 147 | 148 | Example output: 149 | 150 | ``` 151 | Options 152 | env: BreakoutNoFrameskip-v4 153 | seed: 42 154 | replay_buffer_size: 1000000 155 | lr: 0.0001 156 | num_steps: 10000000 157 | batch_size: 32 158 | learning_freq: 4 159 | target_update_freq: 10000 160 | learning_starts: 50000 161 | double_q: False 162 | log_dir: log_dir 163 | save_dir: save_dir 164 | save_freq: 1000000 165 | print_freq: 10 166 | run_index: None 167 | cuda: 0 168 | agent: MNFDQN 169 | discount: 0.99 170 | hidden_dim: 50 171 | n_hidden: 0 172 | n_flows_q: 2 173 | n_flows_r: 2 174 | alpha: 0.01 175 | model: None 176 | WARNING:root:This caffe2 python run does not have GPU support. Will run in CPU only mode.
177 | Writing logs to log_dir/BreakoutNoFrameskip-v4-MNFDQN-seed-42-alpha-0.01-2019-06-26.1730 178 | Logging to /var/folders/y8/gcb_hv6d7nd6t3ctvrmhf8t9s1xjd_/T/openai-2019-06-26-17-32-20-718772 179 | ------------------------------------ 180 | | % completion | 0 | 181 | | episodes | 270 | 182 | | FPS | 34.7 | 183 | | reward (100 epi mean) | 1.4 | 184 | | total steps | 50398 | 185 | ------------------------------------ 186 | 187 | ETA: 3 days and 7 hours 188 | 189 | Saving model due to mean reward increase: None -> 1.4 190 | ------------------------------------ 191 | | % completion | 0 | 192 | | episodes | 280 | 193 | | FPS | 12.3 | 194 | | reward (100 epi mean) | 1.4 | 195 | | total steps | 52433 | 196 | ------------------------------------ 197 | ``` 198 | 199 | ### Noisy DQN 200 | 201 | ```python -m qlearn.atari.train_noisy_agent --env BreakoutNoFrameskip-v4 --log-dir log_dir --save-dir save_dir``` 202 | 203 | ### Bootstrapped DQN 204 | 205 | ```python -m qlearn.atari.train_bootstrapped_agent --env BreakoutNoFrameskip-v4 --log-dir log_dir --save-dir save_dir``` 206 | 207 | ## License 208 | This repo is CC-BY-NC licensed, as found in the LICENSE file. 209 | -------------------------------------------------------------------------------- /local_mnf_toy_regression.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/facebookresearch/RandomizedValueFunctions/e4bbcf33b489b8bc78b405df2eec45e1ebeb52ed/local_mnf_toy_regression.png -------------------------------------------------------------------------------- /qlearn/atari/bayes_backprop_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import torch 10 | from torch import optim 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | from torch.nn.utils import clip_grad_norm_ 15 | 16 | from qlearn.commun.utils import initialize_weights 17 | from qlearn.commun.bayes_backprop_layer import BayesBackpropLinear 18 | 19 | 20 | class AtariBayesBackpropDQN(nn.Module): 21 | def __init__(self, args, input_dim, num_actions): 22 | nn.Module.__init__(self) 23 | self.conv1 = nn.Conv2d(input_dim, 32, 8, stride=4, padding=1) 24 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 25 | self.conv3 = nn.Conv2d(64, 64, 3) 26 | self.fc1 = BayesBackpropLinear(3136, 512) 27 | self.fc2 = BayesBackpropLinear(512, num_actions) 28 | initialize_weights(self) 29 | 30 | def forward(self, x): 31 | x = F.relu(self.conv1(x)) 32 | x = F.relu(self.conv2(x)) 33 | x = F.relu(self.conv3(x)) 34 | x = x.view(-1, 3136) 35 | x = F.relu(self.fc1(x)) 36 | x = self.fc2(x) 37 | return x 38 | 39 | def reset_noise(self): 40 | self.fc1.reset_noise() 41 | self.fc2.reset_noise() 42 | 43 | def get_reg(self): 44 | reg = self.fc1.kldiv() 45 | reg += self.fc2.kldiv() 46 | return reg 47 | 48 | 49 | class AtariBayesBackpropAgent(object): 50 | def __init__(self, args, input_dim, num_actions): 51 | self.num_actions = num_actions 52 | self.batch_size = args.batch_size 53 | self.discount = args.discount 54 | self.double_q = args.double_q 55 | self.input_dim = input_dim 56 | self.kl_coeff = args.alpha / args.replay_buffer_size 57 | self.online_net = AtariBayesBackpropDQN(args, input_dim, num_actions) 58 | self.online_net.train() 59 | 60 | self.target_net = AtariBayesBackpropDQN(args, input_dim, num_actions) 61 | self.update_target_net() 62 | # self.target_net.eval() 63 | for param in self.target_net.parameters(): 64 | param.requires_grad = False 65 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr) 66 | # self.optimiser = optim.RMSprop(self.online_net.parameters(), lr=args.lr, 67 | # alpha=args.alpha, momentum=args.momentum, 68 | # eps=args.eps_rmsprop) 69 | if args.cuda: 70 | self.online_net.cuda() 71 | self.target_net.cuda() 72 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 73 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 74 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 75 | 76 | def update_target_net(self): 77 | self.target_net.load_state_dict(self.online_net.state_dict()) 78 | 79 | # Acts based on single state (no batch) 80 | def act(self, state, eval=False): 81 | if eval: 82 | self.online_net.eval() 83 | else: 84 | self.online_net.train() 85 | # self.online_net.reset_noise() 86 | state = Variable(self.FloatTensor(state / 255.0)) 87 | return self.online_net(state).data.max(1)[1][0] 88 | 89 | def learn(self, states, actions, rewards, next_states, terminals): 90 | self.online_net.train() 91 | self.target_net.eval() 92 | self.online_net.reset_noise() 93 | # self.target_net.reset_noise() 94 | states = Variable(self.FloatTensor(states / 255.0)) 95 | actions = Variable(self.LongTensor(actions)) 96 | next_states = Variable(self.FloatTensor(next_states / 255.0)) 97 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 98 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 99 | 100 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 101 | # columns of actions taken 102 | 103 | state_action_values = self.online_net(states).gather(1, actions.view(-1, 1)) 104 | 105 | if 
self.double_q: 106 | next_actions = self.online_net(next_states).max(1)[1] 107 | next_state_values = self.target_net(next_states).gather(1, next_actions.view(-1, 1)) 108 | else: 109 | next_state_values = self.target_net(next_states).max(1)[0].view(-1, 1) 110 | 111 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values 112 | 113 | td_errors = F.smooth_l1_loss(state_action_values, target_state_action_values.detach()) 114 | # loss = td_errors 115 | kl_reg = self.online_net.get_reg() 116 | loss = td_errors + self.kl_coeff * kl_reg 117 | # Optimize the model 118 | self.optimiser.zero_grad() 119 | loss.backward() 120 | clip_grad_norm_(self.online_net.parameters(), 10) 121 | # for param in self.online_net.parameters(): 122 | # param.grad.data.clamp_(-1, 1) 123 | self.optimiser.step() 124 | 125 | return td_errors, kl_reg, loss 126 | # return loss 127 | -------------------------------------------------------------------------------- /qlearn/atari/bootstrapped_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import os 9 | import torch 10 | from torch import optim 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | from torch.nn.utils import clip_grad_norm_ 15 | 16 | from qlearn.commun.utils import initialize_weights 17 | 18 | 19 | class AtariBootstrappedDQN(nn.Module): 20 | def __init__(self, args, input_dim, num_actions): 21 | nn.Module.__init__(self) 22 | 23 | self.features = nn.Sequential( 24 | nn.Conv2d(input_dim, 32, 8, stride=4, padding=1), 25 | nn.ReLU(inplace=True), 26 | nn.Conv2d(32, 64, 4, stride=2), 27 | nn.ReLU(inplace=True), 28 | nn.Conv2d(64, 64, 3), 29 | nn.ReLU(inplace=True) 30 | ) 31 | 32 | self.nheads = args.nheads 33 | self.heads = nn.ModuleList([nn.Sequential(nn.Linear(3136, 512), 34 | nn.ReLU(inplace=True), 35 | nn.Linear(512, num_actions)) for _ in range(args.nheads)]) 36 | 37 | initialize_weights(self) 38 | 39 | def forward_single_head(self, x, k): 40 | x = self.features(x) 41 | x = x.view(-1, 3136) 42 | x = self.heads[k](x) 43 | return x 44 | 45 | def forward(self, x): 46 | x = self.features(x) 47 | x = x.view(-1, 3136) 48 | out = [] 49 | for head in self.heads: 50 | out.append(head(x)) 51 | return out 52 | 53 | class AtariBootstrappedAgent(object): 54 | def __init__(self, args, input_dim, num_actions): 55 | self.num_actions = num_actions 56 | self.batch_size = args.batch_size 57 | self.discount = args.discount 58 | self.double_q = args.double_q 59 | self.input_dim = input_dim 60 | self.nheads = args.nheads 61 | self.online_net = AtariBootstrappedDQN(args, input_dim, num_actions) 62 | if args.model and os.path.isfile(args.model): 63 | self.online_net.load_state_dict(torch.load(args.model)) 64 | self.online_net.train() 65 | 66 | self.target_net = AtariBootstrappedDQN(args, input_dim, num_actions) 67 | self.update_target_net() 68 | self.target_net.eval() 69 | for param in self.target_net.parameters(): 70 | param.requires_grad = False 71 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr) 72 | # self.optimiser = optim.RMSprop(self.online_net.parameters(), lr=args.lr, 73 | # alpha=args.alpha, momentum=args.momentum, 74 | # eps=args.eps_rmsprop) 75 | if args.cuda: 76 | self.online_net.cuda() 77 | 
self.target_net.cuda() 78 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 79 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 80 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 81 | 82 | def update_target_net(self): 83 | self.target_net.load_state_dict(self.online_net.state_dict()) 84 | 85 | # Acts based on single state (no batch) 86 | def act_single_head(self, state, k): 87 | # self.online_net.eval() 88 | state = Variable(self.FloatTensor(state / 255.0)) 89 | return self.online_net.forward_single_head(state, k).data.max(1)[1][0] 90 | 91 | 92 | def learn(self, states, actions, rewards, next_states, terminals): 93 | self.online_net.train() 94 | self.target_net.train() 95 | states = Variable(self.FloatTensor(states / 255.0)) 96 | actions = Variable(self.LongTensor(actions)) 97 | next_states = Variable(self.FloatTensor(next_states / 255.0)) 98 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 99 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 100 | 101 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 102 | # columns of actions taken 103 | online_outputs = self.online_net(states) 104 | target_outputs = self.target_net(next_states) 105 | loss = 0 106 | for k in range(self.nheads): 107 | state_action_values = online_outputs[k].gather(1, actions.view(-1, 1)) 108 | 109 | next_state_values = target_outputs[k].max(1)[0].view(-1, 1) 110 | 111 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values 112 | 113 | loss += F.smooth_l1_loss(state_action_values, target_state_action_values.detach()) 114 | 115 | # Optimize the model 116 | self.optimiser.zero_grad() 117 | loss.backward() 118 | clip_grad_norm_(self.online_net.parameters(), 10) 119 | # for param in self.online_net.parameters(): 120 | # param.grad.data.clamp_(-1, 1) 121 | self.optimiser.step() 122 | 123 | return loss 124 | -------------------------------------------------------------------------------- /qlearn/atari/dqn_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import random 10 | import torch 11 | from torch import optim 12 | import os 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from torch.autograd import Variable 16 | from torch.nn.utils import clip_grad_norm_ 17 | 18 | from qlearn.commun.utils import initialize_weights 19 | 20 | 21 | class AtariDQN(nn.Module): 22 | def __init__(self, args, input_dim, num_actions): 23 | nn.Module.__init__(self) 24 | self.conv1 = nn.Conv2d(input_dim, 32, 8, stride=4, padding=1) 25 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 26 | self.conv3 = nn.Conv2d(64, 64, 3) 27 | self.fc1 = nn.Linear(3136, 512) 28 | self.fc2 = nn.Linear(512, num_actions) 29 | initialize_weights(self) 30 | 31 | def forward(self, x): 32 | x = F.relu(self.conv1(x)) 33 | x = F.relu(self.conv2(x)) 34 | x = F.relu(self.conv3(x)) 35 | x = x.view(-1, 3136) 36 | x = F.relu(self.fc1(x)) 37 | x = self.fc2(x) 38 | return x 39 | 40 | 41 | class AtariDQNAgent(object): 42 | def __init__(self, args, input_dim, num_actions): 43 | self.num_actions = num_actions 44 | self.batch_size = args.batch_size 45 | self.discount = args.discount 46 | self.double_q = args.double_q 47 | self.input_dim = input_dim 48 | 49 | self.online_net = AtariDQN(args, input_dim, num_actions) 50 | if args.model and os.path.isfile(args.model): 51 | self.online_net.load_state_dict(torch.load(args.model)) 52 | self.online_net.train() 53 | 54 | self.target_net = AtariDQN(args, input_dim, num_actions) 55 | self.update_target_net() 56 | self.target_net.eval() 57 | for param in self.target_net.parameters(): 58 | param.requires_grad = False 59 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr) 60 | # self.optimiser = optim.RMSprop(self.online_net.parameters(), lr=args.lr, 61 | # alpha=args.alpha, momentum=args.momentum, 62 | # eps=args.eps_rmsprop) 63 | if args.cuda: 64 | self.online_net.cuda() 65 | self.target_net.cuda() 66 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 67 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 68 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 69 | 70 | # Acts based on single state (no batch) 71 | def act(self, state, eval=None): 72 | self.online_net.eval() 73 | state = Variable(self.FloatTensor(state / 255.0)) 74 | return self.online_net(state).data.max(1)[1][0] 75 | 76 | # Acts with an epsilon-greedy policy 77 | def act_e_greedy(self, state, update_eps=0.01): 78 | return random.randrange(self.num_actions) if random.random() < update_eps else self.act(state) 79 | 80 | def update_target_net(self): 81 | self.target_net.load_state_dict(self.online_net.state_dict()) 82 | 83 | def learn(self, states, actions, rewards, next_states, terminals): 84 | self.online_net.train() 85 | states = Variable(self.FloatTensor(states / 255.0)) 86 | actions = Variable(self.LongTensor(actions)) 87 | next_states = Variable(self.FloatTensor(next_states / 255.0)) 88 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 89 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 90 | 91 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 92 | # columns of actions taken 93 | state_action_values = self.online_net(states).gather(1, actions.view(-1, 1)) 94 | 95 | if self.double_q: 96 | next_actions = self.online_net(next_states).max(1)[1] 97 | next_state_values = self.target_net(next_states).gather(1, next_actions.view(-1, 1)) 98 | else: 99 | next_state_values = self.target_net(next_states).max(1)[0].view(-1, 1) 
100 | 101 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values.view(-1, 1) 102 | 103 | loss = F.smooth_l1_loss(state_action_values, target_state_action_values.detach()) 104 | 105 | # Optimize the model 106 | self.optimiser.zero_grad() 107 | loss.backward() 108 | clip_grad_norm_(self.online_net.parameters(), 10) 109 | # for param in self.online_net.parameters(): 110 | # param.grad.data.clamp_(-1, 1) 111 | self.optimiser.step() 112 | 113 | return loss 114 | -------------------------------------------------------------------------------- /qlearn/atari/local_mnf_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import random 11 | import torch 12 | from torch import optim 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from torch.autograd import Variable 16 | from torch.nn.utils import clip_grad_norm_ 17 | 18 | from qlearn.commun.utils import initialize_weights 19 | from qlearn.commun.local_mnf_layer import MNFLinear 20 | 21 | 22 | class AtariMNFDQN(nn.Module): 23 | def __init__(self, args, input_dim, num_actions): 24 | nn.Module.__init__(self) 25 | self.conv1 = nn.Conv2d(input_dim, 32, 8, stride=4, padding=1) 26 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 27 | self.conv3 = nn.Conv2d(64, 64, 3) 28 | self.fc1 = MNFLinear(3136, 512, args.hidden_dim, args.n_hidden, args.n_flows_q, args.n_flows_r, use_cuda=args.cuda) 29 | self.fc2 = MNFLinear(512, num_actions, args.hidden_dim, args.n_hidden, args.n_flows_q, args.n_flows_r, use_cuda=args.cuda) 30 | initialize_weights(self) 31 | 32 | def forward(self, x, same_noise=False): 33 | x = F.relu(self.conv1(x)) 34 | x = F.relu(self.conv2(x)) 35 | x = F.relu(self.conv3(x)) 36 | x = x.view(-1, 3136) 37 | x = F.relu(self.fc1(x, same_noise=same_noise)) 38 | x = self.fc2(x, same_noise=same_noise) 39 | return x 40 | 41 | def reset_noise(self): 42 | self.fc1.reset_noise() 43 | self.fc2.reset_noise() 44 | 45 | def kldiv(self): 46 | kldiv1 = self.fc1.kldiv() 47 | kldiv2 = self.fc2.kldiv() 48 | return kldiv1 + kldiv2 49 | 50 | 51 | class AtariMNFAgent(object): 52 | def __init__(self, args, input_dim, num_actions): 53 | self.num_actions = num_actions 54 | self.batch_size = args.batch_size 55 | self.discount = args.discount 56 | self.double_q = args.double_q 57 | self.input_dim = input_dim 58 | self.kl_coeff = float(args.alpha) / args.replay_buffer_size 59 | 60 | self.online_net = AtariMNFDQN(args, input_dim, num_actions) 61 | if args.model and os.path.isfile(args.model): 62 | self.online_net.load_state_dict(torch.load(args.model)) 63 | self.online_net.train() 64 | 65 | self.target_net = AtariMNFDQN(args, input_dim, num_actions) 66 | self.update_target_net() 67 | self.target_net.eval() 68 | for param in self.target_net.parameters(): 69 | param.requires_grad = False 70 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr) 71 | # self.optimiser = optim.RMSprop(self.online_net.parameters(), lr=args.lr, 72 | # alpha=args.alpha, momentum=args.momentum, 73 | # eps=args.eps_rmsprop) 74 | if args.cuda: 75 | self.online_net.cuda() 76 | self.target_net.cuda() 77 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 78 | self.LongTensor = torch.cuda.LongTensor if args.cuda else 
torch.LongTensor 79 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 80 | 81 | def update_target_net(self): 82 | self.target_net.load_state_dict(self.online_net.state_dict()) 83 | 84 | # Acts based on single state (no batch) 85 | def act(self, state, eval=False): 86 | if eval: 87 | self.online_net.eval() 88 | else: 89 | self.online_net.train() 90 | state = Variable(self.FloatTensor(state / 255.0)) 91 | return self.online_net(state, same_noise=True).data.max(1)[1][0] 92 | 93 | def learn(self, states, actions, rewards, next_states, terminals): 94 | self.online_net.train() 95 | self.online_net.reset_noise() 96 | self.target_net.eval() 97 | states = Variable(self.FloatTensor(states / 255.0)) 98 | actions = Variable(self.LongTensor(actions)) 99 | next_states = Variable(self.FloatTensor(next_states / 255.0)) 100 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 101 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 102 | 103 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 104 | # columns of actions taken 105 | state_values = self.online_net(states, same_noise=False) 106 | kldiv = self.online_net.kldiv() 107 | state_action_values = state_values.gather(1, actions.view(-1, 1)) 108 | 109 | if self.double_q: 110 | next_actions = self.online_net(next_states, same_noise=False).max(1)[1] 111 | next_state_values = self.target_net(next_states).gather(1, next_actions.view(-1, 1)) 112 | else: 113 | next_state_values = self.target_net(next_states).max(1)[0] 114 | 115 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values.view(-1, 1) 116 | 117 | td_errors = F.smooth_l1_loss(state_action_values, target_state_action_values.detach(), size_average=True) 118 | 119 | loss = td_errors + self.kl_coeff * kldiv 120 | 121 | # Optimize the model 122 | self.optimiser.zero_grad() 123 | loss.backward() 124 | clip_grad_norm_(self.online_net.parameters(), 10) 125 | # for param in self.online_net.parameters(): 126 | # param.grad.data.clamp_(-1, 1) 127 | self.optimiser.step() 128 | self.online_net.reset_noise() 129 | return td_errors, kldiv, loss 130 | -------------------------------------------------------------------------------- /qlearn/atari/mnf_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import os 10 | import random 11 | import torch 12 | from torch import optim 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | from torch.autograd import Variable 16 | from torch.nn.utils import clip_grad_norm_ 17 | 18 | from qlearn.commun.utils import initialize_weights 19 | from qlearn.commun.mnf_layer import MNFLinear 20 | 21 | 22 | class AtariMNFDQN(nn.Module): 23 | def __init__(self, args, input_dim, num_actions): 24 | nn.Module.__init__(self) 25 | self.conv1 = nn.Conv2d(input_dim, 32, 8, stride=4, padding=1) 26 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 27 | self.conv3 = nn.Conv2d(64, 64, 3) 28 | self.fc1 = MNFLinear(3136, 512, args.hidden_dim, args.n_hidden, args.n_flows_q, args.n_flows_r) 29 | self.fc2 = MNFLinear(512, num_actions, args.hidden_dim, args.n_hidden, args.n_flows_q, args.n_flows_r) 30 | initialize_weights(self) 31 | 32 | def forward(self, x, kl=True): 33 | x = F.relu(self.conv1(x)) 34 | x = F.relu(self.conv2(x)) 35 | x = F.relu(self.conv3(x)) 36 | x = x.view(-1, 3136) 37 | if kl: 38 | x, kldiv1 = self.fc1(x, kl=True) 39 | x = F.relu(x) 40 | x, kldiv2 = self.fc2(x, kl=True) 41 | kldiv = kldiv1 + kldiv2 42 | return x, kldiv 43 | else: 44 | x = F.relu(self.fc1(x, kl=False)) 45 | x = self.fc2(x, kl=False) 46 | return x 47 | 48 | def reset_noise(self): 49 | self.fc1.reset_noise() 50 | self.fc2.reset_noise() 51 | 52 | 53 | class AtariMNFAgent(object): 54 | def __init__(self, args, input_dim, num_actions): 55 | self.num_actions = num_actions 56 | self.batch_size = args.batch_size 57 | self.discount = args.discount 58 | self.double_q = args.double_q 59 | self.input_dim = input_dim 60 | self.kl_coeff = float(args.alpha) / args.replay_buffer_size 61 | 62 | self.online_net = AtariMNFDQN(args, input_dim, num_actions) 63 | if args.model and os.path.isfile(args.model): 64 | self.online_net.load_state_dict(torch.load(args.model)) 65 | self.online_net.train() 66 | 67 | self.target_net = AtariMNFDQN(args, input_dim, num_actions) 68 | self.update_target_net() 69 | self.target_net.eval() 70 | for param in self.target_net.parameters(): 71 | param.requires_grad = False 72 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr) 73 | # self.optimiser = optim.RMSprop(self.online_net.parameters(), lr=args.lr, 74 | # alpha=args.alpha, momentum=args.momentum, 75 | # eps=args.eps_rmsprop) 76 | if args.cuda: 77 | self.online_net.cuda() 78 | self.target_net.cuda() 79 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 80 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 81 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 82 | 83 | def update_target_net(self): 84 | self.target_net.load_state_dict(self.online_net.state_dict()) 85 | 86 | # Acts based on single state (no batch) 87 | def act(self, state, eval=False): 88 | if eval: 89 | self.online_net.eval() 90 | else: 91 | self.online_net.train() 92 | state = Variable(self.FloatTensor(state / 255.0)) 93 | return self.online_net(state, kl=False).data.max(1)[1][0] 94 | 95 | def learn(self, states, actions, rewards, next_states, terminals): 96 | self.online_net.train() 97 | self.online_net.reset_noise() 98 | self.target_net.eval() 99 | states = Variable(self.FloatTensor(states / 255.0)) 100 | actions = Variable(self.LongTensor(actions)) 101 | next_states = Variable(self.FloatTensor(next_states / 255.0)) 102 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 103 | terminals = 
Variable(self.FloatTensor(terminals)).view(-1, 1) 104 | 105 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 106 | # columns of actions taken 107 | state_values, kldiv = self.online_net(states, kl=True) 108 | state_action_values = state_values.gather(1, actions.view(-1, 1)) 109 | 110 | if self.double_q: 111 | next_actions = self.online_net(next_states, kl=False).max(1)[1] 112 | next_state_values = self.target_net(next_states, kl=False).gather(1, next_actions.view(-1, 1)) 113 | else: 114 | next_state_values = self.target_net(next_states, kl=False).max(1)[0] 115 | 116 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values.view(-1, 1) 117 | 118 | td_errors = F.smooth_l1_loss(state_action_values, target_state_action_values.detach(), size_average=True) 119 | 120 | loss = td_errors + self.kl_coeff * kldiv 121 | 122 | # Optimize the model 123 | self.optimiser.zero_grad() 124 | loss.backward() 125 | clip_grad_norm_(self.online_net.parameters(), 10) 126 | # for param in self.online_net.parameters(): 127 | # param.grad.data.clamp_(-1, 1) 128 | self.optimiser.step() 129 | 130 | return td_errors, kldiv, loss 131 | -------------------------------------------------------------------------------- /qlearn/atari/noisy_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import torch 11 | from torch import optim 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch.autograd import Variable 15 | from torch.nn.utils import clip_grad_norm_ 16 | 17 | from qlearn.commun.utils import initialize_weights 18 | from qlearn.commun.noisy_layer import NoisyLinear 19 | 20 | 21 | class AtariNoisyDQN(nn.Module): 22 | def __init__(self, args, input_dim, num_actions): 23 | nn.Module.__init__(self) 24 | self.conv1 = nn.Conv2d(input_dim, 32, 8, stride=4, padding=1) 25 | self.conv2 = nn.Conv2d(32, 64, 4, stride=2) 26 | self.conv3 = nn.Conv2d(64, 64, 3) 27 | self.fc1 = NoisyLinear(3136, 512) 28 | self.fc2 = NoisyLinear(512, num_actions) 29 | initialize_weights(self) 30 | 31 | def forward(self, x): 32 | x = F.relu(self.conv1(x)) 33 | x = F.relu(self.conv2(x)) 34 | x = F.relu(self.conv3(x)) 35 | x = x.view(-1, 3136) 36 | x = F.relu(self.fc1(x)) 37 | x = self.fc2(x) 38 | return x 39 | 40 | def reset_noise(self): 41 | self.fc1.reset_noise() 42 | self.fc2.reset_noise() 43 | 44 | class AtariNoisyAgent(object): 45 | def __init__(self, args, input_dim, num_actions): 46 | self.num_actions = num_actions 47 | self.batch_size = args.batch_size 48 | self.discount = args.discount 49 | self.double_q = args.double_q 50 | self.input_dim = input_dim 51 | self.online_net = AtariNoisyDQN(args, input_dim, num_actions) 52 | if args.model and os.path.isfile(args.model): 53 | self.online_net.load_state_dict(torch.load(args.model)) 54 | self.online_net.train() 55 | 56 | self.target_net = AtariNoisyDQN(args, input_dim, num_actions) 57 | self.update_target_net() 58 | self.target_net.eval() 59 | for param in self.target_net.parameters(): 60 | param.requires_grad = False 61 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr) 62 | # self.optimiser = optim.RMSprop(self.online_net.parameters(), lr=args.lr, 63 | # alpha=args.alpha, momentum=args.momentum, 64 | # 
eps=args.eps_rmsprop) 65 | if args.cuda: 66 | self.online_net.cuda() 67 | self.target_net.cuda() 68 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 69 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 70 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 71 | 72 | def update_target_net(self): 73 | self.target_net.load_state_dict(self.online_net.state_dict()) 74 | 75 | # Acts based on single state (no batch) 76 | def act(self, state, eval=False): 77 | if eval: 78 | self.online_net.eval() 79 | else: 80 | self.online_net.train() 81 | self.online_net.reset_noise() 82 | state = Variable(self.FloatTensor(state / 255.0)) 83 | return self.online_net(state).data.max(1)[1][0] 84 | 85 | def learn(self, states, actions, rewards, next_states, terminals): 86 | self.online_net.train() 87 | self.target_net.train() 88 | self.online_net.reset_noise() 89 | self.target_net.reset_noise() 90 | states = Variable(self.FloatTensor(states / 255.0)) 91 | actions = Variable(self.LongTensor(actions)) 92 | next_states = Variable(self.FloatTensor(next_states / 255.0)) 93 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 94 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 95 | 96 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 97 | # columns of actions taken 98 | 99 | state_action_values = self.online_net(states).gather(1, actions.view(-1, 1)) 100 | 101 | if self.double_q: 102 | next_actions = self.online_net(next_states).max(1)[1] 103 | next_state_values = self.target_net(next_states).gather(1, next_actions.view(-1, 1)) 104 | else: 105 | next_state_values = self.target_net(next_states).max(1)[0].view(-1, 1) 106 | 107 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values 108 | 109 | loss = F.smooth_l1_loss(state_action_values, target_state_action_values.detach()) 110 | 111 | # Optimize the model 112 | self.optimiser.zero_grad() 113 | loss.backward() 114 | clip_grad_norm_(self.online_net.parameters(), 10) 115 | # for param in self.online_net.parameters(): 116 | # param.grad.data.clamp_(-1, 1) 117 | self.optimiser.step() 118 | 119 | return loss 120 | -------------------------------------------------------------------------------- /qlearn/atari/prior_bootstrapped_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import os 10 | import torch 11 | from torch import optim 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch.autograd import Variable 15 | from torch.nn.utils import clip_grad_norm_ 16 | 17 | from qlearn.commun.utils import initialize_weights 18 | from qlearn.atari.bootstrapped_agent import AtariBootstrappedDQN 19 | 20 | class AtariPriorBootstrappedAgent(object): 21 | def __init__(self, args, input_dim, num_actions): 22 | self.num_actions = num_actions 23 | self.batch_size = args.batch_size 24 | self.discount = args.discount 25 | self.double_q = args.double_q 26 | self.input_dim = input_dim 27 | self.nheads = args.nheads 28 | self.beta = args.beta 29 | self.online_net = AtariBootstrappedDQN(args, input_dim, num_actions) 30 | self.prior = AtariBootstrappedDQN(args, input_dim, num_actions) 31 | 32 | # if args.model and os.path.isfile(args.model): 33 | # self.online_net.load_state_dict(torch.load(args.model)) 34 | self.online_net.train() 35 | self.prior.eval() 36 | for param in self.prior.parameters(): 37 | param.requires_grad = False 38 | 39 | self.target_net = AtariBootstrappedDQN(args, input_dim, num_actions) 40 | self.update_target_net() 41 | self.target_net.eval() 42 | for param in self.target_net.parameters(): 43 | param.requires_grad = False 44 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr) 45 | # self.optimiser = optim.RMSprop(self.online_net.parameters(), lr=args.lr, 46 | # alpha=args.alpha, momentum=args.momentum, 47 | # eps=args.eps_rmsprop) 48 | if args.cuda: 49 | self.online_net.cuda() 50 | self.target_net.cuda() 51 | self.prior.cuda() 52 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 53 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 54 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 55 | 56 | def update_target_net(self): 57 | self.target_net.load_state_dict(self.online_net.state_dict()) 58 | 59 | # Acts based on single state (no batch) 60 | def act_single_head(self, state, k): 61 | # self.online_net.eval() 62 | state = Variable(self.FloatTensor(state / 255.0)) 63 | value = self.online_net.forward_single_head(state, k) \ 64 | + self.beta * self.prior.forward_single_head(state, k) 65 | action = value.data.max(1)[1][0] 66 | return action.cpu().item() 67 | 68 | 69 | def learn(self, states, actions, rewards, next_states, terminals): 70 | self.online_net.train() 71 | self.target_net.train() 72 | states = Variable(self.FloatTensor(states / 255.0)) 73 | actions = Variable(self.LongTensor(actions)) 74 | next_states = Variable(self.FloatTensor(next_states / 255.0)) 75 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 76 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 77 | 78 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 79 | # columns of actions taken 80 | online_prior_outputs = self.prior(states) 81 | online_outputs = self.online_net(states) 82 | # online_values = online_prior_outputs + online_outputs 83 | 84 | target_prior_outputs = self.prior(next_states) 85 | target_outputs = self.target_net(next_states) 86 | # import pdb; pdb.set_trace() 87 | 88 | loss = 0 89 | for k in range(self.nheads): 90 | online_prior_output_ = online_prior_outputs[k].detach() 91 | online_output_ = online_outputs[k] 92 | online_value = self.beta * online_prior_output_ + online_output_ 93 | state_action_values = online_value.gather(1, actions.view(-1, 1)) 94 | 95 | target_prior_output_ = 
target_prior_outputs[k] 96 | target_output_ = target_outputs[k] 97 | target_value = self.beta * target_prior_output_ + target_output_ 98 | next_state_values = target_value.max(1)[0].view(-1, 1) 99 | 100 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values 101 | 102 | loss += F.smooth_l1_loss(state_action_values, target_state_action_values.detach()) 103 | 104 | # Optimize the model 105 | self.optimiser.zero_grad() 106 | loss.backward() 107 | clip_grad_norm_(self.online_net.parameters(), 10) 108 | # for param in self.online_net.parameters(): 109 | # param.grad.data.clamp_(-1, 1) 110 | self.optimiser.step() 111 | 112 | return loss 113 | -------------------------------------------------------------------------------- /qlearn/atari/train_bayes_backprop_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import csv 11 | import argparse 12 | import time 13 | import random 14 | import pickle 15 | import numpy as np 16 | import torch 17 | from tensorboardX import SummaryWriter 18 | 19 | from baselines import logger 20 | from baselines.deepq.replay_buffer import ReplayBuffer 21 | from baselines.common.misc_util import boolean_flag, pretty_eta, RunningAvg 22 | from baselines.common.schedules import LinearSchedule, PiecewiseSchedule 23 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 24 | 25 | from qlearn.atari.bayes_backprop_agent import AtariBayesBackpropAgent 26 | 27 | 28 | GAMES = ['BeamRiderNoFrameskip-v4', 29 | 'SpaceInvadersNoFrameskip-v4', 30 | 'BreakoutNoFrameskip-v4', 31 | 'EnduroNoFrameskip-v4', 32 | 'QbertNoFrameskip-v4', 33 | 'SeaquestNoFrameskip-v4', 34 | 'AlienNoFrameskip-v4', 35 | 'AmidarNoFrameskip-v4', 36 | 'FrostbiteNoFrameskip-v4', 37 | 'TutankhamNoFrameskip-v4', 38 | 'BankHeistNoFrameskip-v4', 39 | 'AsterixNoFrameskip-v4', 40 | 'GravitarNoFrameskip-v4'] 41 | 42 | # SEEDS = [123, 55, 104] 43 | SEEDS = [486, 750, 352, 93, 86] 44 | ALPHAS = [0.01, 0.001, 0.0001, 0.00001] 45 | 46 | RUN_ID = [] 47 | for seed in SEEDS: 48 | for game in GAMES: 49 | for alpha in ALPHAS: 50 | RUN_ID.append((seed, game, alpha)) 51 | 52 | 53 | def parse_args(): 54 | parser = argparse.ArgumentParser("Bayes backprop DQN experiments for Atari games") 55 | # Environment 56 | parser.add_argument("--env", type=str, default='PongNoFrameskip-v4', help="name of game") 57 | parser.add_argument("--seed", type=int, default=42, help="which seed to use") 58 | # Core DQN parameters 59 | parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size") 60 | parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer") 61 | # parser.add_argument("--lr", type=float, default=2.5e-4, help="learning rate for RMSprop optimizer") 62 | # parser.add_argument("--alpha", type=float, default=0.95, help="alpha (squared gradient momentum) parameter for RMSprop optimizer") 63 | # parser.add_argument("--momentum", type=float, default=0.95, help="momentum parameter for RMSprop optimizer") 64 | # parser.add_argument("--eps-rmsprop", type=float, default=0.01, help="epsilon (min squared gradient) parameter for RMSprop optimizer") 65 | parser.add_argument("--num-steps", type=int, default=int(1e7), 66 | help="total number of steps to 
run the environment for") 67 | parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time") 68 | parser.add_argument("--learning-freq", type=int, default=4, 69 | help="number of iterations between every optimization step") 70 | parser.add_argument("--target-update-freq", type=int, default=10000, 71 | help="number of iterations between every target network update") 72 | parser.add_argument("--learning-starts", type=int, default=50000, 73 | help="number of iterations after which learning starts") 74 | boolean_flag(parser, "double-q", default=False, help="whether or not to use double q learning") 75 | 76 | # Checkpointing 77 | parser.add_argument("--log-dir", type=str, default="log_dir", 78 | help="directory in which tensorboard events will be written out.") 79 | parser.add_argument("--save-dir", type=str, default="save_dir", 80 | help="directory in which training state and model will be saved") 81 | parser.add_argument("--save-freq", type=int, default=int(1e6), 82 | help="save model once every time this many iterations are completed") 83 | 84 | # parser.add_argument("--final-exploration", type=float, default=0.1, 85 | # help="final value of epsilon in epsilon greedy exploration") 86 | # parser.add_argument("--final-exploration-frame", type=int, default=int(1e6), 87 | # help="the number of frames over which the initial value of epsilon is linearly annealed to its final value") 88 | # New options 89 | parser.add_argument("--print-freq", type=int, default=100, help="printing frequency") 90 | parser.add_argument("--run-index", type=int, default=None, help="index RUN_ID") 91 | parser.add_argument("--cuda", type=int, default=1, help="whether or not to use cuda") 92 | parser.add_argument("--agent", type=str, default="BayesBackpropDQN", help="which agent to run") 93 | parser.add_argument("--discount", type=float, default=0.99, help="discount factor") 94 | # approximate posterior 95 | parser.add_argument('--hidden_dim', type=int, default=50, help='number of hidden units used in normalizing flows') 96 | parser.add_argument('--n-hidden', type=int, default=0, help='number of hidden layers used in normalizing flows') 97 | parser.add_argument('--n-flows-q', type=int, default=1, help='number of normalizing flows used for the approximate posterior q') 98 | parser.add_argument('--n-flows-r', type=int, default=1, help='number of normalizing flows used for the auxiliary posterior r') 99 | parser.add_argument('--alpha', type=float, default=0.001, help='trade-off parameter between the KL and likelihood terms') 100 | 101 | return parser.parse_args() 102 | 103 | 104 | if __name__ == '__main__': 105 | args = parse_args() 106 | if args.run_index is not None: 107 | args.seed, args.env, args.alpha = RUN_ID[args.run_index] 108 | 109 | print(' ' * 26 + 'Options') 110 | for k, v in vars(args).items(): 111 | print(' ' * 26 + k + ': ' + str(v)) 112 | 113 | if not os.path.exists(args.save_dir): 114 | os.mkdir(args.save_dir) 115 | # Log 116 | date = time.strftime('%Y-%m-%d.%H%M') 117 | log_dir = '{}/{}-{}-seed-{}-alpha-{}-{}'.format(args.log_dir, args.env, args.agent, args.seed, args.alpha, date) 118 | save_dir = '{}/{}-{}-seed-{}-alpha-{}-{}'.format(args.save_dir, args.env, args.agent, args.seed, args.alpha, date) 119 | 120 | log = SummaryWriter(log_dir) 121 | print('Writing logs to {}'.format(log_dir)) 122 | 123 | if not os.path.exists(save_dir): 124 | os.mkdir(save_dir) 125 | 126 | # with open(save_dir + '/error_monitor.csv', "wt") as monitor_file: 127 | # monitor = 
csv.writer(monitor_file) 128 | # monitor.writerow(['update', 'error', str(int(args.num_steps / args.learning_freq))]) 129 | 130 | with open(save_dir + '/reward_monitor.csv', "wt") as monitor_file: 131 | monitor = csv.writer(monitor_file) 132 | monitor.writerow(['epoch', 'reward', str(args.num_steps)]) 133 | 134 | with open(save_dir + "/params.pkl", 'wb') as f: 135 | pickle.dump(args, f) 136 | 137 | # Create and seed the env. 138 | env = make_atari(args.env) 139 | env = wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=False) 140 | env.seed(args.seed) 141 | torch.cuda.manual_seed(args.seed) 142 | torch.manual_seed(args.seed) 143 | np.random.seed(args.seed) 144 | random.seed(args.seed) 145 | 146 | # TODO 147 | num_actions = env.action_space.n 148 | agent = AtariBayesBackpropAgent(args, env.observation_space.shape[-1], num_actions) 149 | 150 | # exploration = LinearSchedule(args.final_exploration_frame, args.final_exploration, 1) 151 | 152 | replay_buffer = ReplayBuffer(args.replay_buffer_size) 153 | 154 | start_time, start_steps = None, None 155 | steps_per_iter = RunningAvg(0.999) 156 | iteration_time_est = RunningAvg(0.999) 157 | obs = env.reset() 158 | agent.online_net.reset_noise() 159 | num_iters = 0 160 | num_episodes = 0 161 | num_updates = 0 162 | prev_lives = None 163 | episode_rewards = [0.0] 164 | td_errors_list = [] 165 | best_score = None 166 | 167 | # Main training loop 168 | while True: 169 | num_iters += 1 170 | 171 | # Take action and store transition in the replay buffer. 172 | if num_iters <= args.learning_starts: 173 | action = random.randrange(num_actions) 174 | else: 175 | # Reshape state to (1, channels, x_dim, y_dim) 176 | action = agent.act(np.transpose(np.array(obs)[None], [0, 3, 1, 2]), eval=False) 177 | # import pdb 178 | # pdb.set_trace() 179 | new_obs, rew, done, info = env.step(int(action)) 180 | death = done or (prev_lives is not None and info['ale.lives'] < prev_lives and info['ale.lives'] > 0) 181 | prev_lives = info['ale.lives'] 182 | # if death: 183 | # agent.online_net.reset_noise() 184 | 185 | replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death)) 186 | obs = new_obs 187 | episode_rewards[-1] += rew 188 | 189 | if done: 190 | log.add_scalar('reward', episode_rewards[-1], num_iters) 191 | episode_rewards.append(0.0) 192 | obs = env.reset() 193 | num_episodes += 1 194 | 195 | if num_iters > args.learning_starts and num_iters % args.learning_freq == 0: 196 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) 197 | # Reshape state to (batch, channels, x_dim, y_dim) 198 | obses_t = np.transpose(obses_t, [0, 3, 1, 2]) 199 | obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2]) 200 | 201 | # TODO 202 | td_errors, kl_reg, loss = agent.learn(obses_t, actions, rewards, obses_tp1, dones) 203 | # loss = agent.learn(obses_t, actions, rewards, obses_tp1, dones) 204 | td_errors_list.append(loss.item()) 205 | log.add_scalar('td_errors', td_errors.item(), num_iters) 206 | log.add_scalar('kl_reg', kl_reg.item(), num_iters) 207 | log.add_scalar('loss', loss.item(), num_iters) 208 | 209 | num_updates += 1 210 | 211 | # Update target network. 
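# update_target_net() does a hard update: the online network's weights are
# copied wholesale into the frozen target network every target-update-freq
# steps, and that target network supplies the bootstrap values used to build
# the TD targets in agent.learn() above.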
212 | if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0: 213 | # TODO 214 | agent.update_target_net() 215 | 216 | if start_time is not None: 217 | steps_per_iter.update(num_iters - start_steps) 218 | iteration_time_est.update(time.time() - start_time) 219 | start_time, start_steps = time.time(), num_iters 220 | 221 | if num_iters > args.num_steps: 222 | break 223 | 224 | if done and num_episodes % args.print_freq == 0 and num_episodes >= args.print_freq: 225 | steps_left = args.num_steps - num_iters 226 | completion = np.round(num_iters / args.num_steps, 1) 227 | mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) 228 | logger.record_tabular("% completion", completion) 229 | logger.record_tabular("total steps", num_iters) 230 | logger.record_tabular("episodes", num_episodes) 231 | logger.record_tabular("reward (100 epi mean)", mean_100ep_reward) 232 | #logger.record_tabular("exploration", exploration.value(num_iters)) 233 | 234 | fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6) 235 | if steps_per_iter._value is not None else "calculating...") 236 | logger.record_tabular("FPS", fps_estimate) 237 | logger.dump_tabular() 238 | logger.log() 239 | logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate))) 240 | logger.log() 241 | 242 | with open(save_dir + '/reward_monitor.csv', "a") as monitor_file: 243 | monitor = csv.writer(monitor_file) 244 | monitor.writerow([num_iters, mean_100ep_reward]) 245 | 246 | # if len(td_errors_list) > 0: 247 | # with open(save_dir + '/error_monitor.csv', "a") as monitor_file: 248 | # monitor = csv.writer(monitor_file) 249 | # monitor.writerow([num_updates, round(np.mean(td_errors_list), 4)]) 250 | 251 | if best_score is None or mean_100ep_reward > best_score: 252 | logger.log("Saving model due to mean reward increase: {} -> {}".format( 253 | best_score, mean_100ep_reward)) 254 | best_score = mean_100ep_reward 255 | torch.save(agent.online_net.state_dict(), log_dir + '/best_model.torch') 256 | 257 | torch.save(agent.online_net.state_dict(), save_dir + '/current_model.torch') 258 | # td_errors_list = [0.0] 259 | -------------------------------------------------------------------------------- /qlearn/atari/train_bootstrapped_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import os 10 | import csv 11 | import argparse 12 | import time 13 | import random 14 | import pickle 15 | import numpy as np 16 | import torch 17 | from tensorboardX import SummaryWriter 18 | 19 | from baselines import logger 20 | from baselines.deepq.replay_buffer import ReplayBuffer 21 | from baselines.common.misc_util import boolean_flag, pretty_eta, RunningAvg 22 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 23 | 24 | from qlearn.atari.bootstrapped_agent import AtariBootstrappedAgent 25 | 26 | 27 | GAMES = ['BeamRiderNoFrameskip-v4', 28 | 'SpaceInvadersNoFrameskip-v4', 29 | 'BreakoutNoFrameskip-v4', 30 | 'EnduroNoFrameskip-v4', 31 | 'QbertNoFrameskip-v4', 32 | 'SeaquestNoFrameskip-v4', 33 | 'AlienNoFrameskip-v4', 34 | 'AmidarNoFrameskip-v4', 35 | 'FrostbiteNoFrameskip-v4', 36 | 'TutankhamNoFrameskip-v4', 37 | 'BankHeistNoFrameskip-v4', 38 | 'AsterixNoFrameskip-v4', 39 | 'GravitarNoFrameskip-v4'] 40 | 41 | SEEDS = [486, 750, 352, 93, 86] 42 | 43 | RUN_ID = [] 44 | for seed in SEEDS: 45 | for game in GAMES: 46 | RUN_ID.append((seed, game)) 47 | 48 | 49 | def parse_args(): 50 | parser = argparse.ArgumentParser("Bootstrapped DQN experiments for Atari games") 51 | # Environment 52 | parser.add_argument("--env", type=str, default='PongNoFrameskip-v4', help="name of game") 53 | parser.add_argument("--seed", type=int, default=42, help="which seed to use") 54 | # Core DQN parameters 55 | parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size") 56 | parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer") 57 | # parser.add_argument("--lr", type=float, default=2.5e-4, help="learning rate for RMSprop optimizer") 58 | # parser.add_argument("--alpha", type=float, default=0.95, help="alpha (squared gradient momentum) parameter for RMSprop optimizer") 59 | # parser.add_argument("--momentum", type=float, default=0.95, help="momentum parameter for RMSprop optimizer") 60 | # parser.add_argument("--eps-rmsprop", type=float, default=0.01, help="epsilon (min squared gradient) parameter for RMSprop optimizer") 61 | parser.add_argument("--num-steps", type=int, default=int(1e7), 62 | help="total number of steps to run the environment for") 63 | parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time") 64 | parser.add_argument("--learning-freq", type=int, default=4, 65 | help="number of iterations between every optimization step") 66 | parser.add_argument("--target-update-freq", type=int, default=10000, 67 | help="number of iterations between every target network update") 68 | parser.add_argument("--learning-starts", type=int, default=50000, 69 | help="number of iterations after which learning starts") 70 | # boolean_flag(parser, "double-q", default=False, help="whether or not to use double q learning") 71 | parser.add_argument("--double-q", type=int, default=0, help="whether or not to use double q learning") 72 | # Checkpointing 73 | parser.add_argument("--log-dir", type=str, default="log_dir", 74 | help="directory in which tensorboard events will be written out.") 75 | parser.add_argument("--save-dir", type=str, default="save_dir", 76 | help="directory in which training state and model will be saved") 77 | parser.add_argument("--save-freq", type=int, default=int(1e6), 78 | help="save model once every time this many iterations are completed") 79 | 80 | parser.add_argument("--final-exploration", type=float, default=0.1, 81 | 
help="final value of epsilon in epsilon greedy exploration") 82 | parser.add_argument("--final-exploration-frame", type=int, default=int(1e6), 83 | help="the number of frames over which the initial value of epsilon is linearly annealed to its final value") 84 | # New options 85 | parser.add_argument("--print-freq", type=int, default=100, help="printing frequency") 86 | 87 | parser.add_argument("--run-index", type=int, default=None, help="index RUN_ID") 88 | parser.add_argument("--cuda", type=int, default=1, help="whether or not to use cuda") 89 | parser.add_argument("--agent", type=str, default="BoostrappedDQN", help="which agent to run") 90 | parser.add_argument("--discount", type=float, default=0.99, help="discount factor") 91 | parser.add_argument("--model", type=str, default=None, help="model directory to load") 92 | parser.add_argument('--nheads', type=int, default=10, help='number of heads in Bootstrapped DQN') 93 | 94 | return parser.parse_args() 95 | 96 | if __name__ == '__main__': 97 | args = parse_args() 98 | if args.run_index is not None: 99 | args.seed, args.env = RUN_ID[args.run_index] 100 | 101 | print(' ' * 26 + 'Options') 102 | for k, v in vars(args).items(): 103 | print(' ' * 26 + k + ': ' + str(v)) 104 | 105 | if not os.path.exists(args.save_dir): 106 | os.mkdir(args.save_dir) 107 | # Log 108 | date = time.strftime('%Y-%m-%d.%H%M') 109 | log_dir = '{}/{}-{}-seed-{}-{}'.format(args.log_dir, args.env, args.agent, args.seed, date) 110 | save_dir = '{}/{}-{}-seed-{}-{}'.format(args.save_dir, args.env, args.agent, args.seed, date) 111 | 112 | log = SummaryWriter(log_dir) 113 | print('Writing logs to {}'.format(log_dir)) 114 | 115 | if not os.path.exists(save_dir): 116 | os.mkdir(save_dir) 117 | 118 | # with open(save_dir + '/error_monitor.csv', "wt") as monitor_file: 119 | # monitor = csv.writer(monitor_file) 120 | # monitor.writerow(['update', 'error', str(int(args.num_steps / args.learning_freq))]) 121 | 122 | with open(save_dir + '/reward_monitor.csv', "wt") as monitor_file: 123 | monitor = csv.writer(monitor_file) 124 | monitor.writerow(['epoch', 'reward', str(args.num_steps)]) 125 | 126 | with open(save_dir + "/params.pkl", 'wb') as f: 127 | pickle.dump(args, f) 128 | 129 | # Create and seed the env. 130 | env = make_atari(args.env) 131 | env = wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=False) 132 | env.seed(args.seed) 133 | torch.cuda.manual_seed(args.seed) 134 | torch.manual_seed(args.seed) 135 | np.random.seed(args.seed) 136 | random.seed(args.seed) 137 | 138 | # TODO 139 | num_actions = env.action_space.n 140 | agent = AtariBootstrappedAgent(args, env.observation_space.shape[-1], num_actions) 141 | 142 | replay_buffer = ReplayBuffer(args.replay_buffer_size) 143 | 144 | start_time, start_steps = None, None 145 | steps_per_iter = RunningAvg(0.999) 146 | iteration_time_est = RunningAvg(0.999) 147 | obs = env.reset() 148 | num_iters = 0 149 | num_episodes = 0 150 | num_updates = 0 151 | prev_lives = None 152 | episode_rewards = [0.0] 153 | td_errors_list = [] 154 | best_score = None 155 | k = random.randrange(args.nheads) 156 | while True: 157 | 158 | num_iters += 1 159 | # Take action and store transition in the replay buffer. 
160 | if num_iters <= args.learning_starts: 161 | action = random.randrange(num_actions) 162 | else: 163 | # Reshape state to (1, channels, x_dim, y_dim) 164 | action = agent.act_single_head(np.transpose(np.array(obs)[None], [0, 3, 1, 2]), k) 165 | # import pdb 166 | # pdb.set_trace() 167 | new_obs, rew, done, info = env.step(action) 168 | death = done or (prev_lives is not None and info['ale.lives'] < prev_lives and info['ale.lives'] > 0) 169 | prev_lives = info['ale.lives'] 170 | 171 | replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death)) 172 | obs = new_obs 173 | episode_rewards[-1] += rew 174 | 175 | if done: 176 | log.add_scalar('reward', episode_rewards[-1], num_iters) 177 | episode_rewards.append(0.0) 178 | obs = env.reset() 179 | num_episodes += 1 180 | 181 | if num_iters > args.learning_starts and num_iters % args.learning_freq == 0: 182 | 183 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) 184 | # Reshape state to (batch, channels, x_dim, y_dim) 185 | obses_t = np.transpose(obses_t, [0, 3, 1, 2]) 186 | obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2]) 187 | 188 | # TODO 189 | td_errors = agent.learn(obses_t, actions, rewards, obses_tp1, dones) 190 | td_errors_list.append(td_errors.item()) 191 | log.add_scalar('td_error', td_errors.item(), num_iters) 192 | 193 | print(td_errors.item()) 194 | 195 | num_updates += 1 196 | k = random.randrange(args.nheads) 197 | 198 | # Update target network. 199 | if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0: 200 | # TODO 201 | agent.update_target_net() 202 | 203 | if start_time is not None: 204 | steps_per_iter.update(num_iters - start_steps) 205 | iteration_time_est.update(time.time() - start_time) 206 | start_time, start_steps = time.time(), num_iters 207 | 208 | if num_iters > args.num_steps: 209 | break 210 | 211 | if done and num_episodes % args.print_freq == 0 and num_episodes >= args.print_freq: 212 | steps_left = args.num_steps - num_iters 213 | completion = np.round(num_iters / args.num_steps, 1) 214 | mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) 215 | logger.record_tabular("% completion", completion) 216 | logger.record_tabular("total steps", num_iters) 217 | logger.record_tabular("episodes", num_episodes) 218 | logger.record_tabular("reward (100 epi mean)", mean_100ep_reward) 219 | 220 | fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6) 221 | if steps_per_iter._value is not None else "calculating...") 222 | logger.record_tabular("FPS", fps_estimate) 223 | logger.dump_tabular() 224 | logger.log() 225 | logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate))) 226 | logger.log() 227 | 228 | with open(save_dir + '/reward_monitor.csv', "a") as monitor_file: 229 | monitor = csv.writer(monitor_file) 230 | monitor.writerow([num_iters, mean_100ep_reward]) 231 | 232 | # if len(td_errors_list) > 0: 233 | # with open(save_dir + '/error_monitor.csv', "a") as monitor_file: 234 | # monitor = csv.writer(monitor_file) 235 | # monitor.writerow([num_updates, round(np.mean(td_errors_list), 4)]) 236 | 237 | if best_score is None or mean_100ep_reward > best_score: 238 | logger.log("Saving model due to mean reward increase: {} -> {}".format( 239 | best_score, mean_100ep_reward)) 240 | best_score = mean_100ep_reward 241 | torch.save(agent.online_net.state_dict(), log_dir + '/best_model.torch') 242 | 243 | torch.save(agent.online_net.state_dict(), save_dir + '/current_model.torch') 244 | # td_errors_list = [0.0] 245 | 
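The agents and training scripts above all build the same one-step TD target inside their learn() methods. The snippet below is a minimal, self-contained sketch of that computation (an editorial illustration, not part of the repository), using hypothetical stand-in networks q_online and q_target; it shows both the plain max target and the double Q-learning variant these agents switch to when double-q is enabled.

import torch
import torch.nn.functional as F

batch, obs_dim, num_actions, discount, double_q = 32, 4, 6, 0.99, True
q_online = torch.nn.Linear(obs_dim, num_actions)  # stand-in for the online Q-network
q_target = torch.nn.Linear(obs_dim, num_actions)  # stand-in for the frozen target network

states = torch.randn(batch, obs_dim)
next_states = torch.randn(batch, obs_dim)
actions = torch.randint(num_actions, (batch,))
rewards = torch.randn(batch, 1)
terminals = torch.randint(2, (batch, 1)).float()

# Q(s_t, a_t): evaluate all actions, then pick out the action actually taken.
q_sa = q_online(states).gather(1, actions.view(-1, 1))

if double_q:
    # Double Q-learning: choose the argmax with the online net, evaluate it with the target net.
    next_actions = q_online(next_states).max(1)[1]
    next_q = q_target(next_states).gather(1, next_actions.view(-1, 1))
else:
    # Vanilla target: take the max over the target network's estimates.
    next_q = q_target(next_states).max(1)[0].view(-1, 1)

# Terminal transitions contribute no bootstrapped future value.
td_target = rewards + (1 - terminals) * discount * next_q

loss = F.smooth_l1_loss(q_sa, td_target.detach())
loss.backward()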
-------------------------------------------------------------------------------- /qlearn/atari/train_dqn.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import csv 11 | import argparse 12 | import time 13 | import random 14 | import pickle 15 | import numpy as np 16 | import torch 17 | from tensorboardX import SummaryWriter 18 | 19 | from baselines import logger 20 | from baselines.deepq.replay_buffer import ReplayBuffer 21 | from baselines.common.misc_util import boolean_flag, pretty_eta, RunningAvg 22 | from baselines.common.schedules import LinearSchedule, PiecewiseSchedule 23 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 24 | 25 | from qlearn.atari.dqn_agent import AtariDQNAgent 26 | 27 | 28 | GAMES = ['BeamRiderNoFrameskip-v4', 29 | 'SpaceInvadersNoFrameskip-v4', 30 | 'BreakoutNoFrameskip-v4', 31 | 'EnduroNoFrameskip-v4', 32 | 'QbertNoFrameskip-v4', 33 | 'SeaquestNoFrameskip-v4', 34 | 'AlienNoFrameskip-v4', 35 | 'AmidarNoFrameskip-v4', 36 | 'FrostbiteNoFrameskip-v4', 37 | 'TutankhamNoFrameskip-v4', 38 | 'BankHeistNoFrameskip-v4', 39 | 'AsterixNoFrameskip-v4', 40 | 'GravitarNoFrameskip-v4'] 41 | 42 | 43 | SEEDS = [486, 750, 352, 93, 86] 44 | 45 | 46 | RUN_ID = [] 47 | for seed in SEEDS: 48 | for game in GAMES: 49 | RUN_ID.append((seed, game)) 50 | 51 | 52 | def parse_args(): 53 | parser = argparse.ArgumentParser("DQN experiments for Atari games") 54 | # Environment 55 | parser.add_argument("--env", type=str, default='PongNoFrameskip-v4', help="name of game") 56 | parser.add_argument("--seed", type=int, default=42, help="which seed to use") 57 | # Core DQN parameters 58 | parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size") 59 | parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer") 60 | # parser.add_argument("--lr", type=float, default=2.5e-4, help="learning rate for RMSprop optimizer") 61 | # parser.add_argument("--alpha", type=float, default=0.95, help="alpha (squared gradient momentum) parameter for RMSprop optimizer") 62 | # parser.add_argument("--momentum", type=float, default=0.95, help="momentum parameter for RMSprop optimizer") 63 | # parser.add_argument("--eps-rmsprop", type=float, default=0.01, help="epsilon (min squared gradient) parameter for RMSprop optimizer") 64 | parser.add_argument("--num-steps", type=int, default=int(1e7), 65 | help="total number of steps to run the environment for") 66 | parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time") 67 | parser.add_argument("--learning-freq", type=int, default=4, 68 | help="number of iterations between every optimization step") 69 | parser.add_argument("--target-update-freq", type=int, default=10000, 70 | help="number of iterations between every target network update") 71 | parser.add_argument("--learning-starts", type=int, default=50000, 72 | help="number of iterations after which learning starts") 73 | boolean_flag(parser, "double-q", default=True, help="whether or not to use double q learning") 74 | 75 | # Checkpointing 76 | parser.add_argument("--log-dir", type=str, default="log_dir", 77 | help="directory in which tensorboard events will be written out.") 78 | 
parser.add_argument("--save-dir", type=str, default="save_dir", 79 | help="directory in which training state and model will be saved") 80 | parser.add_argument("--save-freq", type=int, default=int(1e6), 81 | help="save model once every time this many iterations are completed") 82 | 83 | parser.add_argument("--final-exploration", type=float, default=0.1, 84 | help="final value of epsilon in epsilon greedy exploration") 85 | parser.add_argument("--final-exploration-frame", type=int, default=int(1e6), 86 | help="the number of frames over which the initial value of epsilon is linearly annealed to its final value") 87 | # New options 88 | parser.add_argument("--print-freq", type=int, default=100, help="printing frequency") 89 | 90 | parser.add_argument("--run-index", type=int, default=None, help="index RUN_ID") 91 | parser.add_argument("--cuda", type=int, default=1, help="whether or not to use cuda") 92 | parser.add_argument("--agent", type=str, default="DQN", help="which agent to run") 93 | parser.add_argument("--discount", type=float, default=0.99, help="discount factor") 94 | 95 | parser.add_argument("--model", type=str, default=None, help="model directory to load") 96 | 97 | return parser.parse_args() 98 | 99 | 100 | if __name__ == '__main__': 101 | args = parse_args() 102 | if args.run_index is not None: 103 | args.seed, args.env = RUN_ID[args.run_index] 104 | 105 | print(' ' * 26 + 'Options') 106 | for k, v in vars(args).items(): 107 | print(' ' * 26 + k + ': ' + str(v)) 108 | 109 | if not os.path.exists(args.save_dir): 110 | os.mkdir(args.save_dir) 111 | # Log 112 | date = time.strftime('%Y-%m-%d.%H%M') 113 | log_dir = '{}/{}-{}-seed-{}-{}'.format(args.log_dir, args.env, args.agent, args.seed, date) 114 | save_dir = '{}/{}-{}-seed-{}-{}'.format(args.save_dir, args.env, args.agent, args.seed, date) 115 | 116 | log = SummaryWriter(log_dir) 117 | print('Writing logs to {}'.format(log_dir)) 118 | 119 | if not os.path.exists(save_dir): 120 | os.mkdir(save_dir) 121 | 122 | # with open(save_dir + '/error_monitor.csv', "wt") as monitor_file: 123 | # monitor = csv.writer(monitor_file) 124 | # monitor.writerow(['update', 'error', str(int(args.num_steps / args.learning_freq))]) 125 | 126 | with open(save_dir + '/reward_monitor.csv', "wt") as monitor_file: 127 | monitor = csv.writer(monitor_file) 128 | monitor.writerow(['epoch', 'reward', str(args.num_steps)]) 129 | 130 | with open(save_dir + "/params.pkl", 'wb') as f: 131 | pickle.dump(args, f) 132 | 133 | # Create and seed the env. 
134 | env = make_atari(args.env) 135 | env = wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=False) 136 | env.seed(args.seed) 137 | torch.cuda.manual_seed(args.seed) 138 | torch.manual_seed(args.seed) 139 | np.random.seed(args.seed) 140 | random.seed(args.seed) 141 | 142 | # TODO 143 | num_actions = env.action_space.n 144 | agent = AtariDQNAgent(args, env.observation_space.shape[-1], num_actions) 145 | 146 | exploration = LinearSchedule(args.final_exploration_frame, args.final_exploration, 1) 147 | 148 | replay_buffer = ReplayBuffer(args.replay_buffer_size) 149 | 150 | start_time, start_steps = None, None 151 | steps_per_iter = RunningAvg(0.999) 152 | iteration_time_est = RunningAvg(0.999) 153 | obs = env.reset() 154 | num_iters = 0 155 | num_episodes = 0 156 | num_updates = 0 157 | prev_lives = None 158 | episode_rewards = [0.0] 159 | td_errors_list = [] 160 | best_score = None 161 | 162 | # Main training loop 163 | while True: 164 | num_iters += 1 165 | 166 | # Take action and store transition in the replay buffer. 167 | update_eps = exploration.value(num_iters) 168 | 169 | # Reshape state to (1, channels, x_dim, y_dim) 170 | action = agent.act_e_greedy(np.transpose(np.array(obs)[None], [0, 3, 1, 2]), update_eps=update_eps) 171 | # import pdb 172 | # pdb.set_trace() 173 | new_obs, rew, done, info = env.step(action) 174 | death = done or (prev_lives is not None and info['ale.lives'] < prev_lives and info['ale.lives'] > 0) 175 | prev_lives = info['ale.lives'] 176 | 177 | replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death)) 178 | obs = new_obs 179 | episode_rewards[-1] += rew 180 | 181 | if done: 182 | log.add_scalar('reward', episode_rewards[-1], num_iters) 183 | episode_rewards.append(0.0) 184 | obs = env.reset() 185 | num_episodes += 1 186 | 187 | if num_iters > args.learning_starts and num_iters % args.learning_freq == 0: 188 | 189 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) 190 | # Reshape state to (batch, channels, x_dim, y_dim) 191 | obses_t = np.transpose(obses_t, [0, 3, 1, 2]) 192 | obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2]) 193 | 194 | # TODO 195 | td_errors = agent.learn(obses_t, actions, rewards, obses_tp1, dones) 196 | td_errors_list.append(td_errors.item()) 197 | log.add_scalar('td_error', td_errors.item(), num_iters) 198 | 199 | num_updates += 1 200 | 201 | # Update target network. 
202 | if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0: 203 | # TODO 204 | agent.update_target_net() 205 | 206 | if start_time is not None: 207 | steps_per_iter.update(num_iters - start_steps) 208 | iteration_time_est.update(time.time() - start_time) 209 | start_time, start_steps = time.time(), num_iters 210 | 211 | if num_iters > args.num_steps: 212 | break 213 | 214 | if done and num_episodes % args.print_freq == 0 and num_episodes >= args.print_freq: 215 | steps_left = args.num_steps - num_iters 216 | completion = np.round(num_iters / args.num_steps, 1) 217 | mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) 218 | logger.record_tabular("% completion", completion) 219 | logger.record_tabular("total steps", num_iters) 220 | logger.record_tabular("episodes", num_episodes) 221 | logger.record_tabular("reward (100 epi mean)", mean_100ep_reward) 222 | logger.record_tabular("exploration", exploration.value(num_iters)) 223 | 224 | fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6) 225 | if steps_per_iter._value is not None else "calculating...") 226 | logger.record_tabular("FPS", fps_estimate) 227 | logger.dump_tabular() 228 | logger.log() 229 | logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate))) 230 | logger.log() 231 | 232 | with open(save_dir + '/reward_monitor.csv', "a") as monitor_file: 233 | monitor = csv.writer(monitor_file) 234 | monitor.writerow([num_iters, mean_100ep_reward]) 235 | 236 | # if len(td_errors_list) > 0: 237 | # with open(save_dir + '/error_monitor.csv', "a") as monitor_file: 238 | # monitor = csv.writer(monitor_file) 239 | # monitor.writerow([num_updates, round(np.mean(td_errors_list), 4)]) 240 | 241 | if best_score is None or mean_100ep_reward > best_score: 242 | logger.log("Saving model due to mean reward increase: {} -> {}".format( 243 | best_score, mean_100ep_reward)) 244 | best_score = mean_100ep_reward 245 | torch.save(agent.online_net.state_dict(), log_dir + '/best_model.torch') 246 | 247 | torch.save(agent.online_net.state_dict(), save_dir + '/current_model.torch') 248 | # td_errors_list = [0.0] 249 | -------------------------------------------------------------------------------- /qlearn/atari/train_mnf_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import os 10 | import csv 11 | import argparse 12 | import time 13 | import random 14 | import pickle 15 | import numpy as np 16 | import torch 17 | from tensorboardX import SummaryWriter 18 | 19 | from baselines import logger 20 | from baselines.deepq.replay_buffer import ReplayBuffer 21 | from baselines.common.misc_util import boolean_flag, pretty_eta, RunningAvg 22 | from baselines.common.schedules import LinearSchedule, PiecewiseSchedule 23 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 24 | 25 | from qlearn.atari.local_mnf_agent import AtariMNFAgent 26 | 27 | 28 | GAMES = ['BeamRiderNoFrameskip-v4', 29 | 'SpaceInvadersNoFrameskip-v4', 30 | 'BreakoutNoFrameskip-v4', 31 | 'EnduroNoFrameskip-v4', 32 | 'QbertNoFrameskip-v4', 33 | 'SeaquestNoFrameskip-v4', 34 | 'AlienNoFrameskip-v4', 35 | 'AmidarNoFrameskip-v4', 36 | 'FrostbiteNoFrameskip-v4', 37 | 'TutankhamNoFrameskip-v4', 38 | 'BankHeistNoFrameskip-v4', 39 | 'AsterixNoFrameskip-v4', 40 | 'GravitarNoFrameskip-v4'] 41 | 42 | SEEDS = [486, 750, 352, 93, 86] 43 | 44 | ALPHAS = [0.01, 0.001, 0.0001, 0.00001] 45 | 46 | RUN_ID = [] 47 | 48 | for seed in SEEDS: 49 | for game in GAMES: 50 | for alpha in ALPHAS: 51 | RUN_ID.append((seed, game, alpha)) 52 | 53 | 54 | def parse_args(): 55 | parser = argparse.ArgumentParser("MNF DQN experiments for Atari games") 56 | # Environment 57 | parser.add_argument("--env", type=str, default='PongNoFrameskip-v4', help="name of game") 58 | parser.add_argument("--seed", type=int, default=42, help="which seed to use") 59 | # Core DQN parameters 60 | parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size") 61 | parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer") 62 | # parser.add_argument("--lr", type=float, default=2.5e-4, help="learning rate for RMSprop optimizer") 63 | # parser.add_argument("--alpha", type=float, default=0.95, help="alpha (squared gradient momentum) parameter for RMSprop optimizer") 64 | # parser.add_argument("--momentum", type=float, default=0.95, help="momentum parameter for RMSprop optimizer") 65 | # parser.add_argument("--eps-rmsprop", type=float, default=0.01, help="epsilon (min squared gradient) parameter for RMSprop optimizer") 66 | parser.add_argument("--num-steps", type=int, default=int(1e7), 67 | help="total number of steps to run the environment for") 68 | parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time") 69 | parser.add_argument("--learning-freq", type=int, default=4, 70 | help="number of iterations between every optimization step") 71 | parser.add_argument("--target-update-freq", type=int, default=10000, 72 | help="number of iterations between every target network update") 73 | parser.add_argument("--learning-starts", type=int, default=50000, 74 | help="number of iterations after which learning starts") 75 | boolean_flag(parser, "double-q", default=False, help="whether or not to use double q learning") 76 | 77 | # Checkpointing 78 | parser.add_argument("--log-dir", type=str, default="log_dir", 79 | help="directory in which tensorboard events will be written out.") 80 | parser.add_argument("--save-dir", type=str, default="save_dir", 81 | help="directory in which training state and model will be saved") 82 | parser.add_argument("--save-freq", type=int, default=int(1e6), 83 | help="save model once every time this many iterations are completed") 84 | 85 | # 
parser.add_argument("--final-exploration", type=float, default=0.1, 86 | # help="final value of epsilon in epsilon greedy exploration") 87 | # parser.add_argument("--final-exploration-frame", type=int, default=int(1e6), 88 | # help="the number of frames over which the initial value of epsilon is linearly annealed to its final value") 89 | # New options 90 | parser.add_argument("--print-freq", type=int, default=100, help="printing frequency") 91 | parser.add_argument("--run-index", type=int, default=None, help="index RUN_ID") 92 | parser.add_argument("--cuda", type=int, default=1, help="whether or not to use cuda") 93 | parser.add_argument("--agent", type=str, default="MNFDQN", help="which agent to run") 94 | parser.add_argument("--discount", type=float, default=0.99, help="discount factor") 95 | # approximate posterior 96 | parser.add_argument('--hidden_dim', type=int, default=50, help='number of hidden unit used in normalizing flows') 97 | parser.add_argument('--n-hidden', type=int, default=0, help='number of hidden layer used in normalizing flows') 98 | parser.add_argument('--n-flows-q', type=int, default=2, help='number of normalizing flows using for the approximate posterior q') 99 | parser.add_argument('--n-flows-r', type=int, default=2, help='number of normalizing flows using for auxiliary posterior r') 100 | parser.add_argument('--alpha', type=float, default=1.0, help='trade-off parameter betweem KL and likelihood term') 101 | 102 | parser.add_argument("--model", type=str, default=None, help="model directory to load") 103 | 104 | return parser.parse_args() 105 | 106 | 107 | if __name__ == '__main__': 108 | args = parse_args() 109 | if args.run_index is not None: 110 | args.seed, args.env, args.alpha = RUN_ID[args.run_index] 111 | 112 | print(' ' * 26 + 'Options') 113 | for k, v in vars(args).items(): 114 | print(' ' * 26 + k + ': ' + str(v)) 115 | 116 | if not os.path.exists(args.save_dir): 117 | os.mkdir(args.save_dir) 118 | # Log 119 | date = time.strftime('%Y-%m-%d.%H%M') 120 | log_dir = '{}/{}-{}-seed-{}-alpha-{}-{}'.format(args.log_dir, args.env, args.agent, args.seed, args.alpha, date) 121 | save_dir = '{}/{}-{}-seed-{}-alpha-{}-{}'.format(args.save_dir, args.env, args.agent, args.seed, args.alpha, date) 122 | 123 | log = SummaryWriter(log_dir) 124 | print('Writing logs to {}'.format(log_dir)) 125 | 126 | if not os.path.exists(save_dir): 127 | os.mkdir(save_dir) 128 | 129 | with open(save_dir + '/error_monitor.csv', "wt") as monitor_file: 130 | monitor = csv.writer(monitor_file) 131 | monitor.writerow(['update', 'error', str(int(args.num_steps / args.learning_freq))]) 132 | 133 | with open(save_dir + '/reward_monitor.csv', "wt") as monitor_file: 134 | monitor = csv.writer(monitor_file) 135 | monitor.writerow(['epoch', 'reward', str(args.num_steps)]) 136 | 137 | with open(save_dir + "/params.pkl", 'wb') as f: 138 | pickle.dump(args, f) 139 | 140 | # Create and seed the env. 
141 | env = make_atari(args.env) 142 | env = wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=False) 143 | env.seed(args.seed) 144 | torch.cuda.manual_seed(args.seed) 145 | torch.manual_seed(args.seed) 146 | np.random.seed(args.seed) 147 | random.seed(args.seed) 148 | 149 | # TODO 150 | num_actions = env.action_space.n 151 | agent = AtariMNFAgent(args, env.observation_space.shape[-1], num_actions) 152 | 153 | # exploration = LinearSchedule(args.final_exploration_frame, args.final_exploration, 1) 154 | 155 | replay_buffer = ReplayBuffer(args.replay_buffer_size) 156 | 157 | start_time, start_steps = None, None 158 | steps_per_iter = RunningAvg(0.999) 159 | iteration_time_est = RunningAvg(0.999) 160 | obs = env.reset() 161 | agent.online_net.reset_noise() 162 | num_iters = 0 163 | num_episodes = 0 164 | num_updates = 0 165 | prev_lives = None 166 | episode_rewards = [0.0] 167 | # td_errors_list = [] 168 | best_score = None 169 | 170 | # Main training loop 171 | while True: 172 | num_iters += 1 173 | 174 | # Take action and store transition in the replay buffer. 175 | if num_iters <= args.learning_starts: 176 | action = random.randrange(num_actions) 177 | else: 178 | # Reshape state to (1, channels, x_dim, y_dim) 179 | action = agent.act(np.transpose(np.array(obs)[None], [0, 3, 1, 2]), eval=False) 180 | # import pdb 181 | # pdb.set_trace() 182 | new_obs, rew, done, info = env.step(int(action)) 183 | death = done or (prev_lives is not None and info['ale.lives'] < prev_lives and info['ale.lives'] > 0) 184 | prev_lives = info['ale.lives'] 185 | # if death: 186 | # agent.online_net.reset_noise() 187 | 188 | replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death)) 189 | obs = new_obs 190 | episode_rewards[-1] += rew 191 | 192 | if done: 193 | log.add_scalar('reward', episode_rewards[-1], num_iters) 194 | # with open(save_dir + '/reward_monitor.csv', "a") as monitor_file: 195 | # monitor = csv.writer(monitor_file) 196 | # monitor.writerow([num_iters, episode_rewards[-1]]) 197 | episode_rewards.append(0.0) 198 | obs = env.reset() 199 | num_episodes += 1 200 | 201 | if num_iters > args.learning_starts and num_iters % args.learning_freq == 0: 202 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) 203 | # Reshape state to (batch, channels, x_dim, y_dim) 204 | obses_t = np.transpose(obses_t, [0, 3, 1, 2]) 205 | obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2]) 206 | 207 | # TODO 208 | td_errors, kl_reg, loss = agent.learn(obses_t, actions, rewards, obses_tp1, dones) 209 | # loss = agent.learn(obses_t, actions, rewards, obses_tp1, dones) 210 | # td_errors_list.append(loss.item()) 211 | log.add_scalar('td_errors', td_errors.item(), num_iters) 212 | log.add_scalar('kl_reg', kl_reg.item(), num_iters) 213 | log.add_scalar('loss', loss.item(), num_iters) 214 | 215 | num_updates += 1 216 | 217 | # Update target network. 
218 | if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0: 219 | # TODO 220 | agent.update_target_net() 221 | 222 | if start_time is not None: 223 | steps_per_iter.update(num_iters - start_steps) 224 | iteration_time_est.update(time.time() - start_time) 225 | start_time, start_steps = time.time(), num_iters 226 | 227 | if num_iters > args.num_steps: 228 | break 229 | 230 | if done and num_episodes % args.print_freq == 0 and num_episodes >= args.print_freq: 231 | steps_left = args.num_steps - num_iters 232 | completion = np.round(num_iters / args.num_steps, 1) 233 | mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) 234 | logger.record_tabular("% completion", completion) 235 | logger.record_tabular("total steps", num_iters) 236 | logger.record_tabular("episodes", num_episodes) 237 | logger.record_tabular("reward (100 epi mean)", mean_100ep_reward) 238 | #logger.record_tabular("exploration", exploration.value(num_iters)) 239 | 240 | fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6) 241 | if steps_per_iter._value is not None else "calculating...") 242 | logger.record_tabular("FPS", fps_estimate) 243 | logger.dump_tabular() 244 | logger.log() 245 | logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate))) 246 | logger.log() 247 | 248 | with open(save_dir + '/reward_monitor.csv', "a") as monitor_file: 249 | monitor = csv.writer(monitor_file) 250 | monitor.writerow([num_iters, mean_100ep_reward]) 251 | 252 | # if len(td_errors_list) > 0: 253 | # with open(save_dir + '/error_monitor.csv', "a") as monitor_file: 254 | # monitor = csv.writer(monitor_file) 255 | # monitor.writerow([num_updates, round(np.mean(td_errors_list), 4)]) 256 | 257 | if best_score is None or mean_100ep_reward > best_score: 258 | logger.log("Saving model due to mean reward increase: {} -> {}".format( 259 | best_score, mean_100ep_reward)) 260 | best_score = mean_100ep_reward 261 | torch.save(agent.online_net.state_dict(), log_dir + '/best_model.torch') 262 | 263 | torch.save(agent.online_net.state_dict(), save_dir + '/current_model.torch') 264 | # td_errors_list = [0.0] 265 | -------------------------------------------------------------------------------- /qlearn/atari/train_noisy_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import os 10 | import csv 11 | import argparse 12 | import time 13 | import random 14 | import pickle 15 | import numpy as np 16 | import torch 17 | from tensorboardX import SummaryWriter 18 | 19 | from baselines import logger 20 | from baselines.deepq.replay_buffer import ReplayBuffer 21 | from baselines.common.misc_util import boolean_flag, pretty_eta, RunningAvg 22 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 23 | 24 | from qlearn.atari.noisy_agent import AtariNoisyAgent 25 | 26 | 27 | GAMES = ['BeamRiderNoFrameskip-v4', 28 | 'SpaceInvadersNoFrameskip-v4', 29 | 'BreakoutNoFrameskip-v4', 30 | 'EnduroNoFrameskip-v4', 31 | 'QbertNoFrameskip-v4', 32 | 'SeaquestNoFrameskip-v4', 33 | 'AlienNoFrameskip-v4', 34 | 'AmidarNoFrameskip-v4', 35 | 'FrostbiteNoFrameskip-v4', 36 | 'TutankhamNoFrameskip-v4', 37 | 'BankHeistNoFrameskip-v4', 38 | 'AsterixNoFrameskip-v4', 39 | 'GravitarNoFrameskip-v4'] 40 | 41 | 42 | SEEDS = [486, 750, 352, 93, 86] 43 | 44 | RUN_ID = [] 45 | for seed in SEEDS: 46 | for game in GAMES: 47 | RUN_ID.append((seed, game)) 48 | 49 | def parse_args(): 50 | parser = argparse.ArgumentParser("Noisy DQN experiments for Atari games") 51 | # Environment 52 | parser.add_argument("--env", type=str, default='PongNoFrameskip-v4', help="name of game") 53 | parser.add_argument("--seed", type=int, default=42, help="which seed to use") 54 | # Core DQN parameters 55 | parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size") 56 | parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer") 57 | # parser.add_argument("--lr", type=float, default=2.5e-4, help="learning rate for RMSprop optimizer") 58 | # parser.add_argument("--alpha", type=float, default=0.95, help="alpha (squared gradient momentum) parameter for RMSprop optimizer") 59 | # parser.add_argument("--momentum", type=float, default=0.95, help="momentum parameter for RMSprop optimizer") 60 | # parser.add_argument("--eps-rmsprop", type=float, default=0.01, help="epsilon (min squared gradient) parameter for RMSprop optimizer") 61 | parser.add_argument("--num-steps", type=int, default=int(1e7), 62 | help="total number of steps to run the environment for") 63 | parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time") 64 | parser.add_argument("--learning-freq", type=int, default=4, 65 | help="number of iterations between every optimization step") 66 | parser.add_argument("--target-update-freq", type=int, default=10000, 67 | help="number of iterations between every target network update") 68 | parser.add_argument("--learning-starts", type=int, default=50000, 69 | help="number of iterations after which learning starts") 70 | # boolean_flag(parser, "double-q", default=False, help="whether or not to use double q learning") 71 | parser.add_argument("--double-q", type=int, default=0, help="whether or not to use double q learning") 72 | # Checkpointing 73 | parser.add_argument("--log-dir", type=str, default="log_dir", 74 | help="directory in which tensorboard events will be written out.") 75 | parser.add_argument("--save-dir", type=str, default="save_dir", 76 | help="directory in which training state and model will be saved") 77 | parser.add_argument("--save-freq", type=int, default=int(1e6), 78 | help="save model once every time this many iterations are completed") 79 | 80 | parser.add_argument("--final-exploration", type=float, default=0.1, 81 | help="final value of 
epsilon in epsilon greedy exploration") 82 | parser.add_argument("--final-exploration-frame", type=int, default=int(1e6), 83 | help="the number of frames over which the initial value of epsilon is linearly annealed to its final value") 84 | # New options 85 | parser.add_argument("--print-freq", type=int, default=100, help="printing frequency") 86 | 87 | parser.add_argument("--run-index", type=int, default=None, help="index RUN_ID") 88 | parser.add_argument("--cuda", type=int, default=1, help="whether or not to use cuda") 89 | parser.add_argument("--agent", type=str, default="NoisyDQN", help="which agent to run") 90 | parser.add_argument("--discount", type=float, default=0.99, help="discount factor") 91 | parser.add_argument("--model", type=str, default=None, help="model directory to load") 92 | return parser.parse_args() 93 | 94 | if __name__ == '__main__': 95 | args = parse_args() 96 | if args.run_index is not None: 97 | args.seed, args.env = RUN_ID[args.run_index] 98 | 99 | print(' ' * 26 + 'Options') 100 | for k, v in vars(args).items(): 101 | print(' ' * 26 + k + ': ' + str(v)) 102 | 103 | if not os.path.exists(args.save_dir): 104 | os.mkdir(args.save_dir) 105 | # Log 106 | date = time.strftime('%Y-%m-%d.%H%M') 107 | log_dir = '{}/{}-{}-seed-{}-{}'.format(args.log_dir, args.env, args.agent, args.seed, date) 108 | save_dir = '{}/{}-{}-seed-{}-{}'.format(args.save_dir, args.env, args.agent, args.seed, date) 109 | 110 | log = SummaryWriter(log_dir) 111 | print('Writing logs to {}'.format(log_dir)) 112 | 113 | if not os.path.exists(save_dir): 114 | os.mkdir(save_dir) 115 | 116 | # with open(save_dir + '/error_monitor.csv', "wt") as monitor_file: 117 | # monitor = csv.writer(monitor_file) 118 | # monitor.writerow(['update', 'error', str(int(args.num_steps / args.learning_freq))]) 119 | 120 | with open(save_dir + '/reward_monitor.csv', "wt") as monitor_file: 121 | monitor = csv.writer(monitor_file) 122 | monitor.writerow(['epoch', 'reward', str(args.num_steps)]) 123 | 124 | with open(save_dir + "/params.pkl", 'wb') as f: 125 | pickle.dump(args, f) 126 | 127 | # Create and seed the env. 128 | env = make_atari(args.env) 129 | env = wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=False) 130 | env.seed(args.seed) 131 | torch.cuda.manual_seed(args.seed) 132 | torch.manual_seed(args.seed) 133 | np.random.seed(args.seed) 134 | random.seed(args.seed) 135 | 136 | # TODO 137 | num_actions = env.action_space.n 138 | agent = AtariNoisyAgent(args, env.observation_space.shape[-1], num_actions) 139 | 140 | replay_buffer = ReplayBuffer(args.replay_buffer_size) 141 | 142 | start_time, start_steps = None, None 143 | steps_per_iter = RunningAvg(0.999) 144 | iteration_time_est = RunningAvg(0.999) 145 | obs = env.reset() 146 | num_iters = 0 147 | num_episodes = 0 148 | num_updates = 0 149 | prev_lives = None 150 | episode_rewards = [0.0] 151 | td_errors_list = [] 152 | best_score = None 153 | 154 | while True: 155 | num_iters += 1 156 | # Take action and store transition in the replay buffer. 
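# NoisyNet needs no epsilon schedule after the warm-up phase: agent.act()
# re-samples the noise in its NoisyLinear layers and then acts greedily, so
# all exploration comes from the parameter noise itself.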
157 | if num_iters <= args.learning_starts: 158 | action = random.randrange(num_actions) 159 | else: 160 | # Reshape state to (1, channels, x_dim, y_dim) 161 | action = agent.act(np.transpose(np.array(obs)[None], [0, 3, 1, 2]), eval=False) 162 | # import pdb 163 | # pdb.set_trace() 164 | new_obs, rew, done, info = env.step(action) 165 | death = done or (prev_lives is not None and info['ale.lives'] < prev_lives and info['ale.lives'] > 0) 166 | prev_lives = info['ale.lives'] 167 | 168 | replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death)) 169 | obs = new_obs 170 | episode_rewards[-1] += rew 171 | 172 | if done: 173 | log.add_scalar('reward', episode_rewards[-1], num_iters) 174 | episode_rewards.append(0.0) 175 | obs = env.reset() 176 | num_episodes += 1 177 | 178 | if num_iters > args.learning_starts and num_iters % args.learning_freq == 0: 179 | 180 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) 181 | # Reshape state to (batch, channels, x_dim, y_dim) 182 | obses_t = np.transpose(obses_t, [0, 3, 1, 2]) 183 | obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2]) 184 | 185 | # TODO 186 | td_errors = agent.learn(obses_t, actions, rewards, obses_tp1, dones) 187 | td_errors_list.append(td_errors.item()) 188 | log.add_scalar('td_error', td_errors.item(), num_iters) 189 | 190 | num_updates += 1 191 | 192 | # Update target network. 193 | if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0: 194 | # TODO 195 | agent.update_target_net() 196 | 197 | if start_time is not None: 198 | steps_per_iter.update(num_iters - start_steps) 199 | iteration_time_est.update(time.time() - start_time) 200 | start_time, start_steps = time.time(), num_iters 201 | 202 | if num_iters > args.num_steps: 203 | break 204 | 205 | if done and num_episodes % args.print_freq == 0 and num_episodes >= args.print_freq: 206 | steps_left = args.num_steps - num_iters 207 | completion = np.round(num_iters / args.num_steps, 1) 208 | mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) 209 | logger.record_tabular("% completion", completion) 210 | logger.record_tabular("total steps", num_iters) 211 | logger.record_tabular("episodes", num_episodes) 212 | logger.record_tabular("reward (100 epi mean)", mean_100ep_reward) 213 | 214 | fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6) 215 | if steps_per_iter._value is not None else "calculating...") 216 | logger.record_tabular("FPS", fps_estimate) 217 | logger.dump_tabular() 218 | logger.log() 219 | logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate))) 220 | logger.log() 221 | 222 | with open(save_dir + '/reward_monitor.csv', "a") as monitor_file: 223 | monitor = csv.writer(monitor_file) 224 | monitor.writerow([num_iters, mean_100ep_reward]) 225 | 226 | # if len(td_errors_list) > 0: 227 | # with open(save_dir + '/error_monitor.csv', "a") as monitor_file: 228 | # monitor = csv.writer(monitor_file) 229 | # monitor.writerow([num_updates, round(np.mean(td_errors_list), 4)]) 230 | 231 | if best_score is None or mean_100ep_reward > best_score: 232 | logger.log("Saving model due to mean reward increase: {} -> {}".format( 233 | best_score, mean_100ep_reward)) 234 | best_score = mean_100ep_reward 235 | torch.save(agent.online_net.state_dict(), log_dir + '/best_model.torch') 236 | 237 | torch.save(agent.online_net.state_dict(), save_dir + '/current_model.torch') 238 | # td_errors_list = [0.0] 239 | -------------------------------------------------------------------------------- 
/qlearn/atari/train_prior_bootstrapped_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import csv 11 | import argparse 12 | import time 13 | import random 14 | import pickle 15 | import numpy as np 16 | import torch 17 | from tensorboardX import SummaryWriter 18 | 19 | from baselines import logger 20 | from baselines.deepq.replay_buffer import ReplayBuffer 21 | from baselines.common.misc_util import boolean_flag, pretty_eta, RunningAvg 22 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 23 | 24 | from qlearn.atari.prior_bootstrapped_agent import AtariPriorBootstrappedAgent 25 | 26 | 27 | GAMES = ['BeamRiderNoFrameskip-v4', 28 | 'SpaceInvadersNoFrameskip-v4', 29 | 'BreakoutNoFrameskip-v4', 30 | 'EnduroNoFrameskip-v4', 31 | 'QbertNoFrameskip-v4', 32 | 'SeaquestNoFrameskip-v4', 33 | 'AlienNoFrameskip-v4', 34 | 'AmidarNoFrameskip-v4', 35 | 'FrostbiteNoFrameskip-v4', 36 | 'TutankhamNoFrameskip-v4', 37 | 'BankHeistNoFrameskip-v4', 38 | 'AsterixNoFrameskip-v4', 39 | 'GravitarNoFrameskip-v4'] 40 | 41 | SEEDS = [486, 750, 352, 93, 86] 42 | 43 | RUN_ID = [] 44 | for seed in SEEDS: 45 | for game in GAMES: 46 | RUN_ID.append((seed, game)) 47 | 48 | 49 | def parse_args(): 50 | parser = argparse.ArgumentParser("Boostrapped DQN with prior experiments for Atari games") 51 | # Environment 52 | parser.add_argument("--env", type=str, default='PongNoFrameskip-v4', help="name of game") 53 | parser.add_argument("--seed", type=int, default=42, help="which seed to use") 54 | # Core DQN parameters 55 | parser.add_argument("--replay-buffer-size", type=int, default=int(1e6), help="replay buffer size") 56 | parser.add_argument("--lr", type=float, default=1e-4, help="learning rate for Adam optimizer") 57 | # parser.add_argument("--lr", type=float, default=2.5e-4, help="learning rate for RMSprop optimizer") 58 | # parser.add_argument("--alpha", type=float, default=0.95, help="alpha (squared gradient momentum) parameter for RMSprop optimizer") 59 | # parser.add_argument("--momentum", type=float, default=0.95, help="momentum parameter for RMSprop optimizer") 60 | # parser.add_argument("--eps-rmsprop", type=float, default=0.01, help="epsilon (min squared gradient) parameter for RMSprop optimizer") 61 | parser.add_argument("--num-steps", type=int, default=int(1e7), 62 | help="total number of steps to run the environment for") 63 | parser.add_argument("--batch-size", type=int, default=32, help="number of transitions to optimize at the same time") 64 | parser.add_argument("--learning-freq", type=int, default=4, 65 | help="number of iterations between every optimization step") 66 | parser.add_argument("--target-update-freq", type=int, default=10000, 67 | help="number of iterations between every target network update") 68 | parser.add_argument("--learning-starts", type=int, default=50000, 69 | help="number of iterations after which learning starts") 70 | # boolean_flag(parser, "double-q", default=False, help="whether or not to use double q learning") 71 | parser.add_argument("--double-q", type=int, default=0, help="whether or not to use double q learning") 72 | # Checkpointing 73 | parser.add_argument("--log-dir", type=str, default="log_dir", 74 | help="directory in which tensorboard events will be written out.") 75 | 
parser.add_argument("--save-dir", type=str, default="save_dir", 76 | help="directory in which training state and model will be saved") 77 | parser.add_argument("--save-freq", type=int, default=int(1e6), 78 | help="save model once every time this many iterations are completed") 79 | 80 | parser.add_argument("--final-exploration", type=float, default=0.1, 81 | help="final value of epsilon in epsilon greedy exploration") 82 | parser.add_argument("--final-exploration-frame", type=int, default=int(1e6), 83 | help="the number of frames over which the initial value of epsilon is linearly annealed to its final value") 84 | # New options 85 | parser.add_argument("--print-freq", type=int, default=100, help="printing frequency") 86 | 87 | parser.add_argument("--run-index", type=int, default=None, help="index RUN_ID") 88 | parser.add_argument("--cuda", type=int, default=1, help="whether or not to use cuda") 89 | parser.add_argument("--agent", type=str, default="PriorBoostrappedDQN", help="which agent to run") 90 | parser.add_argument("--discount", type=float, default=0.99, help="discount factor") 91 | parser.add_argument("--model", type=str, default=None, help="model directory to load") 92 | parser.add_argument('--nheads', type=int, default=10, help='number of heads in Bootstrapped DQN') 93 | parser.add_argument('--beta', type=float, default=0.1, help='prior scale') 94 | 95 | return parser.parse_args() 96 | 97 | if __name__ == '__main__': 98 | args = parse_args() 99 | if args.run_index is not None: 100 | args.seed, args.env = RUN_ID[args.run_index] 101 | 102 | print(' ' * 26 + 'Options') 103 | for k, v in vars(args).items(): 104 | print(' ' * 26 + k + ': ' + str(v)) 105 | 106 | if not os.path.exists(args.save_dir): 107 | os.mkdir(args.save_dir) 108 | # Log 109 | date = time.strftime('%Y-%m-%d.%H%M') 110 | log_dir = '{}/{}-{}-seed-{}-{}'.format(args.log_dir, args.env, args.agent, args.seed, date) 111 | save_dir = '{}/{}-{}-seed-{}-{}'.format(args.save_dir, args.env, args.agent, args.seed, date) 112 | 113 | log = SummaryWriter(log_dir) 114 | print('Writing logs to {}'.format(log_dir)) 115 | 116 | if not os.path.exists(save_dir): 117 | os.mkdir(save_dir) 118 | 119 | # with open(save_dir + '/error_monitor.csv', "wt") as monitor_file: 120 | # monitor = csv.writer(monitor_file) 121 | # monitor.writerow(['update', 'error', str(int(args.num_steps / args.learning_freq))]) 122 | 123 | with open(save_dir + '/reward_monitor.csv', "wt") as monitor_file: 124 | monitor = csv.writer(monitor_file) 125 | monitor.writerow(['epoch', 'reward', str(args.num_steps)]) 126 | 127 | with open(save_dir + "/params.pkl", 'wb') as f: 128 | pickle.dump(args, f) 129 | 130 | # Create and seed the env. 
131 | env = make_atari(args.env) 132 | env = wrap_deepmind(env, episode_life=False, clip_rewards=False, frame_stack=True, scale=False) 133 | env.seed(args.seed) 134 | torch.cuda.manual_seed(args.seed) 135 | torch.manual_seed(args.seed) 136 | np.random.seed(args.seed) 137 | random.seed(args.seed) 138 | 139 | # TODO 140 | num_actions = env.action_space.n 141 | agent = AtariPriorBootstrappedAgent(args, env.observation_space.shape[-1], num_actions) 142 | 143 | replay_buffer = ReplayBuffer(args.replay_buffer_size) 144 | 145 | start_time, start_steps = None, None 146 | steps_per_iter = RunningAvg(0.999) 147 | iteration_time_est = RunningAvg(0.999) 148 | obs = env.reset() 149 | num_iters = 0 150 | num_episodes = 0 151 | num_updates = 0 152 | prev_lives = None 153 | episode_rewards = [0.0] 154 | td_errors_list = [] 155 | best_score = None 156 | k = random.randrange(args.nheads) 157 | while True: 158 | 159 | num_iters += 1 160 | # Take action and store transition in the replay buffer. 161 | if num_iters <= args.learning_starts: 162 | action = random.randrange(num_actions) 163 | else: 164 | # Reshape state to (1, channels, x_dim, y_dim) 165 | action = agent.act_single_head(np.transpose(np.array(obs)[None], [0, 3, 1, 2]), k) 166 | # import pdb 167 | # pdb.set_trace() 168 | new_obs, rew, done, info = env.step(action) 169 | death = done or (prev_lives is not None and info['ale.lives'] < prev_lives and info['ale.lives'] > 0) 170 | prev_lives = info['ale.lives'] 171 | replay_buffer.add(obs, action, np.sign(rew), new_obs, float(death)) 172 | obs = new_obs 173 | episode_rewards[-1] += rew 174 | 175 | if done: 176 | log.add_scalar('reward', episode_rewards[-1], num_iters) 177 | episode_rewards.append(0.0) 178 | obs = env.reset() 179 | num_episodes += 1 180 | 181 | if num_iters > args.learning_starts and num_iters % args.learning_freq == 0: 182 | 183 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) 184 | # Reshape state to (batch, channels, x_dim, y_dim) 185 | obses_t = np.transpose(obses_t, [0, 3, 1, 2]) 186 | obses_tp1 = np.transpose(obses_tp1, [0, 3, 1, 2]) 187 | 188 | # TODO 189 | td_errors = agent.learn(obses_t, actions, rewards, obses_tp1, dones) 190 | td_errors_list.append(td_errors.item()) 191 | log.add_scalar('td_error', td_errors.item(), num_iters) 192 | 193 | # print(td_errors.item()) 194 | 195 | num_updates += 1 196 | k = random.randrange(args.nheads) 197 | 198 | # Update target network. 
199 | if num_iters > args.learning_starts and num_iters % args.target_update_freq == 0: 200 | # TODO 201 | agent.update_target_net() 202 | 203 | if start_time is not None: 204 | steps_per_iter.update(num_iters - start_steps) 205 | iteration_time_est.update(time.time() - start_time) 206 | start_time, start_steps = time.time(), num_iters 207 | 208 | if num_iters > args.num_steps: 209 | break 210 | 211 | if done and num_episodes % args.print_freq == 0 and num_episodes >= args.print_freq: 212 | steps_left = args.num_steps - num_iters 213 | completion = np.round(num_iters / args.num_steps, 1) 214 | mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1) 215 | logger.record_tabular("% completion", completion) 216 | logger.record_tabular("total steps", num_iters) 217 | logger.record_tabular("episodes", num_episodes) 218 | logger.record_tabular("reward (100 epi mean)", mean_100ep_reward) 219 | 220 | fps_estimate = (float(steps_per_iter) / (float(iteration_time_est) + 1e-6) 221 | if steps_per_iter._value is not None else "calculating...") 222 | logger.record_tabular("FPS", fps_estimate) 223 | logger.dump_tabular() 224 | logger.log() 225 | logger.log("ETA: " + pretty_eta(int(steps_left / fps_estimate))) 226 | logger.log() 227 | 228 | with open(save_dir + '/reward_monitor.csv', "a") as monitor_file: 229 | monitor = csv.writer(monitor_file) 230 | monitor.writerow([num_iters, mean_100ep_reward]) 231 | 232 | # if len(td_errors_list) > 0: 233 | # with open(save_dir + '/error_monitor.csv', "a") as monitor_file: 234 | # monitor = csv.writer(monitor_file) 235 | # monitor.writerow([num_updates, round(np.mean(td_errors_list), 4)]) 236 | 237 | if best_score is None or mean_100ep_reward > best_score: 238 | logger.log("Saving model due to mean reward increase: {} -> {}".format( 239 | best_score, mean_100ep_reward)) 240 | best_score = mean_100ep_reward 241 | torch.save(agent.online_net.state_dict(), log_dir + '/best_model.torch') 242 | 243 | torch.save(agent.online_net.state_dict(), save_dir + '/current_model.torch') 244 | # td_errors_list = [0.0] 245 | -------------------------------------------------------------------------------- /qlearn/commun/bayes_backprop_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
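The Atari training scripts above map --run-index to a (seed, game) pair through the RUN_ID list built from SEEDS and GAMES: the first block of indices covers every game under the first seed, the next block the second seed, and so on. A minimal sketch of that indexing, using shortened stand-in lists rather than the full GAMES/SEEDS from the scripts:

SEEDS = [486, 750]
GAMES = ['BeamRiderNoFrameskip-v4', 'SpaceInvadersNoFrameskip-v4', 'BreakoutNoFrameskip-v4']

RUN_ID = []
for seed in SEEDS:
    for game in GAMES:
        RUN_ID.append((seed, game))

# --run-index iterates over games first, then seeds.
assert RUN_ID[0] == (486, 'BeamRiderNoFrameskip-v4')
assert RUN_ID[3] == (750, 'BeamRiderNoFrameskip-v4')
assert len(RUN_ID) == len(SEEDS) * len(GAMES)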
6 | # 7 | 8 | 9 | import math 10 | import torch 11 | import torch.nn as nn 12 | from torch.autograd import Variable 13 | from torch.nn import functional as F 14 | 15 | class BayesBackpropLinear(nn.Module): 16 | def __init__(self, in_features, out_features, sigma_prior=1): 17 | nn.Module.__init__(self) 18 | self.in_features = in_features 19 | self.out_features = out_features 20 | self.sigma_prior = sigma_prior 21 | self.weight_mu = nn.Parameter(torch.Tensor(out_features, in_features)) 22 | self.weight_logsigma = nn.Parameter(torch.Tensor(out_features, in_features)) 23 | self.register_buffer('weight_epsilon', torch.Tensor(out_features, in_features)) 24 | self.bias_mu = nn.Parameter(torch.Tensor(out_features)) 25 | self.bias_logsigma = nn.Parameter(torch.Tensor(out_features)) 26 | self.register_buffer('bias_epsilon', torch.Tensor(out_features)) 27 | self.reset_parameters() 28 | self.reset_noise() 29 | 30 | def reset_parameters(self): 31 | mu_range = math.sqrt(3.0 / self.in_features) 32 | self.weight_mu.data.uniform_(-mu_range, mu_range) 33 | self.weight_logsigma.data.fill_(-3) 34 | self.bias_mu.data.uniform_(-mu_range, mu_range) 35 | self.bias_logsigma.data.fill_(-3) 36 | 37 | def reset_noise(self): 38 | self.weight_epsilon.copy_(torch.randn(self.out_features, self.in_features)) 39 | self.bias_epsilon.copy_(torch.randn(self.out_features)) 40 | 41 | def forward(self, input): 42 | if self.training: 43 | weight_sigma = F.softplus(self.weight_logsigma) 44 | bias_sigma = F.softplus(self.bias_logsigma) 45 | return F.linear(input, self.weight_mu + weight_sigma * self.weight_epsilon, self.bias_mu + bias_sigma * self.bias_epsilon) 46 | else: 47 | return F.linear(input, self.weight_mu, self.bias_mu) 48 | 49 | def kldiv(self): 50 | weight_sigma = F.softplus(self.weight_logsigma) 51 | bias_sigma = F.softplus(self.bias_logsigma) 52 | kldiv_weight = torch.sum( math.log(self.sigma_prior) - torch.log(weight_sigma) + \ 53 | (weight_sigma **2 + self.weight_mu **2) /(2 * self.sigma_prior ** 2) - 0.5) 54 | 55 | kldiv_bias = torch.sum( math.log(self.sigma_prior) - torch.log(bias_sigma) + \ 56 | (bias_sigma **2 + self.bias_mu **2) /(2 * self.sigma_prior ** 2) - 0.5) 57 | 58 | return kldiv_weight + kldiv_bias 59 | -------------------------------------------------------------------------------- /qlearn/commun/dropout_toy_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
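For reference, BayesBackpropLinear.kldiv() above evaluates the standard closed-form KL divergence between the factorised Gaussian posterior q(w) = N(mu, sigma^2), with sigma = softplus(logsigma), and the prior p(w) = N(0, sigma_prior^2), summed over all weights and biases:

\mathrm{KL}\big(q \,\|\, p\big) = \sum_i \Big[ \log\frac{\sigma_{\mathrm{prior}}}{\sigma_i} + \frac{\sigma_i^2 + \mu_i^2}{2\,\sigma_{\mathrm{prior}}^2} - \frac{1}{2} \Big]

With the default sigma_prior = 1 the log(sigma_prior) term vanishes.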
6 | # 7 | 8 | 9 | import numpy as np 10 | import matplotlib 11 | matplotlib.use('Agg') 12 | import matplotlib.pylab as plt 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | from torch.autograd import Variable 18 | from torch import optim 19 | 20 | from qlearn.commun.variational_dropout_layer import VariationalDropoutLinear 21 | 22 | BAYES = True 23 | use_cuda = True 24 | 25 | 26 | class RegressionModel(nn.Module): 27 | def __init__(self): 28 | nn.Module.__init__(self) 29 | #input_dim, output_dim, hidden_dim, n_hidden, n_flows_q, n_flows_r 30 | self.fc1 = VariationalDropoutLinear(1, 100, 50, 1, 2) 31 | self.fc2 = VariationalDropoutLinear(100, 1, 50, 1, 1) 32 | 33 | def forward(self, x): 34 | if self.training: 35 | x, kldiv1 = self.fc1.forward(x) 36 | x = F.relu(x) 37 | x, kldiv2 = self.fc2.forward(x) 38 | kldiv = kldiv1 + kldiv2 39 | return x, kldiv 40 | else: 41 | x = self.fc1.forward(x) 42 | x = F.relu(x) 43 | x = self.fc2.forward(x) 44 | return x 45 | 46 | def reset_noise(self): 47 | self.fc1.reset_noise() 48 | self.fc2.reset_noise() 49 | 50 | 51 | class MLP(nn.Module): 52 | def __init__(self): 53 | nn.Module.__init__(self) 54 | self.fc1 = nn.Linear(1, 100) 55 | self.fc2 = nn.Linear(100, 1) 56 | 57 | def forward(self, x): 58 | x = self.fc1(x) 59 | x = F.relu(x) 60 | x = self.fc2(x) 61 | return x 62 | 63 | #X = torch.Tensor(20, 1).uniform_(-4, 4) 64 | X = np.random.uniform(-4, 4, (20, 1)).astype('float32') 65 | # X = np.random.rand(20, 1).astype('float32') * 8 - 4 66 | sigma = 3 67 | epsilon = np.random.normal(size=X.shape).astype('float32') 68 | Y = np.power(X, 3) + sigma * epsilon 69 | 70 | if BAYES: 71 | regressor = RegressionModel() 72 | else: 73 | regressor = MLP() 74 | 75 | x = Variable(torch.from_numpy(X)) 76 | y = Variable(torch.from_numpy(Y)) 77 | if use_cuda: 78 | x = x.cuda() 79 | y = y.cuda() 80 | optimiser = optim.Adam(regressor.parameters(), lr=0.01) 81 | 82 | if use_cuda: 83 | regressor.cuda() 84 | # y = y.cuda() 85 | 86 | regressor.train() 87 | for epoch in range(1000): 88 | regressor.zero_grad() 89 | if BAYES: 90 | regressor.reset_noise() 91 | # import pdb 92 | # pdb.set_trace() 93 | y_pred, kldiv = regressor(x) 94 | kl_reg = kldiv / 20.0 95 | mse = F.mse_loss(y_pred, y) / (2 * 9) 96 | loss = mse + kl_reg 97 | else: 98 | loss = F.mse_loss(regressor(x), y) / (2 * 9) 99 | loss.backward() 100 | optimiser.step() 101 | # if epoch % 10 == 0: 102 | if BAYES: 103 | print('epoch: {}, loss: {}, kl: {}, mse: {}'.format(epoch, loss.item(), kl_reg.item(), mse.item())) 104 | else: 105 | print('epoch: {}, loss: {}'.format(epoch, loss.item())) 106 | 107 | n_test = 500 108 | x_test = np.linspace(-6, 6, n_test).reshape(n_test, 1).astype('float32') 109 | y_preds = [] 110 | 111 | # regressor.eval() 112 | # assert regressor.fc1.training == False 113 | 114 | X_TEST = Variable(torch.from_numpy(x_test)) 115 | if use_cuda: 116 | X_TEST = X_TEST.cuda() 117 | for _ in range(20): 118 | if BAYES: 119 | regressor.reset_noise() 120 | y_pred, _ = regressor(X_TEST) 121 | y_preds.append(y_pred.data.cpu().numpy()) 122 | else: 123 | y_preds.append(regressor(X_TEST).data.cpu().numpy()) 124 | y_preds = np.array(y_preds).reshape(20, n_test) 125 | y_preds_mean = np.mean(y_preds, axis=0) 126 | y_preds_var = np.std(y_preds, axis=0) 127 | 128 | 129 | plt.plot(x_test, y_preds_mean) 130 | if BAYES: 131 | plt.fill_between(x_test.reshape(n_test,), y_preds_mean - 3 * y_preds_var, y_preds_mean + 3 * y_preds_var, alpha=0.5) 132 | plt.plot(X, Y, 'x') 133 | plt.ylim(-100, 100) 134 
| plt.savefig('toy_regression.png') 135 | -------------------------------------------------------------------------------- /qlearn/commun/local_mnf_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | from torch.autograd import Variable 13 | import torch.nn.functional as F 14 | 15 | from qlearn.commun.norm_flows import MaskedNVPFlow 16 | 17 | 18 | class MNFLinear(nn.Module): 19 | def __init__(self, in_features, out_features, hidden_dim, n_hidden, n_flows_q, n_flows_r, 20 | use_cuda=True, prior_var=1.0, threshold_var=0.5): 21 | nn.Module.__init__(self) 22 | self.in_features = in_features 23 | self.out_features = out_features 24 | self.hidden_dim = hidden_dim 25 | self.n_hidden = n_hidden 26 | self.n_flows_q = n_flows_q 27 | self.n_flows_r = n_flows_r 28 | self.prior_var = prior_var 29 | self.threshold_var = threshold_var 30 | self.use_cuda = use_cuda 31 | 32 | self.weight_mu = nn.Parameter(torch.Tensor(in_features, out_features)) 33 | self.weight_logstd = nn.Parameter(torch.Tensor(in_features, out_features)) 34 | self.bias_mu = nn.Parameter(torch.Tensor(out_features)) 35 | self.bias_logvar = nn.Parameter(torch.Tensor(out_features)) 36 | 37 | self.qzero_mu = nn.Parameter(torch.Tensor(in_features)) 38 | self.qzero_logvar = nn.Parameter(torch.Tensor(in_features)) 39 | # auxiliary variable c, b1 and b2 are defined in equation (9) and (10) 40 | self.rzero_c = nn.Parameter(torch.Tensor(in_features)) 41 | self.rzero_b1 = nn.Parameter(torch.Tensor(in_features)) 42 | self.rzero_b2 = nn.Parameter(torch.Tensor(in_features)) 43 | 44 | self.flow_q = MaskedNVPFlow(in_features, hidden_dim, n_hidden, n_flows_q) 45 | self.flow_r = MaskedNVPFlow(in_features, hidden_dim, n_hidden, n_flows_r) 46 | 47 | self.register_buffer('epsilon_z', torch.Tensor(in_features)) 48 | self.register_buffer('epsilon_linear', torch.Tensor(out_features)) 49 | self.reset_parameters() 50 | self.reset_noise() 51 | 52 | def reset_noise(self): 53 | epsilon_z = torch.randn(self.in_features) 54 | epsilon_linear = torch.randn(self.out_features) 55 | self.epsilon_z.copy_(epsilon_z) 56 | self.epsilon_linear.copy_(epsilon_linear) 57 | self.flow_q.reset_noise() 58 | self.flow_r.reset_noise() 59 | 60 | def reset_parameters(self): 61 | 62 | in_stdv = np.sqrt(4.0 / self.in_features) 63 | out_stdv = np.sqrt(4.0 / self.out_features) 64 | stdv2 = np.sqrt(4.0 / (self.in_features + self.out_features)) 65 | 66 | self.weight_mu.data.normal_(0, stdv2) 67 | self.weight_logstd.data.normal_(-9, 1e-3 * stdv2) 68 | self.bias_mu.data.zero_() 69 | self.bias_logvar.data.normal_(-9, 1e-3 * out_stdv) 70 | 71 | self.qzero_mu.data.normal_(1 if self.n_flows_q == 0 else 0, in_stdv) 72 | self.qzero_logvar.data.normal_(np.log(0.1), 1e-3 * in_stdv) 73 | self.rzero_c.data.normal_(0, in_stdv) 74 | self.rzero_b1.data.normal_(0, in_stdv) 75 | self.rzero_b2.data.normal_(0, in_stdv) 76 | 77 | def sample_z(self, batch_size, kl=True, same_noise=False): 78 | if self.training: 79 | if batch_size > 1: 80 | assert kl == False 81 | qzero_std = torch.exp(0.5 * self.qzero_logvar) 82 | qzero_std = qzero_std.expand(batch_size, self.in_features) 83 | z_mu = self.qzero_mu.expand(batch_size, self.in_features) 84 | if same_noise: 85 | epsilon_z = 
self.epsilon_z.expand(batch_size, self.in_features) 86 | else: 87 | epsilon_z = Variable(torch.randn(batch_size, self.in_features)) 88 | if self.use_cuda: 89 | epsilon_z = epsilon_z.cuda() 90 | 91 | z = z_mu + qzero_std * epsilon_z 92 | z = self.flow_q(z, kl=False) 93 | return z 94 | if batch_size == 1: 95 | qzero_std = torch.exp(0.5 * self.qzero_logvar) 96 | z = self.qzero_mu + qzero_std * self.epsilon_z 97 | if kl: 98 | z, logdets = self.flow_q(z, kl=True) 99 | return z, logdets 100 | else: 101 | z = self.flow_q(z, kl=False) 102 | return z 103 | else: 104 | assert kl == False 105 | z = self.qzero_mu 106 | z = self.flow_q(z, kl=False) 107 | return z 108 | 109 | def forward(self, x, same_noise=False): 110 | batch_size = x.size()[0] 111 | if self.training: 112 | z = self.sample_z(batch_size, kl=False, same_noise=same_noise) 113 | 114 | weight_std = torch.clamp(torch.exp(self.weight_logstd), 0, self.threshold_var) 115 | bias_std = torch.clamp(torch.exp(0.5 * self.bias_logvar), 0, self.threshold_var) 116 | out_mu = torch.matmul(x * z, self.weight_mu) + self.bias_mu 117 | out_var = torch.matmul(x * x, weight_std * weight_std) + bias_std 118 | if batch_size > 1: 119 | if same_noise: 120 | epsilon_linear = self.epsilon_linear.expand(batch_size, self.out_features) 121 | else: 122 | epsilon_linear = Variable(torch.randn(batch_size, self.out_features)) 123 | if self.use_cuda: 124 | epsilon_linear = epsilon_linear.cuda() 125 | if batch_size == 1: 126 | epsilon_linear = self.epsilon_linear 127 | 128 | out = out_mu + torch.sqrt(out_var) * epsilon_linear 129 | return out 130 | else: 131 | z = self.sample_z(1, kl=False) 132 | weight_mu = z.view(-1, 1) * self.weight_mu 133 | out = torch.matmul(x, weight_mu) + self.bias_mu 134 | return out 135 | 136 | def kldiv(self): 137 | z, logdets = self.sample_z(1, kl=True) 138 | 139 | weight_mu = z.view(-1, 1) * self.weight_mu 140 | 141 | kldiv_weight = 0.5 * (- 2 * self.weight_logstd + torch.exp(2 * self.weight_logstd) 142 | + weight_mu * weight_mu - 1).sum() 143 | kldiv_bias = 0.5 * (- self.bias_logvar + torch.exp(self.bias_logvar) 144 | + self.bias_mu * self.bias_mu - 1).sum() 145 | 146 | 147 | logq = - 0.5 * self.qzero_logvar.sum() 148 | logq -= logdets 149 | 150 | cw_mu = torch.matmul(self.rzero_c, weight_mu) 151 | epsilon = Variable(torch.randn(self.out_features)) 152 | if self.use_cuda: 153 | epsilon = epsilon.cuda() 154 | cw_var = torch.matmul(self.rzero_c * self.rzero_c, torch.exp(2 * self.weight_logstd)) 155 | # import pdb 156 | # pdb.set_trace() 157 | cw = F.tanh(cw_mu + torch.sqrt(cw_var) * epsilon) 158 | 159 | mu_tilde = torch.mean(self.rzero_b1.ger(cw), dim=1) 160 | neg_log_var_tilde = torch.mean(self.rzero_b2.ger(cw), dim=1) 161 | 162 | z, logr = self.flow_r(z, kl=True) 163 | 164 | z_mu_square = (z - mu_tilde) * (z - mu_tilde) 165 | logr += 0.5 * (- torch.exp(neg_log_var_tilde) * z_mu_square 166 | + neg_log_var_tilde).sum() 167 | 168 | kldiv = kldiv_weight + kldiv_bias + logq - logr 169 | return kldiv 170 | -------------------------------------------------------------------------------- /qlearn/commun/local_mnf_toy_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
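The training-mode forward pass of the local MNFLinear above uses the local reparameterisation trick: rather than sampling a full weight matrix, it samples pre-activations from the Gaussian they induce, with mean (x * z) @ W_mu + b_mu and a variance built from the clamped weight and bias scales. A minimal PyTorch sketch of the same mechanic for a plain factorised-Gaussian linear layer (no auxiliary z, no flows, no clamping; shapes and names here are illustrative assumptions, not the layer's exact computation):

import torch

batch, in_features, out_features = 32, 1, 100
x = torch.randn(batch, in_features)

# Variational parameters of a factorised Gaussian over weights and biases.
weight_mu = torch.randn(in_features, out_features) * 0.1
weight_logstd = torch.full((in_features, out_features), -3.0)
bias_mu = torch.zeros(out_features)
bias_logvar = torch.full((out_features,), -6.0)

# Local reparameterisation: sample activations, not weights.
out_mean = x @ weight_mu + bias_mu
out_var = (x * x) @ torch.exp(2 * weight_logstd) + torch.exp(bias_logvar)
out = out_mean + torch.sqrt(out_var) * torch.randn(batch, out_features)
assert out.shape == (batch, out_features)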
6 | # 7 | 8 | 9 | import numpy as np 10 | import matplotlib 11 | matplotlib.use('Agg') 12 | import matplotlib.pylab as plt 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | from torch.autograd import Variable 18 | from torch import optim 19 | 20 | from qlearn.commun.local_mnf_layer import MNFLinear 21 | 22 | BAYES = True 23 | use_cuda = False 24 | 25 | seed = 60 26 | 27 | torch.cuda.manual_seed(seed) 28 | torch.manual_seed(seed) 29 | np.random.seed(seed) 30 | 31 | 32 | class RegressionModel(nn.Module): 33 | def __init__(self, use_cuda=False): 34 | nn.Module.__init__(self) 35 | #input_dim, output_dim, hidden_dim, n_hidden, n_flows_q, n_flows_r 36 | self.fc1 = MNFLinear(1, 100, 16, 0, 2, 2, use_cuda=use_cuda) 37 | self.fc2 = MNFLinear(100, 1, 16, 0, 2, 2, use_cuda=use_cuda) 38 | 39 | def forward(self, x, same_noise=False): 40 | x = self.fc1.forward(x, same_noise=same_noise) 41 | x = F.relu(x) 42 | x = self.fc2.forward(x, same_noise=same_noise) 43 | return x 44 | 45 | def reset_noise(self): 46 | self.fc1.reset_noise() 47 | self.fc2.reset_noise() 48 | 49 | def kldiv(self): 50 | kldiv1 = self.fc1.kldiv() 51 | kldiv2 = self.fc2.kldiv() 52 | return kldiv1 + kldiv2 53 | 54 | 55 | class MLP(nn.Module): 56 | def __init__(self): 57 | nn.Module.__init__(self) 58 | self.fc1 = nn.Linear(1, 100) 59 | self.fc2 = nn.Linear(100, 1) 60 | 61 | def forward(self, x): 62 | x = self.fc1(x) 63 | x = F.relu(x) 64 | x = self.fc2(x) 65 | return x 66 | 67 | if __name__ == '__main__': 68 | 69 | X = np.random.uniform(-4, 4, (20, 1)).astype('float32') 70 | # X = np.random.rand(20, 1).astype('float32') * 8 - 4 71 | sigma = 3 72 | epsilon = np.random.normal(size=X.shape).astype('float32') 73 | Y = np.power(X, 3) + sigma * epsilon 74 | 75 | if BAYES: 76 | regressor = RegressionModel(use_cuda=use_cuda) 77 | else: 78 | regressor = MLP() 79 | 80 | x = Variable(torch.from_numpy(X)) 81 | y = Variable(torch.from_numpy(Y)) 82 | if use_cuda: 83 | x = x.cuda() 84 | y = y.cuda() 85 | optimiser = optim.Adam(regressor.parameters(), lr=0.01) 86 | 87 | if use_cuda: 88 | regressor.cuda() 89 | # y = y.cuda() 90 | 91 | regressor.train() 92 | if BAYES: 93 | regressor.reset_noise() 94 | for epoch in range(1000): 95 | regressor.zero_grad() 96 | if BAYES: 97 | regressor.reset_noise() 98 | y_pred = regressor(x, same_noise=False) 99 | kldiv = regressor.kldiv() 100 | kl_reg = kldiv / 20.0 101 | # y_pred = regressor(x, kl=False) 102 | mse = F.mse_loss(y_pred, y) / (2 * 9) 103 | loss = mse + kl_reg 104 | # loss = mse 105 | else: 106 | loss = F.mse_loss(regressor(x), y) / (2 * 9) 107 | loss.backward() 108 | optimiser.step() 109 | if epoch % 10 == 0: 110 | if BAYES: 111 | print('epoch: {}, loss: {}, kl: {}, mse: {}'.format(epoch, loss.item(), kl_reg.item(), mse.item())) 112 | # print('epoch: {}, loss: {}'.format(epoch, loss.item())) 113 | else: 114 | print('epoch: {}, loss: {}'.format(epoch, loss.item())) 115 | 116 | n_test = 500 117 | x_test = np.linspace(-6, 6, n_test).reshape(n_test, 1).astype('float32') 118 | y_preds = [] 119 | 120 | regressor.train() 121 | # assert regressor.fc1.training == False 122 | 123 | X_TEST = Variable(torch.from_numpy(x_test)) 124 | if use_cuda: 125 | X_TEST = X_TEST.cuda() 126 | for _ in range(20): 127 | if BAYES: 128 | regressor.reset_noise() 129 | y_pred = regressor(X_TEST, same_noise=True) 130 | # y_pred = regressor(X_TEST) 131 | y_preds.append(y_pred.data.cpu().numpy()) 132 | else: 133 | y_preds.append(regressor(X_TEST).data.cpu().numpy()) 134 | y_preds = 
np.array(y_preds).reshape(20, n_test) 135 | y_preds_mean = np.mean(y_preds, axis=0) 136 | y_preds_var = np.std(y_preds, axis=0) 137 | 138 | 139 | plt.plot(x_test, y_preds_mean) 140 | if BAYES: 141 | plt.fill_between(x_test.reshape(n_test,), y_preds_mean - 3 * y_preds_var, y_preds_mean + 3 * y_preds_var, alpha=0.5) 142 | plt.plot(X, Y, 'x') 143 | plt.ylim(-100, 100) 144 | plt.savefig('local_mnf_toy_regression.png') 145 | # plt.show() 146 | -------------------------------------------------------------------------------- /qlearn/commun/mnf_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | from torch.autograd import Variable 13 | import torch.nn.functional as F 14 | 15 | from qlearn.commun.norm_flows import MaskedNVPFlow 16 | 17 | 18 | class MNFLinear(nn.Module): 19 | def __init__(self, in_features, out_features, hidden_dim, n_hidden, n_flows_q, n_flows_r, 20 | prior_var=1.0, threshold_var=0.5): 21 | nn.Module.__init__(self) 22 | self.in_features = in_features 23 | self.out_features = out_features 24 | self.hidden_dim = hidden_dim 25 | self.n_hidden = n_hidden 26 | self.n_flows_q = n_flows_q 27 | self.n_flows_r = n_flows_r 28 | self.prior_var = prior_var 29 | self.threshold_var = threshold_var 30 | 31 | self.weight_mu = nn.Parameter(torch.Tensor(out_features, in_features)) 32 | self.weight_logstd = nn.Parameter(torch.Tensor(out_features, in_features)) 33 | self.bias_mu = nn.Parameter(torch.Tensor(out_features)) 34 | self.bias_logvar = nn.Parameter(torch.Tensor(out_features)) 35 | 36 | self.qzero_mu = nn.Parameter(torch.Tensor(in_features)) 37 | self.qzero_logvar = nn.Parameter(torch.Tensor(in_features)) 38 | # auxiliary variable c, b1 and b2 are defined in equation (9) and (10) 39 | self.rzero_c = nn.Parameter(torch.Tensor(in_features)) 40 | self.rzero_b1 = nn.Parameter(torch.Tensor(in_features)) 41 | self.rzero_b2 = nn.Parameter(torch.Tensor(in_features)) 42 | 43 | self.flow_q = MaskedNVPFlow(in_features, hidden_dim, n_hidden, n_flows_q) 44 | self.flow_r = MaskedNVPFlow(in_features, hidden_dim, n_hidden, n_flows_r) 45 | 46 | self.register_buffer('epsilon_z', torch.Tensor(in_features)) 47 | self.register_buffer('epsilon_weight', torch.Tensor(out_features, in_features)) 48 | self.register_buffer('epsilon_bias', torch.Tensor(out_features)) 49 | self.reset_parameters() 50 | self.reset_noise() 51 | 52 | def reset_noise(self): 53 | epsilon_z = torch.randn(self.in_features) 54 | epsilon_weight = torch.randn(self.out_features, self.in_features) 55 | epsilon_bias = torch.randn(self.out_features) 56 | self.epsilon_z.copy_(epsilon_z) 57 | self.epsilon_weight.copy_(epsilon_weight) 58 | self.epsilon_bias.copy_(epsilon_bias) 59 | self.flow_q.reset_noise() 60 | self.flow_r.reset_noise() 61 | 62 | def reset_parameters(self): 63 | 64 | in_stdv = np.sqrt(4.0 / self.in_features) 65 | out_stdv = np.sqrt(4.0 / self.out_features) 66 | stdv2 = np.sqrt(4.0 / (self.in_features + self.out_features)) 67 | 68 | self.weight_mu.data.normal_(0, stdv2) 69 | self.weight_logstd.data.normal_(-9, 1e-3 * stdv2) 70 | self.bias_mu.data.zero_() 71 | self.bias_logvar.data.normal_(-9, 1e-3 * out_stdv) 72 | 73 | self.qzero_mu.data.normal_(1 if self.n_flows_q == 0 else 0, in_stdv) 74 | 
self.qzero_logvar.data.normal_(np.log(0.1), 1e-3 * in_stdv) 75 | self.rzero_c.data.normal_(0, in_stdv) 76 | self.rzero_b1.data.normal_(0, in_stdv) 77 | self.rzero_b2.data.normal_(0, in_stdv) 78 | 79 | def sample_z(self, kl=True): 80 | if self.training: 81 | qzero_std = torch.exp(0.5 * self.qzero_logvar) 82 | z = self.qzero_mu + qzero_std * self.epsilon_z 83 | else: 84 | z = self.qzero_mu 85 | if kl: 86 | z, logdets = self.flow_q(z, kl=True) 87 | return z, logdets 88 | else: 89 | z = self.flow_q(z, kl=False) 90 | return z 91 | 92 | def forward(self, input, kl=True): 93 | if self.training: 94 | if kl: 95 | z, logdets = self.sample_z(kl=True) 96 | else: 97 | z = self.sample_z(kl=False) 98 | weight_std = torch.clamp(torch.exp(self.weight_logstd), 0, self.threshold_var) 99 | bias_std = torch.clamp(torch.exp(0.5 * self.bias_logvar), 0, self.threshold_var) 100 | weight_mu = z.view(1, -1) * self.weight_mu 101 | weight = weight_mu + weight_std * self.epsilon_weight 102 | bias = self.bias_mu + bias_std * self.epsilon_bias 103 | out = F.linear(input, weight, bias) 104 | if not kl: 105 | return out 106 | else: 107 | kldiv_weight = 0.5 * (- 2 * self.weight_logstd + torch.exp(2 * self.weight_logstd) 108 | + weight_mu * weight_mu - 1).sum() 109 | kldiv_bias = 0.5 * (- self.bias_logvar + torch.exp(self.bias_logvar) 110 | + self.bias_mu * self.bias_mu - 1).sum() 111 | logq = - 0.5 * self.qzero_logvar.sum() 112 | logq -= logdets 113 | 114 | cw = F.tanh(torch.matmul(self.rzero_c, weight.t())) 115 | 116 | mu_tilde = torch.mean(self.rzero_b1.ger(cw), dim=1) 117 | neg_log_var_tilde = torch.mean(self.rzero_b2.ger(cw), dim=1) 118 | 119 | z, logr = self.flow_r(z) 120 | 121 | z_mu_square = (z - mu_tilde) * (z - mu_tilde) 122 | logr += 0.5 * (- torch.exp(neg_log_var_tilde) * z_mu_square 123 | + neg_log_var_tilde).sum() 124 | 125 | 126 | kldiv = kldiv_weight + kldiv_bias + logq - logr 127 | return out, kldiv 128 | else: 129 | assert kl == False 130 | z = self.sample_z(kl=False) 131 | weight_mu = z.view(1, -1) * self.weight_mu 132 | out = F.linear(input, weight_mu, self.bias_mu) 133 | return out 134 | -------------------------------------------------------------------------------- /qlearn/commun/mnf_toy_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
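As far as the objective is concerned, kldiv() in the MNF layer above returns a single-sample estimate (up to additive terms that carry no gradient with respect to the variational parameters) of the auxiliary-variable upper bound on the weight KL used by multiplicative normalizing flows:

\mathrm{KL}\big(q(W)\,\|\,p(W)\big) \;\le\; \mathbb{E}_{q(W, z)}\Big[ \mathrm{KL}\big(q(W \mid z) \,\|\, p(W)\big) + \log q(z) - \log r(z \mid W) \Big]

where q(z) is the flow-transformed multiplicative posterior (flow_q) and r(z | W) is the auxiliary inverse model parameterised by rzero_c, rzero_b1, rzero_b2 and flow_r; the code's kldiv_weight + kldiv_bias + logq - logr matches the three terms in that order.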
6 | # 7 | 8 | 9 | import numpy as np 10 | import matplotlib 11 | matplotlib.use('Agg') 12 | import matplotlib.pylab as plt 13 | 14 | import torch 15 | import torch.nn as nn 16 | import torch.nn.functional as F 17 | from torch.autograd import Variable 18 | from torch import optim 19 | 20 | from qlearn.commun.mnf_layer import MNFLinear 21 | 22 | BAYES = True 23 | use_cuda = False 24 | 25 | seed = 60 26 | 27 | torch.cuda.manual_seed(seed) 28 | torch.manual_seed(seed) 29 | np.random.seed(seed) 30 | 31 | 32 | class RegressionModel(nn.Module): 33 | def __init__(self): 34 | nn.Module.__init__(self) 35 | #input_dim, output_dim, hidden_dim, n_hidden, n_flows_q, n_flows_r 36 | self.fc1 = MNFLinear(1, 100, 16, 0, 2, 2) 37 | self.fc2 = MNFLinear(100, 1, 16, 0, 2, 2) 38 | 39 | def forward(self, x, kl=True): 40 | if self.training: 41 | if kl: 42 | x, kldiv1 = self.fc1.forward(x, kl=True) 43 | x = F.relu(x) 44 | x, kldiv2 = self.fc2.forward(x, kl=True) 45 | kldiv = kldiv1 + kldiv2 46 | return x, kldiv 47 | else: 48 | x = self.fc1.forward(x, kl=False) 49 | x = F.relu(x) 50 | x = self.fc2.forward(x, kl=False) 51 | return x 52 | else: 53 | x = self.fc1.forward(x, kl=False) 54 | x = F.relu(x) 55 | x = self.fc2.forward(x, kl=False) 56 | return x 57 | 58 | def reset_noise(self): 59 | self.fc1.reset_noise() 60 | self.fc2.reset_noise() 61 | 62 | 63 | class MLP(nn.Module): 64 | def __init__(self): 65 | nn.Module.__init__(self) 66 | self.fc1 = nn.Linear(1, 100) 67 | self.fc2 = nn.Linear(100, 1) 68 | 69 | def forward(self, x): 70 | x = self.fc1(x) 71 | x = F.relu(x) 72 | x = self.fc2(x) 73 | return x 74 | 75 | if __name__ == '__main__': 76 | 77 | X = np.random.uniform(-4, 4, (20, 1)).astype('float32') 78 | # X = np.random.rand(20, 1).astype('float32') * 8 - 4 79 | sigma = 3 80 | epsilon = np.random.normal(size=X.shape).astype('float32') 81 | Y = np.power(X, 3) + sigma * epsilon 82 | 83 | if BAYES: 84 | regressor = RegressionModel() 85 | else: 86 | regressor = MLP() 87 | 88 | x = Variable(torch.from_numpy(X)) 89 | y = Variable(torch.from_numpy(Y)) 90 | if use_cuda: 91 | x = x.cuda() 92 | y = y.cuda() 93 | optimiser = optim.Adam(regressor.parameters(), lr=0.01) 94 | 95 | if use_cuda: 96 | regressor.cuda() 97 | # y = y.cuda() 98 | 99 | regressor.train() 100 | if BAYES: 101 | regressor.reset_noise() 102 | for epoch in range(1000): 103 | regressor.zero_grad() 104 | if BAYES: 105 | regressor.reset_noise() 106 | y_pred, kldiv = regressor(x, kl=True) 107 | kl_reg = kldiv / 20.0 108 | # y_pred = regressor(x, kl=False) 109 | mse = F.mse_loss(y_pred, y) / (2 * 9) 110 | loss = mse + kl_reg 111 | # loss = mse 112 | else: 113 | loss = F.mse_loss(regressor(x), y) / (2 * 9) 114 | loss.backward() 115 | optimiser.step() 116 | if epoch % 10 == 0: 117 | if BAYES: 118 | print('epoch: {}, loss: {}, kl: {}, mse: {}'.format(epoch, loss.item(), kl_reg.item(), mse.item())) 119 | # print('epoch: {}, loss: {}'.format(epoch, loss.item())) 120 | else: 121 | print('epoch: {}, loss: {}'.format(epoch, loss.item())) 122 | 123 | n_test = 500 124 | x_test = np.linspace(-6, 6, n_test).reshape(n_test, 1).astype('float32') 125 | y_preds = [] 126 | 127 | regressor.train() 128 | # assert regressor.fc1.training == False 129 | 130 | X_TEST = Variable(torch.from_numpy(x_test)) 131 | if use_cuda: 132 | X_TEST = X_TEST.cuda() 133 | for _ in range(20): 134 | if BAYES: 135 | regressor.reset_noise() 136 | y_pred = regressor(X_TEST, kl=False) 137 | # y_pred = regressor(X_TEST) 138 | y_preds.append(y_pred.data.cpu().numpy()) 139 | else: 140 | 
y_preds.append(regressor(X_TEST).data.cpu().numpy()) 141 | y_preds = np.array(y_preds).reshape(20, n_test) 142 | y_preds_mean = np.mean(y_preds, axis=0) 143 | y_preds_var = np.std(y_preds, axis=0) 144 | 145 | 146 | plt.plot(x_test, y_preds_mean) 147 | if BAYES: 148 | plt.fill_between(x_test.reshape(n_test,), y_preds_mean - 3 * y_preds_var, y_preds_mean + 3 * y_preds_var, alpha=0.5) 149 | plt.plot(X, Y, 'x') 150 | plt.ylim(-100, 100) 151 | plt.savefig('global_mnf_toy_regression.png') 152 | # plt.show() -------------------------------------------------------------------------------- /qlearn/commun/noisy_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.nn import functional as F 14 | 15 | # this code is mostly copy-pasted from https://github.com/Kaixhin/Rainbow/blob/master/model.py 16 | class NoisyLinear(nn.Module): 17 | def __init__(self, in_features, out_features, std_init=0.4): 18 | super(NoisyLinear, self).__init__() 19 | self.in_features = in_features 20 | self.out_features = out_features 21 | self.std_init = std_init 22 | self.weight_mu = nn.Parameter(torch.empty(out_features, in_features)) 23 | self.weight_sigma = nn.Parameter(torch.empty(out_features, in_features)) 24 | self.register_buffer('weight_epsilon', torch.empty(out_features, in_features)) 25 | self.bias_mu = nn.Parameter(torch.empty(out_features)) 26 | self.bias_sigma = nn.Parameter(torch.empty(out_features)) 27 | self.register_buffer('bias_epsilon', torch.empty(out_features)) 28 | self.reset_parameters() 29 | self.reset_noise() 30 | 31 | def reset_parameters(self): 32 | mu_range = 1 / math.sqrt(self.in_features) 33 | self.weight_mu.data.uniform_(-mu_range, mu_range) 34 | self.weight_sigma.data.fill_(self.std_init / math.sqrt(self.in_features)) 35 | self.bias_mu.data.uniform_(-mu_range, mu_range) 36 | self.bias_sigma.data.fill_(self.std_init / math.sqrt(self.out_features)) 37 | 38 | def _scale_noise(self, size): 39 | x = torch.randn(size) 40 | return x.sign().mul_(x.abs().sqrt_()) 41 | 42 | def reset_noise(self): 43 | epsilon_in = self._scale_noise(self.in_features) 44 | epsilon_out = self._scale_noise(self.out_features) 45 | self.weight_epsilon.copy_(epsilon_out.ger(epsilon_in)) 46 | self.bias_epsilon.copy_(epsilon_out) 47 | 48 | def forward(self, input): 49 | if self.training: 50 | return F.linear(input, self.weight_mu + self.weight_sigma * self.weight_epsilon, self.bias_mu + self.bias_sigma * self.bias_epsilon) 51 | else: 52 | return F.linear(input, self.weight_mu, self.bias_mu) 53 | -------------------------------------------------------------------------------- /qlearn/commun/norm_flows.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
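NoisyLinear above uses factorised Gaussian noise: one input-sized and one output-sized noise vector are passed through f(x) = sign(x) * sqrt(|x|) and combined with an outer product, so only in_features + out_features draws are needed per weight matrix instead of in_features * out_features. A small PyTorch sketch of that factorisation:

import torch

def scale_noise(size):
    # f(x) = sign(x) * sqrt(|x|), as in NoisyLinear._scale_noise.
    x = torch.randn(size)
    return x.sign().mul_(x.abs().sqrt_())

in_features, out_features = 7, 5
epsilon_in = scale_noise(in_features)
epsilon_out = scale_noise(out_features)

# Outer product expands the two small vectors into the full weight-noise matrix.
weight_epsilon = epsilon_out.ger(epsilon_in)   # shape (out_features, in_features)
bias_epsilon = epsilon_out
assert weight_epsilon.shape == (out_features, in_features)

During training the effective weights are weight_mu + weight_sigma * weight_epsilon; at evaluation time only weight_mu and bias_mu are used.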
6 | # 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | from torch.autograd import Variable 12 | from qlearn.commun.utils import initialize_weights 13 | 14 | 15 | class SingleMaskedNVPFlow(nn.Module): 16 | def __init__(self, input_dim, hidden_dim, n_hidden): 17 | nn.Module.__init__(self) 18 | self.input_dim = input_dim 19 | self.hidden_dim = hidden_dim 20 | self.n_hidden = n_hidden 21 | 22 | self.first_layer = nn.Sequential( 23 | nn.Linear(input_dim, hidden_dim), 24 | nn.Tanh() 25 | ) 26 | 27 | hidden_modules = [] 28 | for _ in range(n_hidden): 29 | hidden_modules.append(nn.Linear(hidden_dim, hidden_dim)) 30 | hidden_modules.append(nn.Tanh()) 31 | 32 | self.hidden_layer = nn.Sequential(*hidden_modules) 33 | self.mu_layer = nn.Linear(hidden_dim, input_dim) 34 | self.sigma_layer = nn.Sequential( 35 | nn.Linear(hidden_dim, input_dim), 36 | nn.Sigmoid() 37 | ) 38 | self.register_buffer('mask', torch.Tensor(input_dim)) 39 | initialize_weights(self) 40 | 41 | def reset_noise(self): 42 | mask = torch.bernoulli(0.5*torch.ones(self.input_dim)) 43 | self.mask.copy_(mask) 44 | 45 | def forward(self, z, kl=True): 46 | if self.training: 47 | mask = self.mask 48 | else: 49 | mask = 0.5 50 | h = self.first_layer(mask * z) 51 | h = self.hidden_layer(h) 52 | mu = self.mu_layer(h) 53 | sigma = self.sigma_layer(h) 54 | z = (1 - mask) * (z * sigma + (1 - sigma) * mu) + mask * z 55 | if kl: 56 | if z.dim() == 1: 57 | logdet = ((1 - mask) * torch.log(sigma)).sum() 58 | else: 59 | logdet = ((1 - mask) * torch.log(sigma)).sum(1) 60 | return z, logdet 61 | else: 62 | return z 63 | 64 | 65 | class MaskedNVPFlow(nn.Module): 66 | def __init__(self, input_dim, hidden_dim, n_hidden, n_flows): 67 | nn.Module.__init__(self) 68 | self.input_dim = input_dim 69 | self.hidden_input = hidden_dim 70 | self.n_hidden = n_hidden 71 | self.n_flows = n_flows 72 | self.flow_list = nn.ModuleList([SingleMaskedNVPFlow(input_dim, hidden_dim, n_hidden) for _ in range(n_flows)]) 73 | 74 | def forward(self, z, kl=True): 75 | if kl: 76 | if z.dim() == 1: 77 | logdets = 0 78 | else: 79 | logdets = Variable(torch.zeros_like(z[:, 0])) 80 | for flow in self.flow_list: 81 | z, logdet = flow(z, kl=True) 82 | logdets += logdet 83 | return z, logdets 84 | else: 85 | for flow in self.flow_list: 86 | z = flow(z, kl=False) 87 | return z 88 | 89 | def reset_noise(self): 90 | for flow in self.flow_list: 91 | flow.reset_noise() 92 | -------------------------------------------------------------------------------- /qlearn/commun/toy_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | import numpy as np 9 | import matplotlib 10 | matplotlib.use('Agg') 11 | import matplotlib.pylab as plt 12 | 13 | import torch 14 | import torch.nn as nn 15 | import torch.nn.functional as F 16 | from torch.autograd import Variable 17 | from torch import optim 18 | # Copyright (c) Facebook, Inc. and its affiliates. 19 | # All rights reserved. 20 | # 21 | # This source code is licensed under the license found in the 22 | # LICENSE file in the root directory of this source tree. 
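Each SingleMaskedNVPFlow above is a RealNVP-style coupling layer with a random binary mask m: masked coordinates pass through unchanged, the rest receive an affine update gated by sigma, and the log-determinant involves only the updated coordinates. Restating what forward() computes:

z' = m \odot z + (1 - m) \odot \big( z \odot \sigma(h) + (1 - \sigma(h)) \odot \mu(h) \big), \qquad \log\Big|\det\frac{\partial z'}{\partial z}\Big| = \sum_i (1 - m_i)\,\log \sigma_i(h)

where h is the hidden representation of m \odot z; in evaluation mode the binary mask is replaced by the constant 0.5.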
23 | # 24 | 25 | 26 | from qlearn.commun.bayes_backprop_layer import BayesBackpropLinear 27 | 28 | BAYES = True 29 | use_cuda = False 30 | seed = 60 31 | 32 | torch.cuda.manual_seed(seed) 33 | torch.manual_seed(seed) 34 | np.random.seed(seed) 35 | 36 | 37 | class RegressionModel(nn.Module): 38 | def __init__(self): 39 | nn.Module.__init__(self) 40 | #input_dim, output_dim, hidden_dim, n_hidden, n_flows_q, n_flows_r 41 | self.fc1 = BayesBackpropLinear(1, 100) 42 | self.fc2 = BayesBackpropLinear(100, 1) 43 | 44 | def forward(self, x): 45 | x = self.fc1.forward(x) 46 | x = F.relu(x) 47 | x = self.fc2.forward(x) 48 | return x 49 | 50 | def get_reg(self): 51 | reg = self.fc1.kldiv() 52 | reg += self.fc2.kldiv() 53 | return reg 54 | 55 | def reset_noise(self): 56 | self.fc1.reset_noise() 57 | self.fc2.reset_noise() 58 | 59 | 60 | class MLP(nn.Module): 61 | def __init__(self): 62 | nn.Module.__init__(self) 63 | self.fc1 = nn.Linear(1, 100) 64 | self.fc2 = nn.Linear(100, 1) 65 | 66 | def forward(self, x): 67 | x = self.fc1(x) 68 | x = F.relu(x) 69 | x = self.fc2(x) 70 | return x 71 | 72 | #X = torch.Tensor(20, 1).uniform_(-4, 4) 73 | 74 | 75 | X = np.random.uniform(-4, 4, (20, 1)).astype('float32') 76 | sigma = 3 77 | epsilon = np.random.normal(size=X.shape).astype('float32') 78 | Y = np.power(X, 3) + sigma * epsilon 79 | 80 | if BAYES: 81 | regressor = RegressionModel() 82 | else: 83 | regressor = MLP() 84 | 85 | 86 | x = Variable(torch.from_numpy(X)) 87 | y = Variable(torch.from_numpy(Y)) 88 | if use_cuda: 89 | x = x.cuda() 90 | y = y.cuda() 91 | optimiser = optim.Adam(regressor.parameters(), lr=0.01) 92 | 93 | if use_cuda: 94 | regressor.cuda() 95 | # y = y.cuda() 96 | 97 | regressor.train() 98 | for epoch in range(1000): 99 | regressor.zero_grad() 100 | if BAYES: 101 | kl_reg = 1.0 * regressor.get_reg()/x.size()[0] 102 | regressor.reset_noise() 103 | mse = F.mse_loss(regressor(x), y) / (2 * 9) 104 | loss = mse + kl_reg 105 | else: 106 | loss = F.mse_loss(regressor(x), y) / (2 * 9) 107 | loss.backward() 108 | optimiser.step() 109 | if epoch % 10 == 0: 110 | if BAYES: 111 | print('epoch: {}, loss: {}, kl: {}, mse: {}'.format(epoch, loss.item(), kl_reg.item(), mse.item())) 112 | else: 113 | print('epoch: {}, loss: {}'.format(epoch, loss.item())) 114 | 115 | x_test = np.linspace(-6, 6, 100).reshape(100, 1).astype('float32') 116 | y_preds = [] 117 | 118 | # regressor.eval() 119 | # assert regressor.fc1.training == False 120 | 121 | X_TEST = Variable(torch.from_numpy(x_test)) 122 | if use_cuda: 123 | X_TEST = X_TEST.cuda() 124 | for _ in range(20): 125 | if BAYES: 126 | regressor.reset_noise() 127 | y_preds.append(regressor(X_TEST).data.cpu().numpy()) 128 | else: 129 | y_preds.append(regressor(X_TEST).data.cpu().numpy()) 130 | y_preds = np.array(y_preds).reshape(20, 100) 131 | y_preds_mean = np.mean(y_preds, axis=0) 132 | y_preds_var = np.std(y_preds, axis=0) 133 | 134 | 135 | plt.plot(x_test, y_preds_mean) 136 | if BAYES: 137 | plt.fill_between(x_test.reshape(100,), y_preds_mean - 3 * y_preds_var, y_preds_mean + 3 * y_preds_var, alpha=0.5) 138 | plt.plot(X, Y, 'x') 139 | plt.ylim(-100, 100) 140 | plt.savefig('bb_toy_regression.png') 141 | -------------------------------------------------------------------------------- /qlearn/commun/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 
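All of the toy regressions in this directory minimise the same objective: a Gaussian negative log-likelihood with the noise scale sigma = 3 used to generate the data (which is how the 1/(2 * 9) factor reads) plus the KL term scaled by 1/N for the N = 20 training points. Under that reading, and up to constants, the per-epoch loss is

\mathcal{L} = \frac{1}{N}\sum_{n=1}^{N} \frac{\big(y_n - f(x_n)\big)^2}{2\sigma^2} + \frac{1}{N}\,\mathrm{KL}\big(q(\theta)\,\|\,p(\theta)\big), \qquad \sigma = 3,\; N = 20.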
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import math 10 | import torch.nn as nn 11 | 12 | 13 | def initialize_weights(model): 14 | for m in model.modules(): 15 | if isinstance(m, nn.Conv2d): 16 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.in_channels 17 | m.weight.data.normal_(0, math.sqrt(4. / n)) 18 | if m.bias is not None: 19 | m.bias.data.zero_() 20 | elif isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.BatchNorm1d): 21 | m.weight.data.fill_(1) 22 | m.bias.data.zero_() 23 | elif isinstance(m, nn.Linear): 24 | n = m.in_features + m.out_features 25 | m.weight.data.normal_(0, math.sqrt(4. / n)) 26 | m.bias.data.zero_() 27 | 28 | 29 | def _norm(p, dim): 30 | """Computes the norm over all dimensions except dim""" 31 | if dim is None: 32 | return p.norm() 33 | elif dim == 0: 34 | output_size = (p.size(0),) + (1,) * (p.dim() - 1) 35 | return p.contiguous().view(p.size(0), -1).norm(dim=1).view(*output_size) 36 | elif dim == p.dim() - 1: 37 | output_size = (1,) * (p.dim() - 1) + (p.size(-1),) 38 | return p.contiguous().view(-1, p.size(-1)).norm(dim=0).view(*output_size) 39 | else: 40 | return _norm(p.transpose(0, dim), 0).transpose(0, dim) 41 | -------------------------------------------------------------------------------- /qlearn/commun/variational_dropout_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import math 10 | 11 | import torch 12 | import torch.nn as nn 13 | from torch.nn import functional as F 14 | from qlearn.commun.norm_flows import MaskedNVPFlow 15 | from qlearn.commun.utils import _norm 16 | 17 | 18 | class VariationalDropoutLinear(nn.Module): 19 | def __init__(self, in_features, out_features, hidden_dim, n_hidden, n_flows): 20 | super(VariationalDropoutLinear, self).__init__() 21 | self.in_features = in_features 22 | self.out_features = out_features 23 | self.n_flows = n_flows 24 | self.hidden_dim = hidden_dim 25 | self.n_hidden = n_hidden 26 | self.n_flows = n_flows 27 | self.direction = nn.Parameter(torch.empty(out_features, in_features)) 28 | self.bias = nn.Parameter(torch.empty(out_features)) 29 | self.register_buffer('gzero_epsilon', torch.empty(out_features)) 30 | self.gzero_mu = nn.Parameter(torch.Tensor(out_features)) 31 | self.gzero_logsigma = nn.Parameter(torch.Tensor(out_features)) 32 | self.flow = MaskedNVPFlow(out_features, hidden_dim, n_hidden, n_flows) 33 | 34 | self.reset_parameters() 35 | self.reset_noise() 36 | 37 | def reset_parameters(self): 38 | out_stdv = math.sqrt(4.0 / self.out_features) 39 | stdv2 = math.sqrt(4.0 / (self.in_features + self.out_features)) 40 | self.direction.data.normal_(0, stdv2) 41 | self.bias.data.normal_(0, out_stdv) 42 | self.gzero_mu.data.normal_(0, out_stdv) 43 | self.gzero_logsigma.data.normal_(math.log(0.1), 1e-3 * out_stdv) 44 | 45 | def reset_noise(self): 46 | self.gzero_epsilon.copy_(torch.randn(self.out_features)) 47 | self.flow.reset_noise() 48 | 49 | def sample_g(self): 50 | if self.training: 51 | gzero_sigma = F.softplus(self.gzero_logsigma) 52 | gzero = self.gzero_mu + gzero_sigma * self.gzero_epsilon 53 | g, logdets = self.flow(gzero) 54 | logq = - torch.log(gzero_sigma).sum() 55 | logq -= logdets[0] 56 | logp = - 0.5 
* torch.sum(g * g) 57 | kldiv = logq - logp 58 | return g, kldiv 59 | else: 60 | gzero = self.gzero_mu 61 | g = self.flow(gzero) 62 | return g 63 | 64 | def forward(self, input): 65 | if self.training: 66 | g, kldiv = self.sample_g() 67 | # weight = self.direction * (g.view(-1, 1) / _norm(self.direction, dim=0)) 68 | weight = self.direction / _norm(self.direction, dim=0) 69 | out = g.view(1, -1) * F.linear(input, weight, self.bias) 70 | # out = F.linear(input, weight, self.bias) 71 | return out, kldiv 72 | else: 73 | g = self.sample_g() 74 | weight = self.direction * (g.view(-1, 1) / _norm(self.direction, dim=0)) 75 | out = F.linear(input, weight, self.bias) 76 | return out 77 | -------------------------------------------------------------------------------- /qlearn/envs/nchain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import gym 10 | from gym import spaces 11 | import numpy as np 12 | 13 | 14 | class NChainEnv(gym.Env): 15 | """n-Chain environment 16 | The environment consists of a chain of N states and the agent always starts in state s2, 17 | from where it can either move left or right. 18 | In state s1, the agent receives a small reward of r = 0.001 and a larger reward r = 1 in state sN. 19 | This environment is described in 20 | Deep Exploration via Bootstrapped DQN(https://papers.nips.cc/paper/6501-deep-exploration-via-bootstrapped-dqn.pdf) 21 | """ 22 | def __init__(self, n): 23 | self.n = n 24 | self.state = 1 # Start at state s2 25 | self.action_space = spaces.Discrete(2) 26 | self.observation_space = spaces.Discrete(self.n) 27 | self.max_nsteps = n + 8 28 | 29 | def step(self, action): 30 | assert self.action_space.contains(action) 31 | v = np.arange(self.n) 32 | reward = lambda s, a: 1.0 if (s == (self.n - 1) and a == 1) else (0.001 if (s == 0 and a == 0) else 0) 33 | is_done = lambda nsteps: nsteps >= self.max_nsteps 34 | 35 | r = reward(self.state, action) 36 | if action: # forward 37 | if self.state != self.n - 1: 38 | self.state += 1 39 | else: # backward 40 | if self.state != 0: 41 | self.state -= 1 42 | self.nsteps += 1 43 | return (v <= self.state).astype('float32'), r, is_done(self.nsteps), None 44 | 45 | def reset(self): 46 | v = np.arange(self.n) 47 | self.state = 1 48 | self.nsteps = 0 49 | return (v <= self.state).astype('float32') 50 | -------------------------------------------------------------------------------- /qlearn/toys/agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
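NChainEnv above encodes the state as a cumulative ("thermometer") vector, with observation[i] = 1 for every i <= current state; the agent starts in s2 (index 1) and an episode lasts n + 8 steps. A minimal interaction sketch (assuming gym is installed and the package is importable under the repo layout):

import numpy as np
from qlearn.envs.nchain import NChainEnv

env = NChainEnv(8)
obs = env.reset()
# Start state is s2 (index 1): the first two entries of the thermometer code are 1.
assert np.array_equal(obs, np.array([1., 1., 0., 0., 0., 0., 0., 0.], dtype=np.float32))

done = False
total_reward, steps = 0.0, 0
while not done:
    obs, reward, done, _ = env.step(1)   # always move right
    total_reward += reward
    steps += 1

# Moving right reaches s_N and then keeps collecting r = 1 there;
# the episode ends after n + 8 steps (16 here).
assert steps == env.max_nsteps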
6 | # 7 | 8 | 9 | 10 | import os 11 | import random 12 | import numpy as np 13 | import torch 14 | import torch.nn.functional as F 15 | from torch import optim 16 | from torch.autograd import Variable 17 | from qlearn.toys.model import DQN 18 | 19 | 20 | class Agent(): 21 | def __init__(self, args, env): 22 | self.action_space = env.action_space.n 23 | self.batch_size = args.batch_size 24 | self.discount = args.discount 25 | self.double_q = args.double_q 26 | 27 | self.online_net = DQN(args, self.action_space) 28 | if args.model and os.path.isfile(args.model): 29 | self.online_net.load_state_dict(torch.load(args.model)) 30 | self.online_net.train() 31 | 32 | self.target_net = DQN(args, self.action_space) 33 | self.update_target_net() 34 | self.target_net.eval() 35 | for param in self.target_net.parameters(): 36 | param.requires_grad = False 37 | 38 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) 39 | if args.cuda: 40 | self.online_net.cuda() 41 | self.target_net.cuda() 42 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 43 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 44 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 45 | 46 | 47 | # Acts based on single state (no batch) 48 | def act(self, state): 49 | self.online_net.eval() 50 | state = Variable(self.FloatTensor(state)) 51 | return self.online_net(state).data.max(1)[1][0] 52 | 53 | # Acts with an epsilon-greedy policy 54 | def act_e_greedy(self, state, epsilon=0.01): 55 | return random.randrange(self.action_space) if random.random() < epsilon else self.act(state) 56 | 57 | def update_target_net(self): 58 | self.target_net.load_state_dict(self.online_net.state_dict()) 59 | 60 | def learn(self, states, actions, rewards, next_states, terminals): 61 | self.online_net.train() 62 | self.target_net.eval() 63 | states = Variable(self.FloatTensor(states)) 64 | actions = Variable(self.LongTensor(actions)) 65 | next_states = Variable(self.FloatTensor(next_states)) 66 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 67 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 68 | 69 | # import pdb 70 | # pdb.set_trace() 71 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 72 | # columns of actions taken 73 | state_action_values = self.online_net(states).gather(1, actions.view(-1, 1)) 74 | if self.double_q: 75 | next_actions = self.online_net(next_states).max(1)[1] 76 | next_state_values = self.target_net(next_states).gather(1, next_actions.view(-1, 1)) 77 | else: 78 | next_state_values = self.target_net(next_states).max(1)[0] 79 | 80 | # Compute V(s_{t+1}) for all next states. 
81 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values.view(-1, 1) 82 | # Undo volatility (which was used to prevent unnecessary gradients) 83 | #target_state_action_values = Variable(target_state_action_values.data) 84 | 85 | # Compute Huber loss 86 | loss = F.smooth_l1_loss(state_action_values, target_state_action_values.detach()) 87 | # Optimize the model 88 | self.optimiser.zero_grad() 89 | loss.backward() 90 | for param in self.online_net.parameters(): 91 | param.grad.data.clamp_(-1, 1) 92 | self.optimiser.step() 93 | return loss 94 | -------------------------------------------------------------------------------- /qlearn/toys/bayes_backprop_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import random 11 | import numpy as np 12 | import torch 13 | import torch.nn.functional as F 14 | from torch import optim 15 | from torch.autograd import Variable 16 | from qlearn.toys.model import BayesBackpropDQN 17 | 18 | 19 | class BayesBackpropAgent(): 20 | def __init__(self, args, env): 21 | self.action_space = env.action_space.n 22 | self.batch_size = args.batch_size 23 | self.discount = args.discount 24 | self.double_q = args.double_q 25 | 26 | self.online_net = BayesBackpropDQN(args, self.action_space) 27 | if args.model and os.path.isfile(args.model): 28 | self.online_net.load_state_dict(torch.load(args.model)) 29 | self.online_net.train() 30 | 31 | self.target_net = BayesBackpropDQN(args, self.action_space) 32 | self.update_target_net() 33 | self.target_net.train() 34 | for param in self.target_net.parameters(): 35 | param.requires_grad = False 36 | 37 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) 38 | if args.cuda: 39 | self.online_net.cuda() 40 | self.target_net.cuda() 41 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 42 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 43 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 44 | 45 | # Acts based on single state (no batch) 46 | def act(self, state, eval=False): 47 | if eval: 48 | self.online_net.eval() 49 | else: 50 | self.online_net.train() 51 | # self.online_net.reset_noise() 52 | state = Variable(self.FloatTensor(state)) 53 | return self.online_net(state).data.max(1)[1][0] 54 | 55 | def update_target_net(self): 56 | self.target_net.load_state_dict(self.online_net.state_dict()) 57 | 58 | def learn(self, states, actions, rewards, next_states, terminals): 59 | self.online_net.train() 60 | self.target_net.train() 61 | self.online_net.reset_noise() 62 | self.target_net.reset_noise() 63 | states = Variable(self.FloatTensor(states)) 64 | actions = Variable(self.LongTensor(actions)) 65 | next_states = Variable(self.FloatTensor(next_states)) 66 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 67 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 68 | 69 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 70 | # columns of actions taken 71 | 72 | state_action_values = self.online_net(states).gather(1, actions.view(-1, 1)) 73 | 74 | if self.double_q: 75 | next_actions = self.online_net(next_states).max(1)[1] 76 | next_state_values = 
self.target_net(next_states).gather(1, next_actions.view(-1, 1)) 77 | else: 78 | next_state_values = self.target_net(next_states).max(1)[0].view(-1, 1) 79 | 80 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values 81 | # import pdb 82 | # pdb.set_trace() 83 | mse = F.smooth_l1_loss(state_action_values, target_state_action_values.detach()) 84 | kl_reg = self.online_net.get_reg() 85 | loss = mse + kl_reg / self.batch_size 86 | # Optimize the model 87 | self.optimiser.zero_grad() 88 | loss.backward() 89 | for param in self.online_net.parameters(): 90 | param.grad.data.clamp_(-1, 1) 91 | self.optimiser.step() 92 | 93 | return loss 94 | -------------------------------------------------------------------------------- /qlearn/toys/bootstrapped_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import random 11 | import numpy as np 12 | import torch 13 | import torch.nn.functional as F 14 | from torch import optim 15 | from torch.autograd import Variable 16 | from qlearn.toys.model import BoostrappedDQN 17 | from collections import Counter 18 | 19 | 20 | class BootstrappedAgent(): 21 | def __init__(self, args, env): 22 | self.action_space = env.action_space.n 23 | self.batch_size = args.batch_size 24 | self.discount = args.discount 25 | self.nheads = args.nheads 26 | self.double_q = args.double_q 27 | 28 | self.online_net = BoostrappedDQN(args, self.action_space) 29 | if args.model and os.path.isfile(args.model): 30 | self.online_net.load_state_dict(torch.load(args.model)) 31 | self.online_net.train() 32 | 33 | self.target_net = BoostrappedDQN(args, self.action_space) 34 | self.update_target_net() 35 | self.target_net.eval() 36 | for param in self.target_net.parameters(): 37 | param.requires_grad = False 38 | 39 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) 40 | if args.cuda: 41 | self.online_net.cuda() 42 | self.target_net.cuda() 43 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 44 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 45 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 46 | 47 | # Acts based on single state (no batch) 48 | def act_single_head(self, state, k): 49 | self.online_net.eval() 50 | state = Variable(self.FloatTensor(state / 255.0)) 51 | return self.online_net.forward_single_head(state, k).data.max(1)[1][0] 52 | 53 | def act(self, state): 54 | self.online_net.eval() 55 | state = Variable(self.FloatTensor(state / 255.0)) 56 | outputs = self.online_net.forward(state) 57 | actions = [] 58 | for k in range(self.online_net.nheads): 59 | actions.append(int(outputs[k].data.max(1)[1][0])) 60 | action, _ = Counter(actions).most_common()[0] 61 | return action 62 | 63 | # Acts with an epsilon-greedy policy 64 | def act_e_greedy(self, state, k, epsilon=0.01): 65 | return random.randrange(self.action_space) if random.random() < epsilon else self.act_single_head(state, k) 66 | 67 | def update_target_net(self): 68 | self.target_net.load_state_dict(self.online_net.state_dict()) 69 | 70 | def learn(self, states, actions, rewards, next_states, terminals): 71 | self.online_net.train() 72 | self.target_net.eval() 73 | states = 
Variable(self.FloatTensor(states)) 74 | actions = Variable(self.LongTensor(actions)) 75 | next_states = Variable(self.FloatTensor(next_states)) 76 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 77 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 78 | 79 | # import pdb 80 | # pdb.set_trace() 81 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 82 | # columns of actions taken 83 | online_outputs = self.online_net(states) 84 | target_outputs = self.target_net(next_states) 85 | loss = 0 86 | # import pdb 87 | # pdb.set_trace() 88 | for k in range(self.nheads): 89 | state_action_values = online_outputs[k].gather(1, actions.view(-1, 1)) 90 | 91 | # Compute V(s_{t+1}) for all next states. 92 | if self.double_q: 93 | next_actions = online_outputs[k].max(1)[1] 94 | next_state_values = target_outputs[k].gather(1, next_actions.view(-1, 1)) 95 | else: 96 | next_state_values = target_outputs[k].max(1)[0].view(-1, 1) 97 | 98 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values.view(-1, 1) 99 | 100 | # Compute Huber loss 101 | loss += F.smooth_l1_loss(state_action_values, target_state_action_values.detach()) 102 | # loss /= args.nheads 103 | # Optimize the model 104 | self.optimiser.zero_grad() 105 | loss.backward() 106 | for param in self.online_net.parameters(): 107 | param.grad.data.clamp_(-1, 1) 108 | self.optimiser.step() 109 | return loss 110 | -------------------------------------------------------------------------------- /qlearn/toys/main_nchain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import argparse 10 | import time 11 | from datetime import datetime 12 | import random 13 | import numpy as np 14 | import math 15 | from collections import Counter 16 | import torch 17 | from torch.autograd import Variable 18 | from tensorboardX import SummaryWriter 19 | from baselines.common.schedules import LinearSchedule 20 | from baselines.deepq.replay_buffer import ReplayBuffer 21 | 22 | from qlearn.toys.agent import Agent 23 | from qlearn.toys.bootstrapped_agent import BootstrappedAgent 24 | from qlearn.toys.bayes_backprop_agent import BayesBackpropAgent 25 | from qlearn.toys.noisy_agent import NoisyAgent 26 | from qlearn.toys.mnf_agent import MNFAgent 27 | from qlearn.envs.nchain import NChainEnv 28 | # from qlearn.toys.memory import ReplayBuffer 29 | from qlearn.toys.test import test 30 | 31 | 32 | parser = argparse.ArgumentParser(description='DQN') 33 | parser.add_argument('--seed', type=int, default=510, help='Random seed') 34 | parser.add_argument('--cuda', type=int, default=1, help='use cuda') 35 | parser.add_argument('--max-steps', type=int, default=int(50e6), metavar='STEPS', help='Number of training steps') 36 | 37 | parser.add_argument('--evaluation-episodes', type=int, default=1, metavar='N', help='Number of evaluation episodes to average over') 38 | parser.add_argument('--model', type=str, metavar='PARAMS', help='Pretrained model (state dict)') 39 | parser.add_argument('--replay_buffer_size', type=int, default=int(10000), metavar='CAPACITY', help='Experience replay memory capacity') 40 | parser.add_argument('--learning-freq', type=int, default=10, metavar='k', help='Frequency of sampling from memory') 41 | parser.add_argument("--learning-starts", type=int, default=32, help="number of iterations after which learning starts") 42 | parser.add_argument('--discount', type=float, default=0.999, metavar='GAMMA', help='Discount factor') 43 | parser.add_argument('--target-update-freq', type=int, default=100, metavar='TAU', help='Number of steps after which to update target network') 44 | parser.add_argument('--lr', type=float, default=0.001, metavar='ETA', help='Learning rate') 45 | parser.add_argument('--adam-eps', type=float, default=1.5e-4, metavar='EPSILON', help='Adam epsilon') 46 | parser.add_argument('--batch-size', type=int, default=32, metavar='SIZE', help='Batch size') 47 | parser.add_argument('--input-dim', type=int, default=8, help='the length of chain environment') 48 | parser.add_argument('--evaluation-interval', type=int, default=10, metavar='STEPS', help='Number of training steps between evaluations') 49 | parser.add_argument('--nheads', type=int, default=10, help='number of heads in Bootstrapped DQN') 50 | parser.add_argument('--agent', type=str, default='DQN', help='type of agent') 51 | parser.add_argument('--final-exploration', type=float, default=0.1, help='last value of epsilon') 52 | parser.add_argument('--final-exploration-step', type=float, default=1000, help='horizon of epsilon schedule') 53 | parser.add_argument('--max-episodes', type=int, default=int(2e3), metavar='EPISODES', help='Number of training episodes') 54 | parser.add_argument('--hidden_dim', type=int, default=int(16), help='number of hidden unit used in normalizing flows') 55 | parser.add_argument('--n-hidden', type=int, default=int(0), help='number of hidden layer used in normalizing flows') 56 | parser.add_argument('--n-flows-q', type=int, default=int(1), help='number of normalizing flows using for the approximate posterior q') 57 | parser.add_argument('--n-flows-r', 
type=int, default=int(1), help='number of normalizing flows using for auxiliary posterior r') 58 | parser.add_argument('--logdir', type=str, default='logs', help='log directory') 59 | parser.add_argument('--double-q', type=int, default=1, help='whether or not to use Double DQN') 60 | 61 | # Setup 62 | args = parser.parse_args() 63 | assert args.agent in ['DQN', 'BootstrappedDQN', 'NoisyDQN', 'BayesBackpropDQN', 'MNFDQN'] 64 | 65 | print(' ' * 26 + 'Options') 66 | for k, v in vars(args).items(): 67 | print(' ' * 26 + k + ': ' + str(v)) 68 | 69 | random.seed(args.seed) 70 | np.random.seed(args.seed) 71 | torch.manual_seed(args.seed) 72 | if args.cuda: 73 | torch.cuda.manual_seed(args.seed) 74 | 75 | # Environment 76 | env = NChainEnv(args.input_dim) 77 | action_space = env.action_space.n 78 | 79 | # Log 80 | date = time.strftime('%Y-%m-%d.%H%M') 81 | run_dir = '{}/{}-{}-{}'.format(args.logdir, 'Nchain', args.agent, date) 82 | 83 | log = SummaryWriter(run_dir) 84 | print('Writing logs to {}'.format(run_dir)) 85 | 86 | # Agent 87 | if args.agent == 'BootstrappedDQN': 88 | dqn = BootstrappedAgent(args, env) 89 | elif args.agent == 'NoisyDQN': 90 | dqn = NoisyAgent(args, env) 91 | elif args.agent == 'BayesBackpropDQN': 92 | dqn = BayesBackpropAgent(args, env) 93 | elif args.agent == 'MNFDQN': 94 | dqn = MNFAgent(args, env) 95 | else: 96 | dqn = Agent(args, env) 97 | 98 | replay_buffer = ReplayBuffer(args.replay_buffer_size) 99 | # mem = ReplayBuffer(args.memory_capacity) 100 | 101 | # schedule of epsilon annealing 102 | exploration = LinearSchedule(args.final_exploration_step, args.final_exploration, 1) 103 | 104 | # import pdb 105 | # pdb.set_trace() 106 | 107 | # Training loop 108 | dqn.online_net.train() 109 | timestamp = 0 110 | for episode in range(args.max_episodes): 111 | 112 | epsilon = exploration.value(episode) 113 | 114 | state, done = env.reset(), False 115 | if args.agent == 'BootstrappedDQN': 116 | k = random.randrange(args.nheads) 117 | elif args.agent == 'VariationalDQN': 118 | dqn.online_net.freeze_noise() 119 | elif args.agent == 'BayesBackpropDQN': 120 | dqn.online_net.reset_noise() 121 | elif args.agent == 'MNFDQN': 122 | dqn.online_net.reset_noise() 123 | while not done: 124 | timestamp += 1 125 | 126 | if args.agent == 'BootstrappedDQN': 127 | action = dqn.act_single_head(state[None], k) 128 | elif args.agent in ['NoisyDQN', 'BayesBackpropDQN', 'MNFDQN']: 129 | action = dqn.act(state[None], eval=False) 130 | elif args.agent == 'DQN': 131 | action = dqn.act_e_greedy(state[None], epsilon=epsilon) 132 | 133 | next_state, reward, done, _ = env.step(int(action)) 134 | # Store the transition in memory 135 | replay_buffer.add(state, action, reward, next_state, float(done)) 136 | 137 | # Move to the next state 138 | state = next_state 139 | # 140 | if timestamp % args.target_update_freq == 0: 141 | dqn.update_target_net() 142 | 143 | if timestamp > args.learning_starts: 144 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(args.batch_size) 145 | loss = dqn.learn(obses_t, actions, rewards, obses_tp1, dones) 146 | log.add_scalar('loss', loss, timestamp) 147 | 148 | # if episode % 10 == 0: 149 | # visited = [] 150 | # for transition in replay_buffer.memory: 151 | # visited.append(transition.state.sum()) 152 | # print(Counter(visited)) 153 | 154 | if episode > 4: 155 | avg_reward = test(args, env, dqn) # Test 156 | print('episode: ' + str(episode) + ', Avg. 
reward: ' + str(round(avg_reward, 4))) 157 | -------------------------------------------------------------------------------- /qlearn/toys/mnf_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import random 11 | import numpy as np 12 | import torch 13 | import torch.nn.functional as F 14 | from torch import optim 15 | from torch.autograd import Variable 16 | from qlearn.toys.model import MNFDQN 17 | 18 | 19 | class MNFAgent(): 20 | def __init__(self, args, env): 21 | self.action_space = env.action_space.n 22 | self.batch_size = args.batch_size 23 | self.discount = args.discount 24 | self.double_q = args.double_q 25 | self.kl_coeff = 1.0 / args.replay_buffer_size 26 | 27 | self.online_net = MNFDQN(args, self.action_space) 28 | if args.model and os.path.isfile(args.model): 29 | self.online_net.load_state_dict(torch.load(args.model)) 30 | self.online_net.train() 31 | 32 | self.target_net = MNFDQN(args, self.action_space) 33 | self.update_target_net() 34 | self.target_net.train() 35 | for param in self.target_net.parameters(): 36 | param.requires_grad = False 37 | 38 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) 39 | if args.cuda: 40 | self.online_net.cuda() 41 | self.target_net.cuda() 42 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 43 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 44 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 45 | 46 | # Acts based on single state (no batch) 47 | def act(self, state, eval=False): 48 | if eval: 49 | self.online_net.eval() 50 | else: 51 | self.online_net.train() 52 | # self.online_net.reset_noise() 53 | state = Variable(self.FloatTensor(state)) 54 | return self.online_net(state, same_noise=True).data.max(1)[1][0] 55 | 56 | def update_target_net(self): 57 | self.target_net.load_state_dict(self.online_net.state_dict()) 58 | 59 | def learn(self, states, actions, rewards, next_states, terminals): 60 | self.online_net.train() 61 | self.target_net.eval() 62 | self.online_net.reset_noise() 63 | states = Variable(self.FloatTensor(states)) 64 | actions = Variable(self.LongTensor(actions)) 65 | next_states = Variable(self.FloatTensor(next_states)) 66 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 67 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 68 | 69 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 70 | # columns of actions taken 71 | 72 | state_values = self.online_net(states, same_noise=False) 73 | kldiv = self.online_net.kldiv() 74 | state_action_values = state_values.gather(1, actions.view(-1, 1)) 75 | 76 | if self.double_q: 77 | next_actions = self.online_net(next_states, same_noise=False).max(1)[1] 78 | next_state_values = self.target_net(next_states, same_noise=False).gather(1, next_actions.view(-1, 1)) 79 | else: 80 | next_state_values = self.target_net(next_states).max(1)[0] 81 | 82 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values.view(-1, 1) 83 | 84 | td_errors = F.smooth_l1_loss(state_action_values, target_state_action_values.detach(), size_average=True) 85 | 86 | loss = td_errors + self.kl_coeff * kldiv 87 | 88 | # Optimize the model 89 
| self.optimiser.zero_grad() 90 | loss.backward() 91 | for param in self.online_net.parameters(): 92 | param.grad.data.clamp_(-1, 1) 93 | self.optimiser.step() 94 | 95 | return loss 96 | -------------------------------------------------------------------------------- /qlearn/toys/model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | from torch.autograd import Variable 12 | import torch.nn.functional as F 13 | 14 | from qlearn.commun.utils import initialize_weights 15 | from qlearn.commun.noisy_layer import NoisyLinear 16 | from qlearn.commun.bayes_backprop_layer import BayesBackpropLinear 17 | from qlearn.commun.local_mnf_layer import MNFLinear 18 | 19 | 20 | class DQN(nn.Module): 21 | def __init__(self, args, action_space): 22 | nn.Module.__init__(self) 23 | self.features = nn.Sequential( 24 | nn.Linear(args.input_dim, 16), 25 | nn.ReLU(inplace=True), 26 | nn.Linear(16, 16), 27 | nn.ReLU(inplace=True) 28 | ) 29 | self.last_layer = nn.Linear(16, action_space) 30 | initialize_weights(self) 31 | 32 | def forward(self, x): 33 | x = self.features(x) 34 | x = self.last_layer(x) 35 | return x 36 | 37 | 38 | class BoostrappedDQN(nn.Module): 39 | def __init__(self, args, action_space): 40 | nn.Module.__init__(self) 41 | # self.features = nn.Sequential( 42 | # nn.Linear(args.input_dim, 16), 43 | # nn.ReLU(inplace=True) 44 | # ) 45 | self.nheads = args.nheads 46 | self.heads = nn.ModuleList([nn.Sequential(nn.Linear(args.input_dim, 16), 47 | nn.ReLU(inplace=True), 48 | nn.Linear(16, 16), 49 | nn.ReLU(inplace=True), 50 | nn.Linear(16, action_space)) for _ in range(args.nheads)]) 51 | 52 | initialize_weights(self) 53 | 54 | def forward_single_head(self, x, k): 55 | # x = self.features(x) 56 | x = self.heads[k](x) 57 | return x 58 | 59 | def forward(self, x): 60 | # x = self.features(x) 61 | out = [] 62 | for head in self.heads: 63 | out.append(head(x)) 64 | return out 65 | 66 | 67 | class MNFDQN(nn.Module): 68 | def __init__(self, args, action_space): 69 | nn.Module.__init__(self) 70 | self.fc1 = MNFLinear(args.input_dim, 16, args.hidden_dim, args.n_hidden, args.n_flows_q, args.n_flows_r, use_cuda=args.cuda) 71 | self.fc2 = MNFLinear(16, 16, args.hidden_dim, args.n_hidden, args.n_flows_q, args.n_flows_r, use_cuda=args.cuda) 72 | self.fc3 = MNFLinear(16, action_space, args.hidden_dim, args.n_hidden, args.n_flows_q, args.n_flows_r, use_cuda=args.cuda) 73 | 74 | def forward(self, x, same_noise=False): 75 | x = F.relu(self.fc1(x, same_noise=same_noise)) 76 | x = F.relu(self.fc2(x, same_noise=same_noise)) 77 | x = self.fc3(x, same_noise=same_noise) 78 | return x 79 | 80 | def kldiv(self): 81 | kldiv1 = self.fc1.kldiv() 82 | kldiv2 = self.fc2.kldiv() 83 | kldiv3 = self.fc3.kldiv() 84 | return kldiv1 + kldiv2 + kldiv3 85 | 86 | def reset_noise(self): 87 | self.fc1.reset_noise() 88 | self.fc2.reset_noise() 89 | self.fc3.reset_noise() 90 | 91 | # def forward(self, x, kl=True): 92 | # if kl: 93 | # x, kldiv1 = self.fc1(x, kl=True) 94 | # x = F.relu(x) 95 | # x, kldiv2 = self.fc2(x, kl=True) 96 | # x = F.relu(x) 97 | # x, kldiv3 = self.fc3(x, kl=True) 98 | # kldiv = kldiv1 + kldiv2 + kldiv3 99 | # return x, kldiv 100 | # else: 101 | # x = F.relu(self.fc1(x, kl=False)) 102 | # x = F.relu(self.fc2(x, 
kl=False)) 103 | # x = self.fc3(x, kl=False) 104 | # return x 105 | 106 | 107 | class NoisyDQN(nn.Module): 108 | def __init__(self, args, action_space): 109 | nn.Module.__init__(self) 110 | self.fc1 = NoisyLinear(args.input_dim, 16) 111 | self.fc2 = NoisyLinear(16, 16) 112 | self.fc3 = NoisyLinear(16, action_space) 113 | 114 | def forward(self, x): 115 | x = F.relu(self.fc1(x)) 116 | x = F.relu(self.fc2(x)) 117 | x = self.fc3(x) 118 | return x 119 | 120 | def reset_noise(self): 121 | self.fc1.reset_noise() 122 | self.fc2.reset_noise() 123 | self.fc3.reset_noise() 124 | 125 | 126 | class BayesBackpropDQN(nn.Module): 127 | def __init__(self, args, action_space): 128 | nn.Module.__init__(self) 129 | self.fc1 = BayesBackpropLinear(args.input_dim, 16) 130 | self.fc2 = BayesBackpropLinear(16, 16) 131 | self.fc3 = BayesBackpropLinear(16, action_space) 132 | 133 | def forward(self, x): 134 | x = F.relu(self.fc1(x)) 135 | x = F.relu(self.fc2(x)) 136 | x = self.fc3(x) 137 | return x 138 | 139 | def reset_noise(self): 140 | self.fc1.reset_noise() 141 | self.fc2.reset_noise() 142 | self.fc3.reset_noise() 143 | 144 | def get_reg(self): 145 | reg = self.fc1.kldiv() 146 | reg += self.fc2.kldiv() 147 | reg += self.fc3.kldiv() 148 | return reg 149 | -------------------------------------------------------------------------------- /qlearn/toys/noisy_agent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import random 11 | import numpy as np 12 | import torch 13 | import torch.nn.functional as F 14 | from torch import optim 15 | from torch.autograd import Variable 16 | from qlearn.toys.model import NoisyDQN 17 | 18 | class NoisyAgent(): 19 | def __init__(self, args, env): 20 | self.action_space = env.action_space.n 21 | self.batch_size = args.batch_size 22 | self.discount = args.discount 23 | self.double_q = args.double_q 24 | 25 | self.online_net = NoisyDQN(args, self.action_space) 26 | if args.model and os.path.isfile(args.model): 27 | self.online_net.load_state_dict(torch.load(args.model)) 28 | self.online_net.train() 29 | 30 | self.target_net = NoisyDQN(args, self.action_space) 31 | self.update_target_net() 32 | self.target_net.train() 33 | for param in self.target_net.parameters(): 34 | param.requires_grad = False 35 | 36 | self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps) 37 | if args.cuda: 38 | self.online_net.cuda() 39 | self.target_net.cuda() 40 | self.FloatTensor = torch.cuda.FloatTensor if args.cuda else torch.FloatTensor 41 | self.LongTensor = torch.cuda.LongTensor if args.cuda else torch.LongTensor 42 | self.ByteTensor = torch.cuda.ByteTensor if args.cuda else torch.ByteTensor 43 | 44 | # Acts based on single state (no batch) 45 | def act(self, state, eval=False): 46 | if eval: 47 | self.online_net.eval() 48 | else: 49 | self.online_net.train() 50 | self.online_net.reset_noise() 51 | state = Variable(self.FloatTensor(state)) 52 | return self.online_net(state).data.max(1)[1][0] 53 | 54 | def update_target_net(self): 55 | self.target_net.load_state_dict(self.online_net.state_dict()) 56 | 57 | def learn(self, states, actions, rewards, next_states, terminals): 58 | self.online_net.train() 59 | self.target_net.train() 60 | self.online_net.reset_noise() 61 | 
self.target_net.reset_noise() 62 | states = Variable(self.FloatTensor(states)) 63 | actions = Variable(self.LongTensor(actions)) 64 | next_states = Variable(self.FloatTensor(next_states)) 65 | rewards = Variable(self.FloatTensor(rewards)).view(-1, 1) 66 | terminals = Variable(self.FloatTensor(terminals)).view(-1, 1) 67 | 68 | # Compute Q(s_t, a) - the model computes Q(s_t), then we select the 69 | # columns of actions taken 70 | 71 | state_action_values = self.online_net(states).gather(1, actions.view(-1, 1)) 72 | 73 | if self.double_q: 74 | next_actions = self.online_net(next_states).max(1)[1] 75 | next_state_values = self.target_net(next_states).gather(1, next_actions.view(-1, 1)) 76 | else: 77 | next_state_values = self.target_net(next_states).max(1)[0].view(-1, 1) 78 | 79 | target_state_action_values = rewards + (1 - terminals) * self.discount * next_state_values 80 | # import pdb 81 | # pdb.set_trace() 82 | loss = F.smooth_l1_loss(state_action_values, target_state_action_values.detach()) 83 | 84 | # Optimize the model 85 | self.optimiser.zero_grad() 86 | loss.backward() 87 | for param in self.online_net.parameters(): 88 | param.grad.data.clamp_(-1, 1) 89 | self.optimiser.step() 90 | 91 | return loss 92 | -------------------------------------------------------------------------------- /qlearn/toys/test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import torch 11 | from torch.autograd import Variable 12 | 13 | 14 | # Test DQN 15 | def test(args, env, dqn): 16 | rewards = [] 17 | 18 | # Test performance over several episodes 19 | done = True 20 | dqn.online_net.eval() 21 | # dqn.online_net.freeze_noise() 22 | for _ in range(args.evaluation_episodes): 23 | while True: 24 | if done: 25 | state, reward_sum, done = env.reset(), 0, False 26 | if args.agent == 'VariationalDQN': 27 | action = dqn.act(state[None], sample=False) 28 | elif args.agent in ['NoisyDQN', 'BayesBackpropDQN', 'MNFDQN']: 29 | action = dqn.act(state[None], eval=True) 30 | elif args.agent == 'DQN': 31 | action = dqn.act(state[None]) 32 | elif args.agent == 'BootstrappedDQN': 33 | action = dqn.act(state[None]) 34 | # Choose an action greedily 35 | state, reward, done, _ = env.step(int(action)) # Step 36 | reward_sum += reward 37 | 38 | if done: 39 | rewards.append(reward_sum) 40 | break 41 | env.close() 42 | 43 | # return average reward 44 | return sum(rewards) / len(rewards) 45 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from setuptools import setup, find_packages 9 | setup(name='qlearn', 10 | packages=[package for package in find_packages() 11 | if package.startswith('qlearn')], 12 | version='0.1') 13 | --------------------------------------------------------------------------------
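The toy agents above are exercised by the training loop in qlearn/toys/main_nchain.py. As a quick orientation, here is a minimal sketch of that loop for the plain epsilon-greedy DQN agent on NChain. It assumes the package has been installed from the repository root (for example with pip install -e .) and that the baselines package is available for its replay buffer; hyperparameters follow the argparse defaults in main_nchain.py, except that the annealed epsilon schedule is replaced by a fixed epsilon. It is a sketch for orientation, not a definitive recipe.

# Minimal NChain training sketch mirroring qlearn/toys/main_nchain.py
# (assumptions: fixed epsilon of 0.1 instead of LinearSchedule, CPU only,
# argparse defaults otherwise).
from argparse import Namespace

from baselines.deepq.replay_buffer import ReplayBuffer

from qlearn.envs.nchain import NChainEnv
from qlearn.toys.agent import Agent

args = Namespace(input_dim=8, batch_size=32, discount=0.999, double_q=1,
                 lr=0.001, adam_eps=1.5e-4, cuda=0, model=None)
env = NChainEnv(args.input_dim)
dqn = Agent(args, env)
replay_buffer = ReplayBuffer(10000)

timestep = 0
for episode in range(200):
    state, done = env.reset(), False
    while not done:
        timestep += 1
        # Epsilon-greedy action from the online network (single state, no batch).
        action = int(dqn.act_e_greedy(state[None], epsilon=0.1))
        next_state, reward, done, _ = env.step(action)
        replay_buffer.add(state, action, reward, next_state, float(done))
        state = next_state
        if timestep % 100 == 0:   # target-update-freq default
            dqn.update_target_net()
        if timestep > 32:         # learning-starts default
            dqn.learn(*replay_buffer.sample(args.batch_size))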