├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .snyk ├── Gruntfile.js ├── LICENSE ├── README.md ├── bin ├── copy-config ├── setup ├── setup_macOS └── setup_ubuntu ├── circle.yml ├── config ├── .theanorc ├── example-default.json └── keras.json ├── data └── .gitkeep ├── environment.yml ├── main.py ├── package.json ├── requirements.txt ├── rl ├── __init__.py ├── agent │ ├── __init__.py │ ├── actor_critic.py │ ├── base_agent.py │ ├── conv_dqn.py │ ├── ddpg.py │ ├── deep_exp_sarsa.py │ ├── deep_sarsa.py │ ├── double_conv_dqn.py │ ├── double_dqn.py │ ├── dqn.py │ ├── freeze_dqn.py │ ├── offpol_sarsa.py │ └── q_table.py ├── analytics.py ├── experiment.py ├── hyperoptimizer │ ├── __init__.py │ ├── base_hyperoptimizer.py │ ├── grid_search.py │ ├── line_search.py │ └── random_search.py ├── memory │ ├── __init__.py │ ├── base_memory.py │ ├── linear.py │ ├── prioritized_exp_replay.py │ └── ranked.py ├── model │ └── .gitkeep ├── optimizer │ ├── __init__.py │ ├── adam.py │ ├── base_optimizer.py │ ├── rmsprop.py │ └── sgd.py ├── policy │ ├── __init__.py │ ├── actor_critic.py │ ├── base_policy.py │ ├── boltzmann.py │ ├── epsilon_greedy.py │ └── noise.py ├── preprocessor │ ├── __init__.py │ ├── atari.py │ ├── base_preprocessor.py │ └── linear.py ├── spec │ ├── atari_experiment_specs.json │ ├── box2d_experiment_specs.json │ ├── classic_experiment_specs.json │ ├── component_locks.json │ ├── dev_experiment_specs.json │ ├── problems.json │ └── pygame_experiment_specs.json └── util.py ├── setup.py └── test ├── __init__.py ├── conftest.py ├── test_atari.py ├── test_box2d.py ├── test_classic.py └── test_dev.py /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | _If you're contributing new algorithms and its solutions to OpenAI Lab, please follow below. Otherwise, clear below and concisely describe your commits._ 2 | 3 | ### Solution Submission 4 | 5 | Once accepted, we will add the following to the OpenAI Lab [Best Solutions](http://kengz.me/openai_lab/#problems) and the code. 
6 | 7 | - name the Pull Request title like `Solution: CartPole-v0 with DQN` 8 | - add PR label `solution` 9 | 10 | Then, submit the following: 11 | 12 | - [ ] problem: CartPole-v0 13 | - [ ] algorithm (commit code if new): DQN 14 | - [ ] best `fitness_score`: _the highest_ 15 | - [ ] author: _your name_ 16 | - [ ] commit `experiment_spec`: dqn 17 | - _attach (not commit)_ the experiment files: 18 | - [ ] `_analysis_data.csv` (zip) 19 | - [ ] `.json` (zip) 20 | - [ ] `.png` 21 | - [ ] `_analysis.png` 22 | - [ ] `_analysis_correlation.png` 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | coverage 6 | InstalledFiles 7 | lib/bundler/man 8 | pkg 9 | rdoc 10 | spec/reports 11 | test/tmp 12 | test/version_tmp 13 | tmp 14 | *.DS_STORE 15 | build/ 16 | .cache 17 | .vagrant 18 | .sass-cache 19 | 20 | # YARD artifacts 21 | .yardoc 22 | _yardoc 23 | doc/ 24 | .idea/ 25 | 26 | # Python ignores 27 | __pycache__/ 28 | *.py[cod] 29 | *$py.class 30 | *.egg* 31 | *.manifest 32 | .cache/ 33 | htmlcov/ 34 | .coverage 35 | venv/ 36 | ENV/ 37 | .env/ 38 | openai_lab/ 39 | src/ 40 | 41 | node_modules/ 42 | 43 | .DS_Store 44 | 45 | *checkpoint* 46 | *.tfl* 47 | *.meta 48 | data/ 49 | model/ 50 | *.png 51 | *.txt 52 | *.json 53 | *.csv 54 | *.log 55 | *.h5 56 | *.xml 57 | -------------------------------------------------------------------------------- /.snyk: -------------------------------------------------------------------------------- 1 | # Snyk (https://snyk.io) policy file, patches or ignores known vulnerabilities. 2 | version: v1.7.1 3 | ignore: {} 4 | # patches apply the minimum changes required to fix a vulnerability 5 | patch: 6 | 'npm:debug:20170905': 7 | - grunt-contrib-watch > tiny-lr > debug: 8 | patched: '2017-09-30T01:17:03.873Z' 9 | - grunt-contrib-watch > tiny-lr > body-parser > debug: 10 | patched: '2017-09-30T01:17:03.873Z' 11 | -------------------------------------------------------------------------------- /Gruntfile.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash') 2 | const fs = require('fs') 3 | const resolve = require('resolve-dir') 4 | 5 | 6 | // generic experimentId matcher. index 2: experimentId, 3 or 4: experimentName 7 | const expIdRegex = /(\-e\s+)?(([a-zA-Z0-9_]+)\-\d{4}_\d{2}_\d{2}_\d{6}|([a-zA-Z0-9_]+))/ 8 | const historyPath = './config/history.json' 9 | const finishMsg = ` 10 | =========================================== 11 | Experiments complete. Press Ctrl+C to exit. 12 | =========================================== 13 | ` 14 | 15 | 16 | module.exports = function(grunt) { 17 | process.env.NODE_ENV = grunt.option('prod') ? 
'production' : 'development' 18 | 19 | const config = require('config') 20 | const dataSrc = 'data' 21 | const dataDest = resolve(config.data_sync_destination) 22 | const experiments = config.experiments 23 | const experimentTasks = _.map(experiments, function(name) { 24 | return `shell:experiment:${name}` 25 | }) 26 | 27 | function writeHistory(history) { 28 | grunt.log.ok(`Writing updated lab history ${JSON.stringify(history, null, 2)}`) 29 | fs.writeFileSync(historyPath, JSON.stringify(history, null, 2)) 30 | return history 31 | } 32 | 33 | function readHistory() { 34 | if (grunt.option('resume')) { 35 | try { 36 | return JSON.parse(fs.readFileSync(historyPath, 'utf8')) 37 | } catch (err) { 38 | grunt.log.ok(`No existing ${historyPath} to resume, creating new`) 39 | return writeHistory({}) 40 | } 41 | } else { 42 | return {} 43 | } 44 | } 45 | 46 | let history = readHistory() 47 | 48 | function getExpId(filepath) { 49 | if (!fs.lstatSync(filepath).isFile()) { 50 | // write history on folder being created 51 | return filepath 52 | } else if (_.endsWith(filepath, '.json')) { 53 | // write history on json written (fallback guard) 54 | let expIdPath = _.join(_.initial(filepath.split('_')), '_') 55 | return expIdPath.split('/').pop() 56 | } else { 57 | return false 58 | } 59 | } 60 | 61 | function updateHistory(filepath) { 62 | let expId = getExpId(filepath) 63 | if (!expId) { 64 | return 65 | } 66 | const matchedPath = expId.split('/').pop().match(expIdRegex) 67 | if (matchedPath) { 68 | const experimentId = matchedPath[2] 69 | const experimentName = matchedPath[3] || matchedPath[4] 70 | history[experimentName] = experimentId 71 | writeHistory(history) 72 | } 73 | } 74 | 75 | function remoteCmd() { 76 | return grunt.option('remote') ? 'xvfb-run -a -s "-screen 0 1400x900x24" --' : '' 77 | } 78 | 79 | function bestCmd() { 80 | return grunt.option('best') ? '' : ' -bp' 81 | } 82 | 83 | function debugCmd() { 84 | return grunt.option('debug') ? ' -d' : '' 85 | } 86 | 87 | function quietCmd() { 88 | return grunt.option('quiet') ? ' -q' : '' 89 | } 90 | 91 | function notiCmd(experiment) { 92 | return (grunt.option('prod') && !grunt.option('analyze')) ? `NOTI_SLACK_DEST='${config.NOTI_SLACK_DEST}' NOTI_SLACK_TOK='${config.NOTI_SLACK_TOK}' noti -k -t 'Experiment completed' -m '[${new Date().toISOString()}] ${experiment} on ${process.env.USER}'` : '' 93 | } 94 | 95 | function resumeExperimentStr(eStr) { 96 | const matchedExp = eStr.match(expIdRegex) 97 | if (matchedExp) { 98 | const experimentIdOrName = matchedExp[2] 99 | const experimentName = matchedExp[3] || matchedExp[4] 100 | if (history[experimentName]) { 101 | return eStr.replace(experimentIdOrName, history[experimentName]) 102 | } 103 | } 104 | return eStr 105 | } 106 | 107 | function composeCommand(experimentStr) { 108 | var eStr = experimentStr 109 | if (grunt.option('resume') || grunt.option('analyze')) { 110 | eStr = resumeExperimentStr(eStr) 111 | } 112 | 113 | const envCmd = 'if (conda env list | grep --quiet "openai_lab"); then echo "activating conda"; source activate openai_lab; elif [ -d ./.env ]; then echo "activating virtualenv"; source .env/bin/activate; else echo "using system python"; fi; ' 114 | 115 | // override with custom command if has 'python' 116 | const pyCmd = _.includes(eStr, 'python') ? 
eStr : `python3 main.py${bestCmd()}${debugCmd()}${quietCmd()} -t 5 -e ${eStr}` 117 | const cmd = `${envCmd}${remoteCmd()} ${pyCmd} | tee ./data/terminal.log; ${notiCmd(eStr)}` 118 | grunt.log.ok(`Composed command: ${cmd}`) 119 | return cmd 120 | } 121 | 122 | 123 | require('load-grunt-tasks')(grunt) 124 | 125 | grunt.initConfig({ 126 | sync: { 127 | main: { 128 | files: [{ 129 | cwd: dataSrc, 130 | src: ['**'], 131 | dest: dataDest, 132 | }], 133 | pretend: !grunt.option('prod'), // Don't do real IO 134 | } 135 | }, 136 | 137 | watch: { 138 | data: { 139 | files: `${dataSrc}/**`, 140 | tasks: ['sync'], 141 | options: { 142 | debounceDelay: 20 * 60 * 1000, 143 | interval: 60000, 144 | }, 145 | } 146 | }, 147 | 148 | shell: { 149 | options: { 150 | execOptions: { 151 | killSignal: 'SIGINT', 152 | env: process.env 153 | } 154 | }, 155 | experiment: { 156 | command(experimentStr) { 157 | return composeCommand(experimentStr) 158 | }, 159 | options: { 160 | stdout: true 161 | } 162 | }, 163 | finish: `echo "${finishMsg}"`, 164 | clear: 'rm -rf .cache __pycache__ */__pycache__ *egg-info htmlcov .coverage* *.xml data/**/ data/*.log config/history.json', 165 | }, 166 | 167 | concurrent: { 168 | default: ['watch', ['lab', 'shell:finish']], 169 | options: { 170 | logConcurrentOutput: true 171 | } 172 | }, 173 | }) 174 | 175 | grunt.event.on('watch', function(action, filepath, target) { 176 | updateHistory(filepath) 177 | }) 178 | 179 | grunt.registerTask('lab', 'run all the experiments', experimentTasks) 180 | grunt.registerTask('lab_sync', 'run lab with auto file syncing', ['concurrent:default']) 181 | grunt.registerTask('default', ['lab_sync']) 182 | 183 | grunt.registerTask('analyze', function() { 184 | grunt.option('analyze', true) 185 | grunt.option('resume', true) 186 | grunt.task.run('default') 187 | }) 188 | grunt.registerTask('clear', ['shell:clear']) 189 | } 190 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Wah Loon Keng, Laura Graesser 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenAI Lab [![GitHub release](https://img.shields.io/github/release/kengz/openai_lab.svg)](https://github.com/kengz/openai_lab) [![CircleCI](https://circleci.com/gh/kengz/openai_lab.svg?style=shield)](https://circleci.com/gh/kengz/openai_lab) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/9e55f845b10b4b51b213620bfb98e4b3)](https://www.codacy.com/app/kengzwl/openai_lab?utm_source=github.com&utm_medium=referral&utm_content=kengz/openai_lab&utm_campaign=Badge_Grade) [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/9e55f845b10b4b51b213620bfb98e4b3)](https://www.codacy.com/app/kengzwl/openai_lab?utm_source=github.com&utm_medium=referral&utm_content=kengz/openai_lab&utm_campaign=Badge_Coverage) [![GitHub stars](https://img.shields.io/github/stars/kengz/openai_lab.svg?style=social&label=Star)](https://github.com/kengz/openai_lab) [![GitHub forks](https://img.shields.io/github/forks/kengz/openai_lab.svg?style=social&label=Fork)](https://github.com/kengz/openai_lab) 2 | 3 | --- 4 | 5 |

**NOTICE: Please use the next version, [SLM-Lab](https://github.com/kengz/SLM-Lab).**

6 | 7 | --- 8 | 9 |

[OpenAI Lab Documentation](http://kengz.me/openai_lab/)

10 | 11 | --- 12 | 13 | _An experimentation framework for Reinforcement Learning using OpenAI Gym, Tensorflow, and Keras._ 14 | 15 | _OpenAI Lab_ is created to do Reinforcement Learning (RL) like science - _theorize, experiment_. It provides an easy interface to [OpenAI Gym](https://gym.openai.com/) and [Keras](https://keras.io/), with an automated experimentation and evaluation framework. 16 | 17 | ### Features 18 | 19 | 1. **Unified RL environment and agent interface** using OpenAI Gym, Tensorflow, Keras, so you can focus on developing the algorithms. 20 | 2. **[Core RL algorithms implementations](http://kengz.me/openai_lab/#agents-matrix), with reusable modular components** for developing deep RL algorithms. 21 | 3. **[An experimentation framework](http://kengz.me/openai_lab/#experiments)** for running hundreds of trials of hyperparameter optimizations, with logs, plots and analytics for testing new RL algorithms. Experimental settings are stored in standardized JSONs for reproducibility and comparisons. 22 | 4. **[Automated analytics of the experiments](http://kengz.me/openai_lab/#analysis)** for evaluating the RL agents and environments, and to help pick the best solution. 23 | 5. **The [Fitness Matrix](http://kengz.me/openai_lab/#fitness-matrix)**, a table of the best scores of RL algorithms v.s. the environments; useful for research. 24 | 25 | 26 | With OpenAI Lab, we could focus on researching the essential elements of reinforcement learning such as the algorithm, policy, memory, and parameter tuning. It allows us to build agents efficiently using existing components with the implementations from research ideas. We could then test the research hypotheses systematically by running experiments. 27 | 28 | *Read more about the research problems the Lab addresses in [Motivations](http://kengz.me/openai_lab/#motivations). Ultimately, the Lab is a generalized framework for doing reinforcement learning, agnostic of OpenAI Gym and Keras. E.g. Pytorch-based implementations are on the roadmap.* 29 | 30 | 31 | ### Implemented Algorithms 32 | 33 | A list of the core RL algorithms implemented/planned. 34 | 35 | To see their scores against OpenAI gym environments, go to **[Fitness Matrix](http://kengz.me/openai_lab/#fitness-matrix)**. 
36 | 37 | 38 | |algorithm|implementation|eval score (pending)| 39 | |:---|:---|:---| 40 | |[DQN](https://arxiv.org/abs/1312.5602)|[DQN](https://github.com/kengz/openai_lab/blob/master/rl/agent/dqn.py)|-| 41 | |[Double DQN](https://arxiv.org/abs/1509.06461)|[DoubleDQN](https://github.com/kengz/openai_lab/blob/master/rl/agent/double_dqn.py)|-| 42 | |[Dueling DQN](https://arxiv.org/abs/1511.06581)|-|-| 43 | |Sarsa|[DeepSarsa](https://github.com/kengz/openai_lab/blob/master/rl/agent/deep_sarsa.py)|-| 44 | |Off-Policy Sarsa|[OffPolicySarsa](https://github.com/kengz/openai_lab/blob/master/rl/agent/offpol_sarsa.py)|-| 45 | |[PER (Prioritized Experience Replay)](https://arxiv.org/abs/1511.05952)|[PrioritizedExperienceReplay](https://github.com/kengz/openai_lab/blob/master/rl/memory/prioritized_exp_replay.py)|-| 46 | |[CEM (Cross Entropy Method)](https://en.wikipedia.org/wiki/Cross-entropy_method)|next|-| 47 | |[REINFORCE](http://incompleteideas.net/sutton/williams-92.pdf)|-|-| 48 | |[DPG (Deterministic Policy Gradient) off-policy actor-critic](http://jmlr.org/proceedings/papers/v32/silver14.pdf)|[ActorCritic](https://github.com/kengz/openai_lab/blob/master/rl/agent/actor_critic.py)|-| 49 | |[DDPG (Deep-DPG) actor-critic with target networks](https://arxiv.org/abs/1509.02971)|[DDPG](https://github.com/kengz/openai_lab/blob/master/rl/agent/ddpg.py)|-| 50 | |[A3C (asynchronous advantage actor-critic)](https://arxiv.org/pdf/1602.01783.pdf)|-|-| 51 | |Dyna|next|-| 52 | |[TRPO](https://arxiv.org/abs/1502.05477)|-|-| 53 | |Q*(lambda)|-|-| 54 | |Retrace(lambda)|-|-| 55 | |[Neural Episodic Control (NEC)](https://arxiv.org/abs/1703.01988)|-|-| 56 | |[EWC (Elastic Weight Consolidation)](https://arxiv.org/abs/1612.00796)|-|-| 57 | 58 | 59 | ### Run the Lab 60 | 61 | Next, see [Installation](http://kengz.me/openai_lab/#installation) and jump to [Quickstart](http://kengz.me/openai_lab/#quickstart). 62 | 63 | 64 |
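As a quick orientation, a first run looks roughly like the sketch below. Treat it as a sketch rather than a reference: the flags simply mirror the command that `Gruntfile.js` composes, their exact semantics come from the argument parser in `rl/util.py` (not included in this listing), and `quickstart_dqn` is the experiment named in the shipped `config/example-default.json`.

```bash
# one-time setup: copies config/example-default.json to config/default.json
# and installs the system and python dependencies (see bin/setup, bin/copy-config)
./bin/setup

# run every experiment listed under "experiments" in config/default.json;
# this is what `npm start` invokes
grunt

# or run a single experiment directly, mirroring the command Gruntfile.js builds
python3 main.py -bp -t 5 -e quickstart_dqn
```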
65 | 66 | *Timelapse of OpenAI Lab, solving CartPole-v0.* 67 | -------------------------------------------------------------------------------- /bin/copy-config: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # copy keys file if not already exist 4 | CONFIG_DIR=`pwd`/config 5 | EXAMPLE_CONFIG="$CONFIG_DIR/example-default.json" 6 | DEV_CONFIG="$CONFIG_DIR/default.json" 7 | PROD_CONFIG="$CONFIG_DIR/production.json" 8 | 9 | if [ ! -e "$DEV_CONFIG" ]; then 10 | cp $EXAMPLE_CONFIG $DEV_CONFIG 11 | echo "[ --- Created $DEV_CONFIG --- ]" 12 | fi 13 | 14 | if [ ! -e "$PROD_CONFIG" ]; then 15 | cp $EXAMPLE_CONFIG $PROD_CONFIG 16 | echo "[ --- Created $PROD_CONFIG --- ]" 17 | fi 18 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script runs the same sequence as the CircleCI build 3 | # Run this as: 4 | # bin/setup 5 | 6 | 7 | # Fail on the first error; killable by SIGINT 8 | set -e 9 | trap "exit" INT 10 | 11 | 12 | read -p " 13 | ================================================ 14 | 15 | Welcome to the OpenAI Lab setup script; 16 | This will invoke sudo; alternatively, 17 | inspect bin/setup_ubuntu or bin/setup_macOS and run the lines manually. 18 | 19 | Press enter to continue, Ctrl+c to quit: 20 | 21 | ================================================ 22 | " 23 | 24 | # copy keys file if not already exist 25 | BIN_DIR=`pwd`/bin 26 | $BIN_DIR/copy-config 27 | 28 | # determine if is Mac OSX, or Linux; then run accordingly 29 | if [ $(uname) == "Darwin" ]; 30 | # Mac runs below 31 | then ( 32 | $BIN_DIR/setup_macOS; 33 | ); 34 | else ( 35 | $BIN_DIR/setup_ubuntu; 36 | ); 37 | fi 38 | 39 | 40 | echo " 41 | ================================================ 42 | 43 | Setup done. 44 | Running basic installation checks. 45 | 46 | ================================================ 47 | " 48 | 49 | # post-installation checks 50 | python3 -c "import tensorflow; print('tensorflow version:'); print(tensorflow.__version__)" 51 | python3 -c "import gym; gym.make('LunarLander-v2')" 52 | python3 -c "import gym; gym.make('SpaceInvaders-v0')" 53 | 54 | 55 | echo " 56 | ================================================ 57 | 58 | Installation complete. 
59 | 60 | ================================================ 61 | " -------------------------------------------------------------------------------- /bin/setup_macOS: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up OpenAI Lab for macOS 3 | 4 | # Fail on the first error; killable by SIGINT 5 | set -e 6 | trap "exit" INT 7 | 8 | # install system dependencies 9 | if which brew >/dev/null; then 10 | echo "Brew is already installed" 11 | else 12 | ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 13 | fi 14 | 15 | # system dependencies for full openai gym 16 | hb_list=(cmake boost boost-python sdl2 swig wget) 17 | for item in "${hb_list[@]}"; do 18 | brew info "${item}" | grep --quiet 'Not installed' && brew install "${item}" 19 | done 20 | 21 | # install noti for auto-notification 22 | if which noti >/dev/null; then 23 | echo "Noti is already installed" 24 | else 25 | curl -L https://github.com/variadico/noti/releases/download/v2.5.0/noti2.5.0.darwin-amd64.tar.gz | tar -xz 26 | sudo mv noti /usr/local/bin/ 27 | fi 28 | 29 | # install nodejs (for npm and file watcher) 30 | if which node >/dev/null; then 31 | echo "Nodejs is already installed" 32 | else 33 | brew install node 34 | fi 35 | # install npm modules 36 | if [ -d ./node_modules ]; then 37 | echo "Npm modules already installed" 38 | else 39 | npm install; sudo npm i -g grunt-cli 40 | fi 41 | 42 | # install python3 43 | if which python3 >/dev/null; then 44 | echo "Python3 is already installed" 45 | else 46 | brew install python3 47 | fi 48 | 49 | # install python dependencies 50 | sudo python3 -m pip install -r requirements.txt 51 | -------------------------------------------------------------------------------- /bin/setup_ubuntu: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up OpenAI Lab for Linux Ubuntu 3 | 4 | # Fail on the first error; killable by SIGINT 5 | set -e 6 | trap "exit" INT 7 | 8 | # install system dependencies 9 | sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test && sudo apt-get update 10 | sudo apt-get install -y gcc-4.9 g++-4.9 libhdf5-dev libopenblas-dev git 11 | 12 | # system dependencies for full openai gym 13 | sudo apt-get install -y cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 14 | 15 | # install noti for auto-notification 16 | if which noti >/dev/null; then 17 | echo "Noti is already installed" 18 | else 19 | curl -L https://github.com/variadico/noti/releases/download/v2.5.0/noti2.5.0.linux-amd64.tar.gz | tar -xz 20 | sudo mv noti /usr/bin/ 21 | fi 22 | 23 | # install nodejs (for npm and file watcher) 24 | if which node >/dev/null; then 25 | echo "Nodejs is already installed" 26 | else 27 | curl -sL https://deb.nodesource.com/setup_7.x | sudo -E bash - 28 | sudo apt-get install -y nodejs 29 | fi 30 | # install npm modules 31 | if [ -d ./node_modules ]; then 32 | echo "Npm modules already installed" 33 | else 34 | npm install; sudo npm i -g grunt-cli 35 | fi 36 | 37 | # install python3 38 | if which python3 >/dev/null; then 39 | echo "Python3 is already installed" 40 | else 41 | sudo apt-get -y install python3-dev python3-pip python3-setuptools 42 | fi 43 | 44 | # install python dependencies 45 | sudo python3 -m pip install -r requirements.txt 46 | -------------------------------------------------------------------------------- /circle.yml: 
-------------------------------------------------------------------------------- 1 | machine: 2 | python: 3 | version: 3.5.2 4 | 5 | dependencies: 6 | pre: 7 | - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test && sudo apt-get update 8 | - sudo apt-get install -y gcc-4.9 g++-4.9 libhdf5-dev libopenblas-dev git python3-tk tk-dev python3-dev python3-setuptools 9 | - sudo apt-get install -y cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 10 | - pip install -U pip 11 | override: 12 | - pip install -r requirements.txt 13 | - mkdir ~/.keras && cp ./config/keras.json ~/.keras/ 14 | test: 15 | override: 16 | - xvfb-run -a -s "-screen 0 1400x900x24" -- python setup.py test 17 | - coverage xml && python-codacy-coverage -r coverage.xml 18 | 19 | general: 20 | branches: 21 | ignore: 22 | - doc 23 | - gh-pages 24 | -------------------------------------------------------------------------------- /config/.theanorc: -------------------------------------------------------------------------------- 1 | [global] 2 | floatX = float32 3 | device = gpu0 4 | 5 | [lib] 6 | cnmem = 0.2 7 | -------------------------------------------------------------------------------- /config/example-default.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_sync_destination": "~/Dropbox/openai_lab/data", 3 | "NOTI_SLACK_DEST": "#rl-monitor", 4 | "NOTI_SLACK_TOK": "GET_SLACK_BOT_TOKEN_FROM_https://my.slack.com/services/new/bot", 5 | "experiments": [ 6 | "quickstart_dqn" 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /config/keras.json: -------------------------------------------------------------------------------- 1 | { 2 | "epsilon": 1e-07, 3 | "image_dim_ordering": "tf", 4 | "floatx": "float32", 5 | "backend": "tensorflow" 6 | } 7 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kengz/openai_lab/d0669d89268f2dc01c1cf878e4879775c7b6eb3c/data/.gitkeep -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: openai_lab 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python>=3.5 6 | - anaconda 7 | - six 8 | - h5py 9 | - matplotlib==1.4.3 10 | - seaborn>=0.7.1 11 | - Pillow>=3.3.1 12 | - PyOpenGL>=3.1.0 13 | - glances>=2.6.2 14 | - pytest-cov>=2.3.1 15 | - pytest-xdist>=1.15.0 16 | - pip: 17 | - codacy-coverage>=1.3.3 18 | - mem_top==0.1.5 19 | - atari_py>=0.0.18 20 | - cmake==0.6.0 21 | - tensorflow>=1.0.0 22 | - Keras>=1.2.2,<2.0.0 23 | - "--editable=git+https://github.com/openai/gym.git#egg=gym[all]" 24 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from rl.experiment import run 2 | from rl.util import args 3 | 4 | if __name__ == '__main__': 5 | run(args.experiment, **vars(args)) 6 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "openai_lab", 3 | "version": "1.0.0", 4 | "description": "An experimentation system for Reinforcement Learning using OpenAI and Keras", 5 | "main": "index.js", 6 | 
"directories": { 7 | "test": "test" 8 | }, 9 | "scripts": { 10 | "start": "grunt", 11 | "test": "python3 setup.py test", 12 | "posttest": "rm -rf .cache __pycache__ */__pycache__ *egg-info htmlcov", 13 | "snyk-protect": "snyk protect", 14 | "prepublish": "npm run snyk-protect" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/kengz/openai_lab.git" 19 | }, 20 | "keywords": [ 21 | "openai", 22 | "gym", 23 | "lab", 24 | "reinforcement", 25 | "learning" 26 | ], 27 | "author": "keng, laura", 28 | "license": "MIT", 29 | "bugs": { 30 | "url": "https://github.com/kengz/openai_lab/issues" 31 | }, 32 | "homepage": "https://github.com/kengz/openai_lab#readme", 33 | "dependencies": { 34 | "config": "^1.25.1", 35 | "grunt": "^1.0.1", 36 | "grunt-concurrent": "^2.3.1", 37 | "grunt-contrib-watch": "^1.0.0", 38 | "grunt-shell": "^2.1.0", 39 | "grunt-sync": "^0.6.2", 40 | "load-grunt-tasks": "^3.5.2", 41 | "lodash": "^4.17.4", 42 | "resolve-dir": "^1.0.0", 43 | "snyk": "^1.41.1" 44 | }, 45 | "snyk": true 46 | } 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six 2 | h5py 3 | numpy>=1.12 4 | scipy>=0.18 5 | matplotlib==1.4.3 6 | seaborn>=0.7.1 7 | pandas>=0.18.1 8 | atari_py>=0.0.18 9 | Pillow>=3.3.1 10 | PyOpenGL>=3.1.0 11 | glances>=2.6.2 12 | mem_top==0.1.5 13 | pytest-cov>=2.3.1 14 | pytest-xdist>=1.15.0 15 | codacy-coverage>=1.3.3 16 | tensorflow>=1.0.0 17 | Keras>=1.2.2,<2.0.0 18 | -e git+https://github.com/openai/gym.git#egg=gym[all] 19 | -------------------------------------------------------------------------------- /rl/__init__.py: -------------------------------------------------------------------------------- 1 | # curse of python pathing, hack to solve rel import 2 | import glob 3 | import sys 4 | from os import path 5 | from os.path import dirname, basename, isfile 6 | file_path = path.normpath(path.join(path.dirname(__file__))) 7 | sys.path.insert(0, file_path) 8 | 9 | # another py curse, expose to prevent 'agent.' call 10 | pattern = "/*.py" 11 | modules = glob.glob(dirname(__file__) + pattern) 12 | __all__ = [basename(f)[:-3] for f in modules if isfile(f)] 13 | -------------------------------------------------------------------------------- /rl/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/agent/actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | from rl.util import logger 4 | 5 | 6 | class ActorCritic(DQN): 7 | 8 | ''' 9 | Actor Critic algorithm. 
The actor's policy 10 | is adjusted in the direction that will lead to 11 | better actions, guided by the critic 12 | Implementation adapted from 13 | http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html 14 | 15 | Assumes one of the policies in actor_critic.py are being used 16 | ''' 17 | 18 | def __init__(self, env_spec, 19 | train_per_n_new_exp=1, 20 | gamma=0.95, lr=0.1, 21 | epi_change_lr=None, 22 | batch_size=16, n_epoch=5, hidden_layers=None, 23 | hidden_layers_activation='sigmoid', 24 | output_layer_activation='linear', 25 | auto_architecture=False, 26 | num_hidden_layers=3, 27 | first_hidden_layer_size=256, 28 | num_initial_channels=16, 29 | **kwargs): # absorb generic param without breaking 30 | # import only when needed to contain side-effects 31 | from keras.layers.core import Dense 32 | from keras.models import Sequential, load_model 33 | self.Dense = Dense 34 | self.Sequential = Sequential 35 | self.load_model = load_model 36 | 37 | super(ActorCritic, self).__init__(env_spec, 38 | train_per_n_new_exp, 39 | gamma, lr, 40 | epi_change_lr, 41 | batch_size, n_epoch, hidden_layers, 42 | hidden_layers_activation, 43 | output_layer_activation, 44 | auto_architecture, 45 | num_hidden_layers, 46 | first_hidden_layer_size, 47 | num_initial_channels, 48 | **kwargs) 49 | 50 | def build_model(self): 51 | self.build_actor() 52 | self.build_critic() 53 | logger.info("Actor and critic models built") 54 | 55 | def build_actor(self): 56 | actor = self.Sequential() 57 | super(ActorCritic, self).build_hidden_layers(actor) 58 | actor.add(self.Dense(self.env_spec['action_dim'], 59 | init='lecun_uniform', 60 | activation=self.output_layer_activation)) 61 | logger.info("Actor summary") 62 | actor.summary() 63 | self.actor = actor 64 | 65 | def build_critic(self): 66 | critic = self.Sequential() 67 | super(ActorCritic, self).build_hidden_layers(critic) 68 | critic.add(self.Dense(1, 69 | init='lecun_uniform', 70 | activation=self.output_layer_activation)) 71 | logger.info("Critic summary") 72 | critic.summary() 73 | self.critic = critic 74 | 75 | def compile_model(self): 76 | self.actor.compile( 77 | loss='mse', 78 | optimizer=self.optimizer.keras_optimizer) 79 | self.critic.compile( 80 | loss='mse', 81 | optimizer=self.optimizer.keras_optimizer) 82 | logger.info("Actor and critic compiled") 83 | 84 | def recompile_model(self, sys_vars): 85 | ''' 86 | Option to change model optimizer settings 87 | Currently only used for changing the learning rate 88 | Compiling does not affect the model weights 89 | ''' 90 | if self.epi_change_lr is not None: 91 | if (sys_vars['epi'] == self.epi_change_lr and 92 | sys_vars['t'] == 0): 93 | self.lr = self.lr / 10.0 94 | self.optimizer.change_optim_param(**{'lr': self.lr}) 95 | self.actor.compile( 96 | loss='mse', 97 | optimizer=self.optimizer.keras_optimizer) 98 | self.critic.compile( 99 | loss='mse', 100 | optimizer=self.optimizer.keras_optimizer) 101 | logger.info( 102 | 'Actor and critic models recompiled with new settings: ' 103 | 'Learning rate: {}'.format(self.lr)) 104 | 105 | def train_critic(self, minibatch): 106 | Q_vals = np.clip(self.critic.predict(minibatch['states']), 107 | -self.clip_val, self.clip_val) 108 | Q_next_vals = np.clip(self.critic.predict(minibatch['next_states']), 109 | -self.clip_val, self.clip_val) 110 | Q_targets = minibatch['rewards'] + self.gamma * \ 111 | (1 - minibatch['terminals']) * Q_next_vals.squeeze() 112 | Q_targets = np.expand_dims(Q_targets, axis=1) 113 | 114 | actor_delta = Q_next_vals - Q_vals 115 | loss = 
self.critic.train_on_batch(minibatch['states'], Q_targets) 116 | 117 | # update memory, needed for PER 118 | errors = abs(np.sum(Q_vals - Q_targets, axis=1)) 119 | # Q size is only 1, from critic 120 | assert Q_targets.shape == (self.batch_size, 1) 121 | assert errors.shape == (self.batch_size, ) 122 | self.memory.update(errors) 123 | return loss, actor_delta 124 | 125 | def train_actor(self, minibatch, actor_delta): 126 | old_vals = self.actor.predict(minibatch['states']) 127 | if self.env_spec['actions'] == 'continuous': 128 | A_targets = np.zeros( 129 | (actor_delta.shape[0], self.env_spec['action_dim'])) 130 | for j in range(A_targets.shape[1]): 131 | A_targets[:, j] = actor_delta.squeeze() 132 | else: 133 | A_targets = minibatch['actions'] * actor_delta + \ 134 | (1 - minibatch['actions']) * old_vals 135 | 136 | loss = self.actor.train_on_batch(minibatch['states'], A_targets) 137 | return loss 138 | 139 | def train_an_epoch(self): 140 | minibatch = self.memory.rand_minibatch(self.batch_size) 141 | critic_loss, actor_delta = self.train_critic(minibatch) 142 | actor_loss = self.train_actor(minibatch, actor_delta) 143 | return critic_loss + actor_loss 144 | -------------------------------------------------------------------------------- /rl/agent/base_agent.py: -------------------------------------------------------------------------------- 1 | from rl.util import logger 2 | 3 | 4 | class Agent(object): 5 | 6 | ''' 7 | The base class of Agent, with the core methods 8 | ''' 9 | 10 | def __init__(self, env_spec, 11 | **kwargs): # absorb generic param without breaking 12 | self.env_spec = env_spec 13 | 14 | def compile(self, memory, optimizer, policy, preprocessor): 15 | # set 2 way references 16 | self.memory = memory 17 | self.optimizer = optimizer 18 | self.policy = policy 19 | self.preprocessor = preprocessor 20 | # back references 21 | setattr(memory, 'agent', self) 22 | setattr(optimizer, 'agent', self) 23 | setattr(policy, 'agent', self) 24 | setattr(preprocessor, 'agent', self) 25 | self.compile_model() 26 | logger.info( 27 | 'Compiled:\nAgent, Memory, Optimizer, Policy, ' 28 | 'Preprocessor:\n{}'.format( 29 | ', '.join([comp.__class__.__name__ for comp in 30 | [self, memory, optimizer, policy, preprocessor]]) 31 | )) 32 | 33 | def build_model(self): 34 | raise NotImplementedError() 35 | 36 | def compile_model(self): 37 | raise NotImplementedError() 38 | 39 | def select_action(self, state): 40 | self.policy.select_action(state) 41 | raise NotImplementedError() 42 | 43 | def update(self, sys_vars): 44 | '''Agent update apart from training the Q function''' 45 | self.policy.update(sys_vars) 46 | raise NotImplementedError() 47 | 48 | def to_train(self, sys_vars): 49 | raise NotImplementedError() 50 | 51 | def train(self, sys_vars): 52 | raise NotImplementedError() 53 | -------------------------------------------------------------------------------- /rl/agent/conv_dqn.py: -------------------------------------------------------------------------------- 1 | import math 2 | from rl.agent.dqn import DQN 3 | 4 | 5 | class ConvDQN(DQN): 6 | 7 | def __init__(self, *args, **kwargs): 8 | from keras.layers.core import Dense, Flatten 9 | from keras.layers.convolutional import Convolution2D 10 | from keras import backend as K 11 | if K.backend() == 'theano': 12 | K.set_image_dim_ordering('tf') 13 | self.Dense = Dense 14 | self.Flatten = Flatten 15 | self.Convolution2D = Convolution2D 16 | 17 | self.kernel = 4 18 | self.stride = (2, 2) 19 | super(ConvDQN, self).__init__(*args, **kwargs) 20 | 21 | 
def build_hidden_layers(self, model): 22 | ''' 23 | build the hidden layers into model using parameter self.hidden_layers 24 | Auto architecture infers the size of the hidden layers from the number 25 | of channels in the first hidden layer and number of layers 26 | With each successive layer the number of channels is doubled 27 | Kernel size is fixed at 4, and stride at (2, 2) 28 | No new layers are added if the cols or rows have dim <= 5 29 | Enables hyperparameter optimization over network architecture 30 | ''' 31 | if self.auto_architecture: 32 | num_channels = self.num_initial_channels 33 | cols = self.env_spec['state_dim'][0] 34 | rows = self.env_spec['state_dim'][1] 35 | # input layer 36 | model.add( 37 | self.Convolution2D( 38 | num_channels, 39 | self.kernel, 40 | self.kernel, 41 | subsample=self.stride, 42 | input_shape=self.env_spec['state_dim'], 43 | activation=self.hidden_layers_activation, 44 | # border_mode='same', 45 | init='lecun_uniform')) 46 | 47 | for i in range(1, self.num_hidden_layers): 48 | num_channels *= 2 49 | cols = math.ceil( 50 | math.floor(cols - self.kernel - 1) / self.stride[0]) + 1 51 | rows = math.ceil( 52 | math.floor(rows - self.kernel - 1) / self.stride[1]) + 1 53 | if cols > 5 and rows > 5: 54 | model.add( 55 | self.Convolution2D( 56 | num_channels, 57 | self.kernel, 58 | self.kernel, 59 | subsample=self.stride, 60 | activation=self.hidden_layers_activation, 61 | # border_mode='same', 62 | init='lecun_uniform')) 63 | else: 64 | # stop addition of too many layers 65 | # and from breakage by cols, rows growing to 0 66 | break 67 | 68 | else: 69 | model.add( 70 | self.Convolution2D( 71 | self.hidden_layers[0][0], 72 | self.hidden_layers[0][1], 73 | self.hidden_layers[0][2], 74 | subsample=self.hidden_layers[0][3], 75 | input_shape=self.env_spec['state_dim'], 76 | activation=self.hidden_layers_activation, 77 | # border_mode='same', 78 | init='lecun_uniform')) 79 | 80 | if (len(self.hidden_layers) > 1): 81 | for i in range(1, len(self.hidden_layers)): 82 | model.add( 83 | self.Convolution2D( 84 | self.hidden_layers[i][0], 85 | self.hidden_layers[i][1], 86 | self.hidden_layers[i][2], 87 | subsample=self.hidden_layers[i][3], 88 | activation=self.hidden_layers_activation, 89 | # border_mode='same', 90 | init='lecun_uniform')) 91 | 92 | model.add(self.Flatten()) 93 | model.add(self.Dense(256, 94 | init='lecun_uniform', 95 | activation=self.hidden_layers_activation)) 96 | 97 | return model 98 | -------------------------------------------------------------------------------- /rl/agent/ddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | from rl.util import logger, clone_model 4 | 5 | 6 | class Actor(DQN): 7 | ''' 8 | Actor of DDPG, with its network and target network 9 | input is states, output is action 10 | very similar to DQN 11 | ''' 12 | 13 | def __init__(self, *args, tau=0.001, **kwargs): 14 | from keras import backend as K 15 | self.K = K 16 | self.tf = self.K.tf 17 | self.sess = self.K.get_session() 18 | self.tau = tau 19 | super(Actor, self).__init__(*args, **kwargs) 20 | 21 | def build_model(self): 22 | self.model = super(Actor, self).build_model() 23 | self.target_model = clone_model(self.model) 24 | 25 | self.actor_states = self.model.inputs[0] 26 | self.out = self.model.output 27 | self.scaled_out = self.tf.multiply( 28 | self.out, self.env_spec['action_bound_high']) 29 | self.network_params = self.model.trainable_weights 30 | 31 | 
self.target_actor_states = self.target_model.inputs[0] 32 | self.target_out = self.target_model.output 33 | self.target_scaled_out = self.tf.multiply( 34 | self.target_out, self.env_spec['action_bound_high']) 35 | self.target_network_params = self.target_model.trainable_weights 36 | 37 | # Op for updating target network 38 | self.update_target_network_op = [] 39 | for i, t_w in enumerate(self.target_network_params): 40 | op = t_w.assign( 41 | self.tf.multiply( 42 | self.tau, self.network_params[i] 43 | ) + self.tf.multiply(1. - self.tau, t_w)) 44 | self.update_target_network_op.append(op) 45 | 46 | # will be fed as self.action_gradient: critic_grads 47 | self.action_gradient = self.tf.placeholder( 48 | self.tf.float32, [None, self.env_spec['action_dim']]) 49 | 50 | # actor model gradient op, to be fed from critic 51 | self.actor_gradients = self.tf.gradients( 52 | self.scaled_out, self.network_params, -self.action_gradient) 53 | 54 | # Optimization op 55 | self.optimize = self.tf.train.AdamOptimizer(self.lr).apply_gradients( 56 | zip(self.actor_gradients, self.network_params)) 57 | return self.model 58 | 59 | def compile_model(self): 60 | pass 61 | 62 | def recompile_model(self, sys_vars): 63 | pass 64 | 65 | def update(self, sys_vars): 66 | self.sess.run(self.update_target_network_op) 67 | 68 | def predict(self, states): 69 | return self.sess.run(self.scaled_out, feed_dict={ 70 | self.actor_states: states 71 | }) 72 | 73 | def target_predict(self, next_states): 74 | return self.sess.run(self.target_scaled_out, feed_dict={ 75 | self.target_actor_states: next_states 76 | }) 77 | 78 | def train_tf(self, states, critic_action_gradient): 79 | return self.sess.run(self.optimize, feed_dict={ 80 | self.actor_states: states, 81 | self.action_gradient: critic_action_gradient 82 | }) 83 | 84 | 85 | class Critic(DQN): 86 | 87 | ''' 88 | Critic of DDPG, with its network and target network 89 | input is states and actions, output is Q value 90 | the action is from Actor 91 | ''' 92 | 93 | def __init__(self, *args, tau=0.001, critic_lr=0.001, **kwargs): 94 | from keras.layers import Dense, Merge 95 | from keras import backend as K 96 | self.Dense = Dense 97 | self.Merge = Merge 98 | self.K = K 99 | self.tf = self.K.tf 100 | self.sess = self.K.get_session() 101 | self.tau = tau 102 | self.critic_lr = critic_lr # suggestion: 10 x actor_lr 103 | super(Critic, self).__init__(*args, **kwargs) 104 | 105 | def build_critic_models(self): 106 | state_branch = self.Sequential() 107 | state_branch.add(self.Dense( 108 | self.hidden_layers[0], 109 | input_shape=(self.env_spec['state_dim'],), 110 | activation=self.hidden_layers_activation, 111 | init='lecun_uniform')) 112 | 113 | action_branch = self.Sequential() 114 | action_branch.add(self.Dense( 115 | self.hidden_layers[0], 116 | input_shape=(self.env_spec['action_dim'],), 117 | activation=self.hidden_layers_activation, 118 | init='lecun_uniform')) 119 | 120 | input_layer = self.Merge([state_branch, action_branch], mode='concat') 121 | 122 | model = self.Sequential() 123 | model.add(input_layer) 124 | 125 | if (len(self.hidden_layers) > 1): 126 | for i in range(1, len(self.hidden_layers)): 127 | model.add(self.Dense( 128 | self.hidden_layers[i], 129 | init='lecun_uniform', 130 | activation=self.hidden_layers_activation)) 131 | 132 | model.add(self.Dense(1, 133 | init='lecun_uniform', 134 | activation='linear')) # fixed 135 | logger.info('Critic model summary') 136 | model.summary() 137 | self.model = model 138 | 139 | logger.info("Model built") 140 | return 
self.model 141 | 142 | def build_model(self): 143 | self.model = self.build_critic_models() 144 | self.target_model = clone_model(self.model) 145 | 146 | self.critic_states = self.model.inputs[0] 147 | self.critic_actions = self.model.inputs[1] 148 | self.out = self.model.output 149 | self.network_params = self.model.trainable_weights 150 | 151 | self.target_critic_states = self.target_model.inputs[0] 152 | self.target_critic_actions = self.target_model.inputs[1] 153 | self.target_out = self.target_model.output 154 | self.target_network_params = self.target_model.trainable_weights 155 | 156 | # Op for updating target network 157 | self.update_target_network_op = [] 158 | for i, t_w in enumerate(self.target_network_params): 159 | op = t_w.assign( 160 | self.tf.multiply( 161 | self.tau, self.network_params[i] 162 | ) + self.tf.multiply(1. - self.tau, t_w)) 163 | self.update_target_network_op.append(op) 164 | 165 | # custom loss and optimization Op 166 | self.y = self.tf.placeholder(self.tf.float32, [None, 1]) 167 | self.loss = self.tf.losses.mean_squared_error(self.y, self.out) 168 | self.optimize = self.tf.train.AdamOptimizer( 169 | self.critic_lr).minimize(self.loss) 170 | 171 | self.action_gradient = self.tf.gradients(self.out, self.critic_actions) 172 | return self.model 173 | 174 | def update(self, sys_vars): 175 | self.sess.run(self.update_target_network_op) 176 | 177 | def get_action_gradient(self, states, actions): 178 | return self.sess.run(self.action_gradient, feed_dict={ 179 | self.critic_states: states, 180 | self.critic_actions: actions 181 | })[0] 182 | 183 | # def predict(self, inputs, action): 184 | # return self.sess.run(self.out, feed_dict={ 185 | # self.critic_states: inputs, 186 | # self.critic_actions: action 187 | # }) 188 | 189 | def target_predict(self, next_states, mu_prime): 190 | return self.sess.run(self.target_out, feed_dict={ 191 | self.target_critic_states: next_states, 192 | self.target_critic_actions: mu_prime 193 | }) 194 | 195 | def train_tf(self, states, actions, y): 196 | return self.sess.run([self.out, self.optimize, self.loss], feed_dict={ 197 | self.critic_states: states, 198 | self.critic_actions: actions, 199 | self.y: y 200 | }) 201 | 202 | 203 | class DDPG(DQN): 204 | 205 | ''' 206 | DDPG Algorithm, from https://arxiv.org/abs/1509.02971 207 | has Actor, Critic, and each has its own target network 208 | Implementation referred from https://github.com/pemami4911/deep-rl 209 | ''' 210 | 211 | def __init__(self, *args, **kwargs): 212 | # import only when needed to contain side-effects 213 | from keras import backend as K 214 | self.K = K 215 | self.sess = self.K.get_session() 216 | self.actor = Actor(*args, **kwargs) 217 | self.critic = Critic(*args, **kwargs) 218 | self.sess.run(self.K.tf.global_variables_initializer()) 219 | super(DDPG, self).__init__(*args, **kwargs) 220 | 221 | def build_model(self): 222 | pass 223 | 224 | def compile_model(self): 225 | pass 226 | 227 | def recompile_model(self, sys_vars): 228 | pass 229 | 230 | def select_action(self, state): 231 | return self.policy.select_action(state) 232 | 233 | def update(self, sys_vars): 234 | # Update target networks 235 | self.actor.update(sys_vars) 236 | self.critic.update(sys_vars) 237 | self.policy.update(sys_vars) 238 | self.update_n_epoch(sys_vars) 239 | 240 | def train_an_epoch(self): 241 | minibatch = self.memory.rand_minibatch(self.batch_size) 242 | 243 | # train critic 244 | mu_prime = self.actor.target_predict(minibatch['next_states']) 245 | q_val = 
self.critic.target_predict(minibatch['states'], mu_prime) 246 | q_prime = self.critic.target_predict( 247 | minibatch['next_states'], mu_prime) 248 | # reshape for element-wise multiplication 249 | # to feed into network, y shape needs to be (?, 1) 250 | y = minibatch['rewards'] + self.gamma * \ 251 | (1 - minibatch['terminals']) * np.reshape(q_prime, (-1)) 252 | y = np.reshape(y, (-1, 1)) 253 | 254 | # update memory, needed for PER 255 | errors = abs(np.sum(q_val - y, axis=1)) 256 | # Q size is only 1, from critic 257 | assert y.shape == (self.batch_size, 1) 258 | assert errors.shape == (self.batch_size, ) 259 | self.memory.update(errors) 260 | 261 | _, _, critic_loss = self.critic.train_tf( 262 | minibatch['states'], minibatch['actions'], y) 263 | 264 | # train actor 265 | # Update the actor policy using the sampled gradient 266 | actions = self.actor.predict(minibatch['states']) 267 | critic_action_gradient = self.critic.get_action_gradient( 268 | minibatch['states'], actions) 269 | # currently cant be gotten 270 | _actorloss = self.actor.train_tf( 271 | minibatch['states'], critic_action_gradient) 272 | 273 | loss = critic_loss 274 | return loss 275 | -------------------------------------------------------------------------------- /rl/agent/deep_exp_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.deep_sarsa import DeepSarsa 3 | 4 | 5 | class DeepExpectedSarsa(DeepSarsa): 6 | 7 | ''' 8 | Deep Expected Sarsa agent. 9 | On policy, with updates after each experience 10 | Policy = epsilonGreedyPolicy 11 | ''' 12 | 13 | def compute_Q_states(self, minibatch): 14 | (Q_states, Q_next_states, _max) = super( 15 | DeepExpectedSarsa, self).compute_Q_states(minibatch) 16 | 17 | curr_e = self.policy.e 18 | curr_e_per_a = curr_e / self.env_spec['action_dim'] 19 | 20 | Q_next_states_max = np.amax(Q_next_states, axis=1) 21 | Q_next_states_selected = (1 - curr_e) * Q_next_states_max + \ 22 | np.sum(Q_next_states * curr_e_per_a, axis=1) 23 | return (Q_states, Q_next_states, Q_next_states_selected) 24 | -------------------------------------------------------------------------------- /rl/agent/deep_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | 4 | 5 | class DeepSarsa(DQN): 6 | 7 | ''' 8 | Deep Sarsa agent. 
9 | On policy, with updates after each experience 10 | Policy = epsilonGreedyPolicy 11 | ''' 12 | 13 | def __init__(self, *args, **kwargs): 14 | super(DeepSarsa, self).__init__(*args, **kwargs) 15 | self.train_per_n_new_exp = 1 16 | self.batch_size = 1 17 | self.n_epoch = 1 18 | self.final_n_epoch = 1 19 | 20 | def compute_Q_states(self, minibatch): 21 | (Q_states, Q_next_states, _max) = super( 22 | DeepSarsa, self).compute_Q_states(minibatch) 23 | next_action = self.select_action(minibatch['next_states'][0]) 24 | Q_next_states_selected = Q_next_states[:, next_action] 25 | return (Q_states, Q_next_states, Q_next_states_selected) 26 | 27 | def train_an_epoch(self): 28 | minibatch = self.memory.pop() 29 | (Q_states, _next, Q_next_states_selected 30 | ) = self.compute_Q_states(minibatch) 31 | Q_targets = self.compute_Q_targets( 32 | minibatch, Q_states, Q_next_states_selected) 33 | loss = self.model.train_on_batch(minibatch['states'], Q_targets) 34 | 35 | errors = abs(np.sum(Q_states - Q_targets, axis=1)) 36 | assert Q_targets.shape == ( 37 | self.batch_size, self.env_spec['action_dim']) 38 | assert errors.shape == (self.batch_size, ) 39 | self.memory.update(errors) 40 | return loss 41 | -------------------------------------------------------------------------------- /rl/agent/double_conv_dqn.py: -------------------------------------------------------------------------------- 1 | from rl.agent.conv_dqn import ConvDQN 2 | from rl.agent.double_dqn import DoubleDQN 3 | 4 | 5 | class DoubleConvDQN(DoubleDQN, ConvDQN): 6 | 7 | ''' 8 | The base class of double convolutional DQNs 9 | extended from DoubleDQN and ConvDQN 10 | multiple inheritance will use the method from the first class 11 | if multiple ones exists 12 | ''' 13 | 14 | def build_hidden_layers(self, model): 15 | ConvDQN.build_hidden_layers(self, model) 16 | -------------------------------------------------------------------------------- /rl/agent/double_dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | from rl.util import logger, clone_model, clone_optimizer 4 | 5 | 6 | class DoubleDQN(DQN): 7 | 8 | ''' 9 | The base class of double DQNs 10 | ''' 11 | 12 | def build_model(self): 13 | super(DoubleDQN, self).build_model() 14 | 15 | model_2 = clone_model(self.model) 16 | logger.info("Model 2 summary") 17 | model_2.summary() 18 | self.model_2 = model_2 19 | 20 | logger.info("Models 1 and 2 built") 21 | return self.model, self.model_2 22 | 23 | def compile_model(self): 24 | self.optimizer.keras_optimizer_2 = clone_optimizer( 25 | self.optimizer.keras_optimizer) 26 | self.model.compile( 27 | loss='mse', 28 | optimizer=self.optimizer.keras_optimizer) 29 | self.model_2.compile( 30 | loss='mse', 31 | optimizer=self.optimizer.keras_optimizer_2) 32 | logger.info("Models 1 and 2 compiled") 33 | 34 | def switch_models(self): 35 | # Switch model 1 and model 2, also the optimizers 36 | temp = self.model 37 | self.model = self.model_2 38 | self.model_2 = temp 39 | 40 | temp_optimizer = self.optimizer.keras_optimizer 41 | self.optimizer.keras_optimizer = self.optimizer.keras_optimizer_2 42 | self.optimizer.keras_optimizer_2 = temp_optimizer 43 | 44 | # def recompile_model(self, sys_vars): 45 | # '''rotate and recompile both models''' 46 | # # TODO fix this, double recompile breaks solving power 47 | # if self.epi_change_lr is not None: 48 | # self.switch_models() # to model_2 49 | # super(DoubleDQN, self).recompile_model(sys_vars) 50 | # self.switch_models() 
# back to model 51 | # super(DoubleDQN, self).recompile_model(sys_vars) 52 | # return self.model 53 | 54 | def compute_Q_states(self, minibatch): 55 | (Q_states, Q_next_states_select, _max) = super( 56 | DoubleDQN, self).compute_Q_states(minibatch) 57 | # Different from (single) dqn: Select max using model 2 58 | Q_next_states_max_ind = np.argmax(Q_next_states_select, axis=1) 59 | # same as dqn again, but use Q_next_states_max_ind above 60 | Q_next_states = np.clip( 61 | self.model_2.predict(minibatch['next_states']), 62 | -self.clip_val, self.clip_val) 63 | rows = np.arange(Q_next_states_max_ind.shape[0]) 64 | Q_next_states_max = Q_next_states[rows, Q_next_states_max_ind] 65 | 66 | return (Q_states, Q_next_states, Q_next_states_max) 67 | 68 | def train_an_epoch(self): 69 | self.switch_models() 70 | return super(DoubleDQN, self).train_an_epoch() 71 | -------------------------------------------------------------------------------- /rl/agent/dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.base_agent import Agent 3 | from rl.util import logger, log_self 4 | 5 | 6 | class DQN(Agent): 7 | 8 | ''' 9 | The base class of DQNs, with the core methods 10 | The simplest deep Q network, 11 | with epsilon-greedy method and 12 | Bellman equation for value, using neural net. 13 | ''' 14 | 15 | def __init__(self, env_spec, 16 | train_per_n_new_exp=1, 17 | gamma=0.95, lr=0.1, 18 | epi_change_lr=None, 19 | batch_size=16, n_epoch=5, hidden_layers=None, 20 | hidden_layers_activation='sigmoid', 21 | output_layer_activation='linear', 22 | auto_architecture=False, 23 | num_hidden_layers=3, 24 | first_hidden_layer_size=256, 25 | num_initial_channels=16, 26 | **kwargs): # absorb generic param without breaking 27 | # import only when needed to contain side-effects 28 | from keras.layers.core import Dense 29 | from keras.models import Sequential, load_model 30 | self.Dense = Dense 31 | self.Sequential = Sequential 32 | self.load_model = load_model 33 | 34 | super(DQN, self).__init__(env_spec) 35 | 36 | self.train_per_n_new_exp = train_per_n_new_exp 37 | self.gamma = gamma 38 | self.lr = lr 39 | self.epi_change_lr = epi_change_lr 40 | self.batch_size = batch_size 41 | self.n_epoch = 1 42 | self.final_n_epoch = n_epoch 43 | self.hidden_layers = hidden_layers or [4] 44 | self.hidden_layers_activation = hidden_layers_activation 45 | self.output_layer_activation = output_layer_activation 46 | self.clip_val = 10000 47 | self.auto_architecture = auto_architecture 48 | self.num_hidden_layers = num_hidden_layers 49 | self.first_hidden_layer_size = first_hidden_layer_size 50 | self.num_initial_channels = num_initial_channels 51 | log_self(self) 52 | self.build_model() 53 | 54 | def build_hidden_layers(self, model): 55 | ''' 56 | build the hidden layers into model using parameter self.hidden_layers 57 | ''' 58 | 59 | # Auto architecture infers the size of the hidden layers from the size 60 | # of the first layer. 
Each successive hidden layer is half the size of the 61 | # previous layer 62 | # Enables hyperparameter optimization over network architecture 63 | if self.auto_architecture: 64 | curr_layer_size = self.first_hidden_layer_size 65 | model.add(self.Dense(curr_layer_size, 66 | input_shape=(self.env_spec['state_dim'],), 67 | activation=self.hidden_layers_activation, 68 | init='lecun_uniform')) 69 | 70 | curr_layer_size = int(curr_layer_size / 2) 71 | for i in range(1, self.num_hidden_layers): 72 | model.add(self.Dense(curr_layer_size, 73 | init='lecun_uniform', 74 | activation=self.hidden_layers_activation)) 75 | curr_layer_size = int(curr_layer_size / 2) 76 | 77 | else: 78 | model.add(self.Dense(self.hidden_layers[0], 79 | input_shape=(self.env_spec['state_dim'],), 80 | activation=self.hidden_layers_activation, 81 | init='lecun_uniform')) 82 | # inner hidden layer: no specification of input shape 83 | if (len(self.hidden_layers) > 1): 84 | for i in range(1, len(self.hidden_layers)): 85 | model.add(self.Dense( 86 | self.hidden_layers[i], 87 | init='lecun_uniform', 88 | activation=self.hidden_layers_activation)) 89 | 90 | return model 91 | 92 | def build_model(self): 93 | model = self.Sequential() 94 | self.build_hidden_layers(model) 95 | model.add(self.Dense(self.env_spec['action_dim'], 96 | init='lecun_uniform', 97 | activation=self.output_layer_activation)) 98 | logger.info("Model summary") 99 | model.summary() 100 | self.model = model 101 | 102 | logger.info("Model built") 103 | return self.model 104 | 105 | def compile_model(self): 106 | self.model.compile( 107 | loss='mse', 108 | optimizer=self.optimizer.keras_optimizer) 109 | logger.info("Model compiled") 110 | 111 | def recompile_model(self, sys_vars): 112 | ''' 113 | Option to change model optimizer settings 114 | Currently only used for changing the learning rate 115 | Compiling does not affect the model weights 116 | ''' 117 | if self.epi_change_lr is not None: 118 | if (sys_vars['epi'] == self.epi_change_lr and 119 | sys_vars['t'] == 0): 120 | self.lr = self.lr / 10.0 121 | self.optimizer.change_optim_param(**{'lr': self.lr}) 122 | self.model.compile( 123 | loss='mse', 124 | optimizer=self.optimizer.keras_optimizer) 125 | logger.info('Model recompiled with new settings: ' 126 | 'Learning rate: {}'.format(self.lr)) 127 | return self.model 128 | 129 | def update_n_epoch(self, sys_vars): 130 | ''' 131 | Increase epochs at the beginning of each session, 132 | for training for later episodes, 133 | once it has more experience 134 | Best so far, increment num epochs every 2 up to a max of 5 135 | ''' 136 | if (self.n_epoch < self.final_n_epoch and 137 | sys_vars['t'] == 0 and 138 | sys_vars['epi'] % 2 == 0): 139 | self.n_epoch += 1 140 | return self.n_epoch 141 | 142 | def select_action(self, state): 143 | '''epsilon-greedy method''' 144 | return self.policy.select_action(state) 145 | 146 | def update(self, sys_vars): 147 | ''' 148 | Agent update apart from training the Q function 149 | ''' 150 | self.policy.update(sys_vars) 151 | self.update_n_epoch(sys_vars) 152 | self.recompile_model(sys_vars) 153 | 154 | def to_train(self, sys_vars): 155 | ''' 156 | return boolean condition if agent should train 157 | get n NEW experiences before training model 158 | ''' 159 | t = sys_vars['t'] 160 | done = sys_vars['done'] 161 | timestep_limit = self.env_spec['timestep_limit'] 162 | return (t > 0) and bool( 163 | t % self.train_per_n_new_exp == 0 or 164 | t == (timestep_limit-1) or 165 | done) 166 | 167 | def compute_Q_states(self, minibatch): 168 | 
# note the computed values below are batched in array 169 | Q_states = np.clip(self.model.predict(minibatch['states']), 170 | -self.clip_val, self.clip_val) 171 | Q_next_states = np.clip(self.model.predict(minibatch['next_states']), 172 | -self.clip_val, self.clip_val) 173 | Q_next_states_max = np.amax(Q_next_states, axis=1) 174 | return (Q_states, Q_next_states, Q_next_states_max) 175 | 176 | def compute_Q_targets(self, minibatch, Q_states, Q_next_states_max): 177 | # make future reward 0 if exp is terminal 178 | Q_targets_a = minibatch['rewards'] + self.gamma * \ 179 | (1 - minibatch['terminals']) * Q_next_states_max 180 | # set batch Q_targets of a as above, the rest as is 181 | # minibatch['actions'] is one-hot encoded 182 | Q_targets = minibatch['actions'] * Q_targets_a[:, np.newaxis] + \ 183 | (1 - minibatch['actions']) * Q_states 184 | return Q_targets 185 | 186 | def train_an_epoch(self): 187 | minibatch = self.memory.rand_minibatch(self.batch_size) 188 | 189 | (Q_states, _states, Q_next_states_max) = self.compute_Q_states( 190 | minibatch) 191 | Q_targets = self.compute_Q_targets( 192 | minibatch, Q_states, Q_next_states_max) 193 | loss = self.model.train_on_batch(minibatch['states'], Q_targets) 194 | 195 | errors = abs(np.sum(Q_states - Q_targets, axis=1)) 196 | assert Q_targets.shape == ( 197 | self.batch_size, self.env_spec['action_dim']) 198 | assert errors.shape == (self.batch_size, ) 199 | self.memory.update(errors) 200 | return loss 201 | 202 | def train(self, sys_vars): 203 | ''' 204 | Training is for the Q function (NN) only 205 | otherwise (e.g. policy) see self.update() 206 | step 1,2,3,4 of algo. 207 | ''' 208 | loss_total = 0 209 | for _epoch in range(self.n_epoch): 210 | loss = self.train_an_epoch() 211 | loss_total += loss 212 | avg_loss = loss_total / self.n_epoch 213 | sys_vars['loss'].append(avg_loss) 214 | return avg_loss 215 | 216 | def save(self, model_path, global_step=None): 217 | logger.info('Saving model checkpoint') 218 | self.model.save_weights(model_path) 219 | 220 | def restore(self, model_path): 221 | self.model.load_weights(model_path, by_name=False) 222 | -------------------------------------------------------------------------------- /rl/agent/freeze_dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.double_dqn import DoubleDQN 3 | from rl.agent.dqn import DQN 4 | from rl.util import logger, clone_model 5 | 6 | 7 | class FreezeDQN(DoubleDQN): 8 | 9 | ''' 10 | Extends DQN agent to freeze target Q network 11 | and periodically update them to the weights of the 12 | exploration model 13 | Avoids oscillations and breaks correlation 14 | between Q-network and target 15 | http://www0.cs.ucl.ac.uk/staff/d.silver/web/Resources_files/deep_rl.pdf 16 | Exploration model periodically cloned into target Q network 17 | ''' 18 | 19 | def compute_Q_states(self, minibatch): 20 | Q_states = np.clip(self.model.predict(minibatch['states']), 21 | -self.clip_val, self.clip_val) 22 | Q_next_states = np.clip(self.model_2.predict(minibatch['next_states']), 23 | -self.clip_val, self.clip_val) 24 | Q_next_states_max = np.amax(Q_next_states, axis=1) 25 | return (Q_states, Q_next_states, Q_next_states_max) 26 | 27 | def train_an_epoch(self): 28 | # Should call DQN to train an epoch, not DoubleDQN 29 | return DQN.train_an_epoch(self) 30 | 31 | def update_target_model(self): 32 | # Also, loading logic seems off 33 | self.model_2 = clone_model(self.model) 34 | logger.debug("Updated target model 
weights") 35 | 36 | def update(self, sys_vars): 37 | ''' 38 | Agent update apart from training the Q function 39 | ''' 40 | done = sys_vars['done'] 41 | timestep_check = sys_vars['t'] == (self.env_spec['timestep_limit'] - 1) 42 | if done or timestep_check: 43 | self.update_target_model() 44 | super(FreezeDQN, self).update(sys_vars) 45 | -------------------------------------------------------------------------------- /rl/agent/offpol_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | 4 | 5 | class OffPolicySarsa(DQN): 6 | 7 | ''' 8 | Deep Sarsa agent. 9 | Off policy. Reduces to Q learning when eval_e = 0 10 | Evaluation policy = epsilonGreedyPolicy, eval_e = 0.05 11 | Experience generating policy = Boltzmann or 12 | EpsilonGreedy with annealing 13 | ''' 14 | 15 | def __init__(self, *args, **kwargs): 16 | super(OffPolicySarsa, self).__init__(*args, **kwargs) 17 | self.eval_e = 0.05 18 | 19 | def compute_Q_states(self, minibatch): 20 | (Q_states, Q_next_states, _max) = super( 21 | OffPolicySarsa, self).compute_Q_states(minibatch) 22 | 23 | e_per_action = self.eval_e / self.env_spec['action_dim'] 24 | 25 | Q_next_states_max = np.amax(Q_next_states, axis=1) 26 | Q_next_states_selected = (1 - self.eval_e) * Q_next_states_max + \ 27 | np.sum(Q_next_states * e_per_action, axis=1) 28 | return (Q_states, None, Q_next_states_selected) 29 | -------------------------------------------------------------------------------- /rl/agent/q_table.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.base_agent import Agent 3 | 4 | 5 | class Dummy(Agent): 6 | 7 | ''' 8 | A dummy agent that does random actions, for demo 9 | ''' 10 | 11 | def select_action(self, state): 12 | '''epsilon-greedy method''' 13 | action = np.random.choice(self.env_spec['actions']) 14 | return action 15 | 16 | def update(self, sys_vars): 17 | return 18 | 19 | def to_train(self, sys_vars): 20 | return True 21 | 22 | def train(self, sys_vars): 23 | return 24 | 25 | def build_model(self): 26 | return 27 | 28 | def compile_model(self): 29 | return 30 | 31 | 32 | class QTable(Agent): 33 | 34 | ''' 35 | The simplest Q learner - a table, 36 | with epsilon-greedy method and 37 | Bellman equation for value. 
38 | ''' 39 | 40 | def __init__(self, env_spec, 41 | resolution=10, 42 | gamma=0.95, lr=0.1, 43 | init_e=1.0, final_e=0.1, exploration_anneal_episodes=1000, 44 | **kwargs): # absorb generic param without breaking 45 | super(QTable, self).__init__(env_spec) 46 | self.resolution = resolution 47 | self.gamma = gamma 48 | self.lr = lr 49 | self.init_e = init_e 50 | self.final_e = final_e 51 | self.e = self.init_e 52 | self.exploration_anneal_episodes = exploration_anneal_episodes 53 | self.build_model() 54 | 55 | def build_model(self): 56 | ''' 57 | init the 2D qtable by 58 | bijecting the state space into pixelated, flattened vector 59 | multiplied with 60 | list of possible discrete actions 61 | ''' 62 | self.pixelate_state_space(self.resolution) 63 | flat_state_size = self.resolution ** self.env_spec['state_dim'] 64 | self.qtable = np.random.uniform( 65 | low=-1, high=1, 66 | size=(flat_state_size, self.env_spec['action_dim'])) 67 | return self.qtable 68 | 69 | def compile_model(self): 70 | return 71 | 72 | def pixelate_state_space(self, resolution=10): 73 | '''chunk up the state space hypercube to specified resolution''' 74 | state_bounds = np.transpose( 75 | [self.env_spec['state_bound_low'], 76 | self.env_spec['state_bound_high']]) 77 | self.state_pixels = [np.linspace(*sb, num=resolution+1) 78 | for sb in state_bounds] 79 | return self.state_pixels 80 | 81 | def flatten_state(self, state): 82 | ''' 83 | collapse a hyperdim state by binning into state_pixels 84 | then flattening the pixel_state into 1-dim bijection 85 | ''' 86 | val_space_pairs = list(zip(state, self.state_pixels)) 87 | pixel_state = [np.digitize(*val_space) 88 | for val_space in val_space_pairs] # binning 89 | flat_state = int("".join([str(ps) for ps in pixel_state])) 90 | return flat_state 91 | 92 | def select_action(self, state): 93 | '''epsilon-greedy method''' 94 | if self.e > np.random.rand(): 95 | action = np.random.choice(self.env_spec['actions']) 96 | else: 97 | flat_state = self.flatten_state(state) 98 | action = np.argmax(self.qtable[flat_state, :]) 99 | return action 100 | 101 | def update_e(self): 102 | '''strategy to update epsilon''' 103 | self.e = max(self.e - 104 | (self.init_e - self.final_e) / 105 | float(self.exploration_anneal_episodes), 106 | self.final_e) 107 | return self.e 108 | 109 | def update(self, sys_vars): 110 | self.update_e() 111 | 112 | def to_train(self, sys_vars): 113 | return True 114 | 115 | def train(self, sys_vars): 116 | ''' 117 | run the basic bellman equation update 118 | ''' 119 | last_exp = self.memory.pop() 120 | state = last_exp['states'][0] 121 | flat_state = self.flatten_state(state) 122 | next_state = last_exp['next_states'][0] 123 | next_flat_state = self.flatten_state(next_state) 124 | action = np.argmax(last_exp['actions'][0]) # from one-hot 125 | reward = last_exp['rewards'][0] 126 | Q_state_action = self.qtable[flat_state, action] 127 | Q_next_state = self.qtable[next_flat_state, :] 128 | Q_next_state_max = np.amax(Q_next_state) 129 | loss = (reward + self.gamma * Q_next_state_max - Q_state_action) 130 | sys_vars['loss'].append(loss) 131 | 132 | self.qtable[flat_state, action] = Q_state_action + \ 133 | self.lr * loss 134 | return self.qtable 135 | -------------------------------------------------------------------------------- /rl/analytics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pandas as pd 4 | import platform 5 | import warnings 6 | from os import environ 7 | from 
rl.util import * 8 | 9 | warnings.filterwarnings("ignore", module="matplotlib") 10 | 11 | if platform.system() == 'Darwin': 12 | MPL_BACKEND = 'agg' if args.param_selection else 'macosx' 13 | else: 14 | MPL_BACKEND = 'TkAgg' 15 | 16 | STATS_COLS = [ 17 | 'best_session_epi', 18 | 'best_session_id', 19 | 'best_session_mean_rewards', 20 | 'best_session_stability', 21 | 'fitness_score', 22 | 'mean_rewards_per_epi_stats_mean', 23 | 'mean_rewards_stats_mean', 24 | 'mean_rewards_stats_max', 25 | 'epi_stats_mean', 26 | 'epi_stats_min', 27 | 'solved_ratio_of_sessions', 28 | 'max_total_rewards_stats_mean', 29 | 'trial_id', 30 | ] 31 | 32 | EXPERIMENT_DATA_Y_COLS = [ 33 | 'fitness_score', 34 | 'mean_rewards_stats_max', 35 | 'max_total_rewards_stats_mean', 36 | 'epi_stats_min', 37 | ] 38 | 39 | 40 | # import matplotlib scoped to the class for gc in multiprocessing 41 | def scoped_mpl_import(): 42 | import matplotlib 43 | matplotlib.rcParams['backend'] = MPL_BACKEND 44 | 45 | import matplotlib.pyplot as plt 46 | plt.rcParams['toolbar'] = 'None' # mute matplotlib toolbar 47 | 48 | import seaborn as sns 49 | sns.set(style="whitegrid", color_codes=True, font_scale=1.0, 50 | rc={'lines.linewidth': 1.0, 51 | 'backend': matplotlib.rcParams['backend']}) 52 | palette = sns.color_palette("Blues_d") 53 | palette.reverse() 54 | sns.set_palette(palette) 55 | 56 | return (matplotlib, plt, sns) 57 | 58 | 59 | class Grapher(object): 60 | 61 | ''' 62 | Grapher object that belongs to a Session 63 | to draw graphs from its data 64 | ''' 65 | 66 | def __init__(self, session): 67 | if environ.get('CI'): 68 | return 69 | (_mpl, self.plt, _sns) = scoped_mpl_import() 70 | self.session = session 71 | self.graph_filename = self.session.graph_filename 72 | self.subgraphs = {} 73 | self.figure = self.plt.figure(facecolor='white', figsize=(8, 9)) 74 | self.figure.suptitle(wrap_text(self.session.session_id)) 75 | self.init_figure() 76 | 77 | def init_figure(self): 78 | # graph 1 79 | ax1 = self.figure.add_subplot( 80 | 311, 81 | frame_on=False, 82 | title="\n\ntotal rewards per episode", 83 | ylabel='total rewards') 84 | p1, = ax1.plot([], []) 85 | self.subgraphs['total rewards'] = (ax1, p1) 86 | 87 | ax1e = ax1.twinx() 88 | ax1e.set_ylabel('exploration rate').set_color('r') 89 | ax1e.set_frame_on(False) 90 | ax1e.grid(False) 91 | p1e, = ax1e.plot([], [], 'r') 92 | self.subgraphs['e'] = (ax1e, p1e) 93 | 94 | # graph 2 95 | ax2 = self.figure.add_subplot( 96 | 312, 97 | frame_on=False, 98 | title='mean rewards over last 100 episodes', 99 | ylabel='mean rewards') 100 | p2, = ax2.plot([], [], 'g') 101 | self.subgraphs['mean rewards'] = (ax2, p2) 102 | 103 | # graph 3 104 | ax3 = self.figure.add_subplot( 105 | 313, 106 | frame_on=False, 107 | title='loss over time, episode', 108 | ylabel='loss') 109 | p3, = ax3.plot([], []) 110 | self.subgraphs['loss'] = (ax3, p3) 111 | 112 | self.plt.tight_layout() # auto-fix spacing 113 | self.plt.ion() # for live plot 114 | 115 | def plot(self): 116 | '''do live plotting''' 117 | if environ.get('CI'): 118 | return 119 | sys_vars = self.session.sys_vars 120 | ax1, p1 = self.subgraphs['total rewards'] 121 | p1.set_ydata(sys_vars['total_rewards_history']) 122 | p1.set_xdata(np.arange(len(p1.get_ydata()))) 123 | ax1.relim() 124 | ax1.autoscale_view(tight=True, scalex=True, scaley=True) 125 | 126 | ax1e, p1e = self.subgraphs['e'] 127 | p1e.set_ydata(sys_vars['explore_history']) 128 | p1e.set_xdata(np.arange(len(p1e.get_ydata()))) 129 | ax1e.relim() 130 | ax1e.autoscale_view(tight=True, scalex=True, 
scaley=True) 131 | 132 | ax2, p2 = self.subgraphs['mean rewards'] 133 | p2.set_ydata(sys_vars['mean_rewards_history']) 134 | p2.set_xdata(np.arange(len(p2.get_ydata()))) 135 | ax2.relim() 136 | ax2.autoscale_view(tight=True, scalex=True, scaley=True) 137 | 138 | ax3, p3 = self.subgraphs['loss'] 139 | p3.set_ydata(sys_vars['loss']) 140 | p3.set_xdata(np.arange(len(p3.get_ydata()))) 141 | ax3.relim() 142 | ax3.autoscale_view(tight=True, scalex=True, scaley=True) 143 | 144 | self.plt.draw() 145 | self.plt.pause(0.01) 146 | self.save() 147 | import gc 148 | gc.collect() 149 | 150 | def save(self): 151 | '''save graph to filename''' 152 | self.figure.savefig(self.graph_filename) 153 | 154 | def clear(self): 155 | if environ.get('CI'): 156 | return 157 | self.plt.close() 158 | del_self_attr(self) 159 | 160 | 161 | def calc_stability(sys_vars): 162 | ''' 163 | calculate the stability of a session using its sys_vars 164 | when problem is unsolved (unbounded), use 1 sigma 95% of max 165 | stability 1 = perfectly stable 166 | 0.5 = half-ish unstable 167 | 0 = totally unstable, cannot yield solution 168 | ''' 169 | total_r_history = sys_vars['total_rewards_history'] 170 | if sys_vars['SOLVED_MEAN_REWARD'] is None: 171 | min_rewards = min(total_r_history) 172 | max_rewards = max(total_r_history) 173 | rewards_gap = max_rewards - min_rewards 174 | r_threshold = max_rewards - (0.10 * rewards_gap) 175 | else: 176 | r_threshold = sys_vars['SOLVED_MEAN_REWARD'] 177 | # find index i.e. epi of first solved 178 | first_solved_epi = next( 179 | (idx for idx, total_r in enumerate(total_r_history) 180 | if total_r > r_threshold), None) 181 | last_epi = sys_vars['epi'] 182 | stable_epi_count = len([ 183 | total_r for total_r in total_r_history if total_r > r_threshold]) 184 | 185 | if (first_solved_epi is None) or (last_epi == first_solved_epi): 186 | mastery_gap = np.inf 187 | else: # get max if mastery_gap is smaller (faster) than needed - perfect 188 | mastery_gap = last_epi - first_solved_epi 189 | stability = stable_epi_count / mastery_gap 190 | return stability 191 | 192 | 193 | def fitness_score(stats): 194 | ''' 195 | calculate the fitness score (see doc Metrics for more) 196 | 1. solution rewards 197 | 2. solving speed: /epi 198 | 3. stability 199 | 4. consistency 200 | 5. granularity 201 | 6. amplification of good results 202 | 7. 
distinguishability 203 | ''' 204 | mean_rewards_per_epi = stats['mean_rewards_per_epi_stats']['mean'] 205 | stability = stats['stability_stats']['mean'] 206 | consistency = stats['solved_ratio_of_sessions'] 207 | amplifier = (1+stability)*((1+consistency)**2) 208 | distinguisher = amplifier ** np.sign(mean_rewards_per_epi) 209 | fitness = mean_rewards_per_epi * distinguisher 210 | return fitness 211 | 212 | 213 | def ideal_fitness_score(problem): 214 | ''' 215 | calculate the ideal fitness_score with perfect solved ratio 216 | for hyperparameter optimization to select 217 | ''' 218 | if problem['SOLVED_MEAN_REWARD'] is None: 219 | return np.inf # for unsolved environments 220 | solved_mean_reward = problem['SOLVED_MEAN_REWARD'] 221 | max_episodes = problem['MAX_EPISODES'] 222 | solved_epi_speedup = 3 223 | ideal_epi = max_episodes / solved_epi_speedup 224 | ideal_mean_rewards_per_epi = solved_mean_reward / ideal_epi 225 | ideal_stability = 1 226 | ideal_consistency = 1 227 | amplifier = (1+ideal_stability)*((1+ideal_consistency)**2) 228 | distinguisher = amplifier ** np.sign(ideal_mean_rewards_per_epi) 229 | ideal_fitness = ideal_mean_rewards_per_epi * distinguisher 230 | return ideal_fitness 231 | 232 | 233 | def basic_stats(array): 234 | '''generate the basic stats for a numerical array''' 235 | if not len(array): 236 | return None 237 | return { 238 | 'min': np.min(array).astype(float), 239 | 'max': np.max(array).astype(float), 240 | 'mean': np.mean(array).astype(float), 241 | 'std': np.std(array).astype(float), 242 | } 243 | 244 | 245 | def compose_data(trial): 246 | ''' 247 | compose raw data from an trial object 248 | into useful summary and full metrics for analysis 249 | ''' 250 | sys_vars_array = trial.data['sys_vars_array'] 251 | 252 | # collect all data from sys_vars_array 253 | solved_sys_vars_array = list(filter( 254 | lambda sv: sv['solved'], sys_vars_array)) 255 | errored_array = list(map( 256 | lambda sv: sv['errored'], sys_vars_array)) 257 | mean_rewards_array = np.array(list(map( 258 | lambda sv: sv['mean_rewards'], sys_vars_array))) 259 | max_total_rewards_array = np.array(list(map( 260 | lambda sv: np.max(sv['total_rewards_history']), sys_vars_array))) 261 | epi_array = np.array(list(map(lambda sv: sv['epi'], sys_vars_array))) 262 | mean_rewards_per_epi_array = np.divide(mean_rewards_array, epi_array + 1) 263 | stability_array = list(map(calc_stability, sys_vars_array)) 264 | t_array = np.array(list(map(lambda sv: sv['t'], sys_vars_array))) 265 | time_taken_array = np.array(list(map( 266 | lambda sv: timestamp_elapse_to_seconds(sv['time_taken']), 267 | sys_vars_array))) 268 | solved_epi_array = np.array(list(map( 269 | lambda sv: sv['epi'], solved_sys_vars_array))) 270 | solved_t_array = np.array(list(map( 271 | lambda sv: sv['t'], solved_sys_vars_array))) 272 | solved_time_taken_array = np.array(list(map( 273 | lambda sv: timestamp_elapse_to_seconds(sv['time_taken']), 274 | solved_sys_vars_array))) 275 | best_idx = list(mean_rewards_per_epi_array).index( 276 | max(mean_rewards_per_epi_array)) 277 | best_session_id = '{}_s{}'.format(trial.data['trial_id'], best_idx) 278 | 279 | # compose sys_vars stats 280 | stats = { 281 | 'best_session_epi': epi_array.tolist()[best_idx], 282 | 'best_session_id': best_session_id, 283 | 'best_session_mean_rewards': mean_rewards_array[best_idx], 284 | 'best_session_stability': stability_array[best_idx], 285 | 'errored': any(errored_array), 286 | 'epi_stats': basic_stats(epi_array), 287 | 'max_total_rewards_stats': 
basic_stats(max_total_rewards_array), 288 | 'mean_rewards_stats': basic_stats(mean_rewards_array), 289 | 'mean_rewards_per_epi_stats': basic_stats( 290 | mean_rewards_per_epi_array), 291 | 'num_of_sessions': len(sys_vars_array), 292 | 'solved_epi_stats': basic_stats(solved_epi_array), 293 | 'solved_num_of_sessions': len(solved_sys_vars_array), 294 | 'solved_ratio_of_sessions': float(len( 295 | solved_sys_vars_array)) / trial.times, 296 | 'solved_t_stats': basic_stats(solved_t_array), 297 | 'solved_time_taken_stats': basic_stats(solved_time_taken_array), 298 | 'stability_stats': basic_stats(stability_array), 299 | 't_stats': basic_stats(t_array), 300 | 'time_taken_stats': basic_stats(time_taken_array), 301 | } 302 | stats.update({ 303 | 'fitness_score': fitness_score(stats) 304 | }) 305 | 306 | # summary metrics picked from stats 307 | metrics = { 308 | 'best_session_epi': stats['best_session_epi'], 309 | 'best_session_id': stats['best_session_id'], 310 | 'best_session_mean_rewards': stats['best_session_mean_rewards'], 311 | 'best_session_stability': stats['best_session_stability'], 312 | 'fitness_score': stats['fitness_score'], 313 | 'mean_rewards_per_epi_stats_mean': stats[ 314 | 'mean_rewards_per_epi_stats']['mean'], 315 | 'solved_ratio_of_sessions': stats['solved_ratio_of_sessions'], 316 | 't_stats_mean': stats['t_stats']['mean'], 317 | } 318 | 319 | # param variables for independent vars of trials 320 | default_param = trial.experiment_spec['param'] 321 | param_variables = { 322 | pv: default_param[pv] for 323 | pv in trial.param_variables if pv in default_param} 324 | 325 | trial.data['metrics'].update(metrics) 326 | trial.data['param_variables'] = param_variables 327 | trial.data['stats'] = stats 328 | return trial.data 329 | 330 | 331 | # order a unique df categorical data for plotting 332 | def order_category(uniq_df): 333 | uniq_list = list(uniq_df) 334 | try: 335 | uniq_dict = {k: json.loads(k) for k in uniq_list} 336 | sorted_pair = sorted(uniq_dict.items(), key=lambda x: x[1]) 337 | return [pair[0] for pair in sorted_pair] 338 | except (json.JSONDecodeError, TypeError): 339 | return list(sorted(uniq_list)) 340 | 341 | 342 | # plot the experiment data from data_df 343 | # X are columns with name starting with 'variable_' 344 | # Y cols are defined below 345 | def plot_experiment(data_df, trial_id): 346 | if len(data_df) < 2: # no multi selection 347 | return 348 | (_mpl, _plt, sns) = scoped_mpl_import() 349 | experiment_id = parse_experiment_id(trial_id) 350 | hue = 'solved_ratio_of_sessions' 351 | data_df = data_df.sort_values(hue) 352 | fitness_hue = 'fitness_score_bin' 353 | data_df[fitness_hue] = pd.cut(data_df['fitness_score'], bins=5) 354 | X_cols = list(filter(lambda c: c.startswith('variable_'), data_df.columns)) 355 | col_size = len(X_cols) 356 | row_size = len(EXPERIMENT_DATA_Y_COLS) 357 | groups = data_df.groupby(hue) 358 | 359 | # for main grid plot 360 | sns_only = True 361 | big_fig, axes = sns.plt.subplots( 362 | row_size, col_size, figsize=(col_size*4, row_size*3), 363 | sharex='col', sharey='row') 364 | for ix, x in enumerate(X_cols): 365 | for iy, y in enumerate(EXPERIMENT_DATA_Y_COLS): 366 | big_ax = axes[iy] if col_size == 1 else axes[iy][ix] 367 | uniq_df = data_df[x].unique() 368 | if (data_df[x].dtype.name == 'category' or 369 | len(uniq_df) <= 5): 370 | order = order_category(uniq_df) 371 | sns.swarmplot( 372 | data=data_df, x=x, y=y, hue=hue, size=3, 373 | order=order, ax=big_ax) 374 | else: 375 | sns_only = False 376 | big_ax.margins(0.05) 377 | 
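                # [Editor's note] this else branch is the fallback for continuous
                # or high-cardinality X columns, where the swarmplot above is not
                # meaningful: it draws one marker-only scatter series per
                # solved_ratio_of_sessions group (from the groupby above), and the
                # per-axis legends are cleared in favour of the shared legend
                # added after the grid loop.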
big_ax.xaxis.grid(False) 378 | for _, group in groups: 379 | big_ax.plot(group[x], group[y], label=hue, 380 | marker='o', ms=3, linestyle='') 381 | big_ax.set_xlabel(x) 382 | big_ax.set_ylabel(y) 383 | 384 | big_ax.legend_ = None # set common legend below 385 | # label only left and bottom axes 386 | if iy != row_size - 1: 387 | big_ax.set_xlabel('') 388 | if ix != 0: 389 | big_ax.set_ylabel('') 390 | 391 | big_fig.tight_layout() 392 | big_fig.suptitle(wrap_text(experiment_id)) 393 | legend_labels = None if sns_only else sorted(data_df[hue].unique()) 394 | legend_ms = 0.5 if sns_only else 1 395 | legend = sns.plt.legend(title='solved_ratio_of_sessions', 396 | labels=legend_labels, markerscale=legend_ms, 397 | fontsize=10, loc='center right', 398 | bbox_to_anchor=(1.1+col_size*0.1, row_size+0.1)) 399 | legend.get_title().set_fontsize('10') 400 | big_fig.subplots_adjust(top=0.96, right=0.9) 401 | 402 | filename = './data/{0}/{0}_analysis.png'.format( 403 | experiment_id) 404 | big_fig.savefig(filename) 405 | big_fig.clear() 406 | 407 | # use numerical, since contour only makes sense for ordered azes 408 | numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] 409 | numeric_X_cols = list( 410 | filter(lambda x: data_df[x].dtype in numerics, X_cols)) 411 | with sns.axes_style('white', {'axes.linewidth': 0.2}): 412 | g = sns.pairplot( 413 | data_df, vars=numeric_X_cols, hue=fitness_hue, 414 | size=3, aspect=1, plot_kws={'s': 50, 'alpha': 0.5}) 415 | g.fig.suptitle(wrap_text(experiment_id)) 416 | g = g.add_legend() 417 | filename = './data/{0}/{0}_analysis_correlation.png'.format( 418 | experiment_id) 419 | g.savefig(filename) 420 | g.fig.clear() 421 | 422 | sns.plt.close() 423 | 424 | 425 | def analyze_data(experiment_data_or_experiment_id): 426 | ''' 427 | get all the data from all trials.run() 428 | or read from all data files matching the prefix of trial_id 429 | e.g. 
usage without running: 430 | experiment_id = 'DevCartPole-v0_DQN_LinearMemoryWithForgetting_BoltzmannPolicy_2017-01-15_142810' 431 | analyze_data(experiment_id) 432 | ''' 433 | if isinstance(experiment_data_or_experiment_id, str): 434 | experiment_data = load_data_array_from_experiment_id( 435 | experiment_data_or_experiment_id) 436 | else: 437 | experiment_data = experiment_data_or_experiment_id 438 | 439 | stats_array, param_variables_array = [], [] 440 | for data in experiment_data: 441 | stats = flatten_dict(data['stats']) 442 | stats.update({'trial_id': data['trial_id']}) 443 | param_variables = flat_cast_dict(data['param_variables']) 444 | if stats['errored']: # remove errored trials 445 | continue 446 | stats_array.append(stats) 447 | param_variables_array.append(param_variables) 448 | 449 | raw_stats_df = pd.DataFrame.from_dict(stats_array) 450 | stats_df = raw_stats_df[STATS_COLS] 451 | 452 | param_variables_df = pd.DataFrame.from_dict(param_variables_array) 453 | param_variables_df.columns = [ 454 | 'variable_'+c for c in param_variables_df.columns] 455 | 456 | data_df = pd.concat([stats_df, param_variables_df], axis=1) 457 | for c in data_df.columns: 458 | if data_df[c].dtype == object: # guard 459 | data_df[c] = data_df[c].astype('category') 460 | 461 | data_df.sort_values( 462 | ['fitness_score'], ascending=False, inplace=True) 463 | data_df.reset_index(drop=True, inplace=True) 464 | 465 | trial_id = experiment_data[0]['trial_id'] 466 | save_experiment_data(data_df, trial_id) 467 | plot_experiment(data_df, trial_id) 468 | return data_df 469 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/base_hyperoptimizer.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import multiprocessing as mp 3 | import os 4 | import time 5 | from collections import OrderedDict 6 | from rl.util import logger, timestamp, PARALLEL_PROCESS_NUM, debug_mem_usage 7 | 8 | 9 | class HyperOptimizer(object): 10 | 11 | ''' 12 | The base class of hyperparam optimizer, with core methods 13 | read about it on the documentation 14 | input: Trial (and some specs), param space P (as standardized specs) 15 | Algo: 16 | 1. search the next p in P using its internal search algo, 17 | add to its internal `param_search_list` 18 | 2. run a (slow) function Trial(p) = score (inside trial data) 19 | 3. update search using feedback score 20 | 4. 
repeat till max steps or fitness condition met 21 | 22 | it will be ran by the experiment as: 23 | hyperopt = HyperOptimizer(Trial, **experiment_kwargs) 24 | experiment_data = hyperopt.run() 25 | ''' 26 | 27 | def __init__(self, Trial, **kwargs): 28 | self.Trial = Trial 29 | self.REQUIRED_ARGS = [ 30 | 'experiment_spec', 31 | 'experiment_id_override', 32 | 'times' 33 | ] 34 | self.PARALLEL_PROCESS_NUM = PARALLEL_PROCESS_NUM 35 | self.free_cpu = self.PARALLEL_PROCESS_NUM # for parallel run 36 | logger.info('Initialize {}'.format(self.__class__.__name__)) 37 | self.set_keys(**kwargs) 38 | self.init_search() 39 | 40 | def set_keys(self, **kwargs): 41 | assert all(k in kwargs for k in self.REQUIRED_ARGS), \ 42 | 'kwargs do not have all REQUIRED_ARGS' 43 | for k in kwargs: 44 | setattr(self, k, kwargs[k]) 45 | 46 | self.experiment_name = self.experiment_spec.get('experiment_name') 47 | self.run_timestamp = timestamp() 48 | self.experiment_id = self.experiment_id_override or '{}-{}'.format( 49 | self.experiment_name, self.run_timestamp) 50 | self.experiment_data = [] 51 | self.param_search_list = [] 52 | # the index of next param to try in param_search_list 53 | self.next_trial_num = len(self.param_search_list) 54 | 55 | self.default_param = self.experiment_spec['param'] 56 | unordered_param_range = self.experiment_spec['param_range'] 57 | # import ordering for param_range for search serialization 58 | self.param_range = OrderedDict(sorted(unordered_param_range.items())) 59 | self.param_range_keys = sorted(self.param_range.keys()) 60 | 61 | def compose_experiment_spec(self, param): 62 | new_experiment_spec = copy.deepcopy(self.experiment_spec) 63 | new_experiment_spec.pop('param_range', None) 64 | new_experiment_spec.update({ 65 | 'param': param, 66 | }) 67 | return new_experiment_spec 68 | 69 | def init_search(self): 70 | '''initialize the search algo and the search space''' 71 | raise NotImplementedError() 72 | 73 | def search(self): 74 | ''' 75 | algo step 1, search and return the next p for Trial(p), 76 | Its only job is to append to (or modify) 77 | its internal self.param_search_list using its search logic 78 | It may refer to self.experiment_data as search memory 79 | and whatever new pointer or special memory implemented by a HyperOptimizer class 80 | ''' 81 | raise NotImplementedError() 82 | 83 | def next_param(self): 84 | '''retrieve trial_num and param, advance the class next_trial_num''' 85 | assert self.next_trial_num < len(self.param_search_list), \ 86 | 'param_search_list expansion cannot keep up with next_trial_num' 87 | trial_num = self.next_trial_num 88 | param = self.param_search_list[self.next_trial_num] 89 | self.next_trial_num = self.next_trial_num + 1 90 | return (trial_num, param) 91 | 92 | def run_trial(self, trial_num, param): 93 | ''' 94 | algo step 2, construct and run Trial with the next param 95 | args trial_num, param must be provided externally, 96 | otherwise they will not progress within mp.process 97 | ''' 98 | experiment_spec = self.compose_experiment_spec(param) 99 | trial = self.Trial( 100 | experiment_spec, trial_num=trial_num, 101 | times=self.times, 102 | num_of_trials=self.num_of_trials, 103 | run_timestamp=self.run_timestamp, 104 | experiment_id_override=self.experiment_id_override) 105 | trial_data = trial.run() 106 | del trial 107 | import gc 108 | gc.collect() 109 | debug_mem_usage() 110 | return trial_data 111 | 112 | # retrieve the trial_num, param, fitness_score from trial_data 113 | @classmethod 114 | def get_fitness(cls, trial_data): 115 | 
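        # [Editor's note -- illustrative sketch, not part of the original file]
        # Stripped of the multiprocessing pool, run() below is essentially this
        # serial loop (using only this class's own methods):
        #
        #   while not self.to_terminate():
        #       self.search()                                   # step 1: extend param_search_list
        #       trial_num, param = self.next_param()
        #       trial_data = self.run_trial(trial_num, param)   # step 2: slow Trial(p)
        #       self.post_search(trial_data)                    # step 3: record + update_search()
        #
        # get_fitness() itself just unpacks a finished trial: the trailing token
        # of trial_id is taken as the trial number, and fitness_score comes from
        # the trial's computed metrics.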
trial_id = trial_data['trial_id'] 116 | trial_num = trial_id.split('_').pop() 117 | param = trial_data['experiment_spec']['param'] 118 | metrics = trial_data['metrics'] 119 | fitness_score = metrics['fitness_score'] 120 | return trial_num, param, fitness_score 121 | 122 | def update_search(self): 123 | '''algo step 3, update search algo using self.experiment_data''' 124 | raise NotImplementedError() 125 | 126 | def to_terminate(self): 127 | '''algo step 4, terminate when at max steps or fitness condition met''' 128 | raise NotImplementedError() 129 | 130 | # handler task after a search is complete from multiprocessing pool 131 | def post_search(self, trial_data): 132 | self.experiment_data.append(trial_data) 133 | self.update_search() 134 | self.free_cpu += 1 135 | 136 | @classmethod 137 | def pool_init(self): 138 | # you can never be too safe in multiprocessing gc 139 | import gc 140 | gc.collect() 141 | 142 | @classmethod 143 | def raise_error(cls, e): 144 | logger.error('Pool worker throws Exception') 145 | print(e.__cause__) 146 | time.sleep(1) 147 | os._exit(1) 148 | 149 | def run(self): 150 | ''' 151 | top level method to run the entire hyperoptimizer 152 | will gather and compose experiment_data, then return it 153 | ''' 154 | logger.info('Run {}'.format(self.__class__.__name__)) 155 | # crucial maxtasksperchild to free up memory by respawning worker 156 | pool = mp.Pool(self.PARALLEL_PROCESS_NUM, 157 | initializer=self.pool_init, maxtasksperchild=1) 158 | while (not self.to_terminate()): 159 | if self.free_cpu > 0: 160 | self.free_cpu -= 1 # update 161 | self.search() # add to self.param_search_list 162 | trial_num, param = self.next_param() 163 | pool.apply_async( 164 | self.run_trial, (trial_num, param), 165 | callback=self.post_search, error_callback=self.raise_error) 166 | else: 167 | pass # keep looping till free_cpu available 168 | time.sleep(0.02) # prevent cpu overwork from while loop 169 | pool.close() 170 | pool.join() 171 | return self.experiment_data 172 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/grid_search.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | from rl.hyperoptimizer.line_search import LineSearch 4 | 5 | 6 | class GridSearch(LineSearch): 7 | 8 | def init_search(self): 9 | ''' 10 | convert a dict of param ranges into 11 | a list of cartesian products of param_range 12 | e.g. 
{'a': [1,2], 'b': [3]} into 13 | [{'a': 1, 'b': 3}, {'a': 2, 'b': 3}] 14 | note that this is order-preserving, as required by design 15 | ''' 16 | range_vals = self.param_range.values() 17 | for vals in itertools.product(*range_vals): 18 | param = copy.deepcopy(self.default_param) 19 | param.update(dict(zip(self.param_range_keys, vals))) 20 | self.param_search_list.append(param) 21 | self.num_of_trials = len(self.param_search_list) 22 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/line_search.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from rl.hyperoptimizer.base_hyperoptimizer import HyperOptimizer 3 | 4 | 5 | class LineSearch(HyperOptimizer): 6 | 7 | def init_search(self): 8 | ''' 9 | convert a dict of param ranges into 10 | a list parameter settings corresponding 11 | to a line search of the param range 12 | for each param 13 | All other parameters set to default vals 14 | note that this is order-preserving, as required by design 15 | ''' 16 | for key in self.param_range_keys: 17 | vals = self.param_range[key] 18 | for val in vals: 19 | param = copy.deepcopy(self.default_param) 20 | param[key] = val 21 | self.param_search_list.append(param) 22 | self.num_of_trials = len(self.param_search_list) 23 | 24 | def search(self): 25 | '''no action needed here for exhaustive trials''' 26 | return 27 | 28 | def update_search(self): 29 | '''no action needed here for exhaustive trials''' 30 | return 31 | 32 | def to_terminate(self): 33 | return not (self.next_trial_num < len(self.param_search_list)) 34 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/random_search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | from rl.analytics import ideal_fitness_score 4 | from rl.hyperoptimizer.base_hyperoptimizer import HyperOptimizer 5 | from rl.util import PROBLEMS, to_json, logger 6 | 7 | 8 | class RandomSearch(HyperOptimizer): 9 | 10 | ''' 11 | Random Search by sampling on hysphere around a search path 12 | algo: 13 | 1. init x a random position in space 14 | 2. until termination (max_eval or fitness, e.g. 
solved all), do: 15 | 2.1 sample new pos some radius away: next_x = x + r 16 | 2.2 if f(next_x) > f(x) then set x = next_x 17 | 18 | Extra search memory units: 19 | - search_path 20 | - best_point 21 | 22 | save for experiment resume, search_history: 23 | - search_path 24 | - best_point 25 | - param_search_list 26 | ''' 27 | 28 | # # calculate the constant radius needed to traverse unit cube 29 | # def cube_traversal_radius(self): 30 | # traversal_diameter = 1/np.power(self.max_evals, 31 | # 1/self.search_dim) 32 | # traversal_radius = traversal_diameter/2 33 | # return traversal_radius 34 | 35 | def decay_search_radius(self): 36 | ''' 37 | start of half cube for diameter (0.25 radius) then decay 38 | at 100 searches, will shrink to 1/10 of initial radius 0.025 39 | clip to prevent going too small (0.01) 40 | ''' 41 | min_radius = 0.01 42 | linear_decay_rate = self.next_trial_num/10./self.PARALLEL_PROCESS_NUM 43 | self.search_radius = np.clip( 44 | self.init_search_radius / linear_decay_rate, 45 | min_radius, self.init_search_radius) 46 | 47 | @classmethod 48 | def sample_hypersphere(cls, dim, r=1): 49 | '''Marsaglia algo for sampling uniformly on a hypersphere''' 50 | v = np.random.randn(dim) 51 | v = v * r / np.linalg.norm(v) 52 | return v 53 | 54 | def sample_cube(self): 55 | return np.random.rand(self.search_dim) 56 | 57 | def sample_r(self): 58 | return self.sample_hypersphere( 59 | self.search_dim, self.search_radius) 60 | 61 | # biject [0, 1] to [x_min, x_max] 62 | def biject_continuous(self, norm_val, x_min, x_max): 63 | return np.around(norm_val*(x_max - x_min) + x_min, self.precision) 64 | 65 | # biject [0, 1] to x_list = [a, b, c, ...] by binning 66 | def biject_discrete(self, norm_val, x_list): 67 | list_len = len(x_list) 68 | inds = np.arange(list_len) 69 | cont_val = self.biject_continuous(norm_val, 0, list_len) 70 | ind = np.digitize(cont_val, inds) - 1 71 | return x_list[ind] 72 | 73 | # biject one dimension: [0, 1] to a param_range val 74 | def biject_dim(self, norm_val, dim_spec): 75 | if isinstance(dim_spec, list): # discrete 76 | return self.biject_discrete(norm_val, dim_spec) 77 | else: # cont 78 | return self.biject_continuous( 79 | norm_val, dim_spec['min'], dim_spec['max']) 80 | return 81 | 82 | # biject a vector on unit cube into a param in param_space 83 | def biject_param(self, v): 84 | param = {} 85 | for i, param_key in enumerate(self.param_range_keys): 86 | dim_spec = self.param_range[param_key] 87 | param[param_key] = self.biject_dim(v[i], dim_spec) 88 | return param 89 | 90 | def init_search(self): 91 | ''' 92 | Initialize the random search internal variables 93 | ''' 94 | self.max_evals = self.experiment_spec['param']['max_evals'] 95 | self.num_of_trials = self.max_evals 96 | self.search_dim = len(self.param_range_keys) 97 | self.precision = 4 # decimal roundoff biject_continuous 98 | self.search_radius = self.init_search_radius = 0.5 99 | self.search_count = 0 # number of times search() has ran 100 | self.search_exhausted = False 101 | self.search_path = [] 102 | self.best_point = { 103 | 'trial_num': None, 104 | 'param': None, 105 | 'x': self.sample_cube(), 106 | 'fitness_score': float('-inf'), 107 | } 108 | problem = PROBLEMS.get(self.experiment_spec['problem']) 109 | self.ideal_fitness_score = ideal_fitness_score(problem) 110 | logger.info( 111 | 'ideal_fitness_scrore: {}'.format(self.ideal_fitness_score)) 112 | 113 | self.filename = './data/{}/random_search_history.json'.format( 114 | self.experiment_id) 115 | if self.experiment_id_override is not 
None: 116 | self.load() # resume 117 | 118 | def search(self): 119 | ''' 120 | algo step 2.1 sample new pos some radius away: next_x = x + r 121 | update search_path and param_search_list 122 | ''' 123 | self.search_count += 1 124 | if self.next_trial_num < len(self.search_path): # resuming 125 | next_x = self.search_path[self.next_trial_num] 126 | next_param = self.param_search_list[self.next_trial_num] 127 | else: 128 | next_x = np.clip(self.best_point['x'] + self.sample_r(), 0., 1.) 129 | # check if too close to previously searched x 130 | distances = [np.linalg.norm(next_x - old_x) 131 | for old_x in self.search_path] 132 | distances = np.around(distances, self.precision) 133 | 134 | if self.search_count > (10 * self.max_evals): 135 | self.search_exhausted = True # exhausted search space 136 | next_param = self.biject_param(next_x) 137 | self.search_path.append(next_x) 138 | self.param_search_list.append(next_param) 139 | elif len(distances) > 0 and np.amin(distances) == 0: 140 | self.search() 141 | else: 142 | next_param = self.biject_param(next_x) 143 | self.search_path.append(next_x) 144 | self.param_search_list.append(next_param) 145 | 146 | def update_search(self): 147 | ''' 148 | algo step 2.2 if f(next_x) > f(x) then set x = next_x 149 | invoked right after the latest run_trial() 150 | update self.best_point 151 | ''' 152 | if (self.next_trial_num < self.PARALLEL_PROCESS_NUM or 153 | self.next_trial_num < len(self.search_path)): 154 | # yet to have history or still resuming from history 155 | return 156 | assert len(self.experiment_data) > 0, \ 157 | 'self.experiment_data must not be empty for update_search' 158 | 159 | self.decay_search_radius() 160 | 161 | x = self.search_path[-1] 162 | trial_data = self.experiment_data[-1] 163 | trial_num, param, fitness_score = self.get_fitness(trial_data) 164 | if fitness_score > self.best_point['fitness_score']: 165 | self.best_point = { 166 | 'trial_num': trial_num, 167 | 'param': param, 168 | 'x': x, 169 | 'fitness_score': fitness_score, 170 | } 171 | self.save() 172 | 173 | def save(self): 174 | search_history = { 175 | 'search_path': self.search_path, 176 | 'search_count': self.search_count, 177 | 'best_point': self.best_point, 178 | 'param_search_list': self.param_search_list, 179 | } 180 | with open(self.filename, 'w') as f: 181 | f.write(to_json(search_history)) 182 | logger.info( 183 | 'Save search history to {}'.format(self.filename)) 184 | return 185 | 186 | def load(self): 187 | try: 188 | search_history = json.loads(open(self.filename).read()) 189 | self.search_path = search_history['search_path'] 190 | self.best_point = search_history['best_point'] 191 | self.param_search_list = search_history['param_search_list'] 192 | logger.info('Load search history from {}'.format(self.filename)) 193 | except (FileNotFoundError, json.JSONDecodeError): 194 | logger.info( 195 | 'Fail to load search history from {}'.format(self.filename)) 196 | return None 197 | 198 | def satisfy_fitness(self): 199 | ''' 200 | break on the first strong solution 201 | ''' 202 | best_fitness_score = self.best_point['fitness_score'] 203 | if self.next_trial_num < self.PARALLEL_PROCESS_NUM: 204 | return False 205 | elif best_fitness_score > self.ideal_fitness_score: 206 | logger.info( 207 | 'fitness_score {} > ideal_fitness_score {}, ' 208 | 'could terminate early'.format( 209 | best_fitness_score, self.ideal_fitness_score)) 210 | # return True 211 | # TODO fix ideal_fitness_score 212 | return False 213 | else: 214 | return False 215 | 216 | def 
to_terminate(self): 217 | return (self.search_exhausted or 218 | self.next_trial_num >= self.max_evals or 219 | self.satisfy_fitness()) 220 | -------------------------------------------------------------------------------- /rl/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/memory/base_memory.py: -------------------------------------------------------------------------------- 1 | class Memory(object): 2 | 3 | ''' 4 | The base class of Memory, with the core methods 5 | ''' 6 | 7 | def __init__(self, env_spec, **kwargs): # absorb generic param without breaking 8 | '''Construct externally, and set at Agent.compile()''' 9 | self.env_spec = env_spec 10 | self.agent = None 11 | self.state = None 12 | 13 | def reset_state(self, init_state): 14 | '''reset the state of LinearMemory per episode env.reset()''' 15 | self.state = init_state 16 | 17 | def add_exp(self, action, reward, next_state, terminal): 18 | '''add an experience''' 19 | raise NotImplementedError() 20 | 21 | def get_exp(self, inds): 22 | '''get a batch of experiences by indices''' 23 | raise NotImplementedError() 24 | 25 | def pop(self): 26 | '''get the last experience (batched like get_exp()''' 27 | raise NotImplementedError() 28 | 29 | def size(self): 30 | '''get a batch of experiences by indices''' 31 | raise NotImplementedError() 32 | 33 | def rand_minibatch(self, size): 34 | '''get a batch of experiences by indices''' 35 | raise NotImplementedError() 36 | 37 | def update(self, updates): 38 | '''update elements of the memory as requires''' 39 | raise NotImplementedError() 40 | -------------------------------------------------------------------------------- /rl/memory/linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.memory.base_memory import Memory 3 | from rl.util import log_self 4 | from scipy.stats import halfnorm 5 | 6 | 7 | class LinearMemory(Memory): 8 | 9 | ''' 10 | The replay memory used for random minibatch training 11 | ''' 12 | 13 | # absorb generic param without breaking 14 | def __init__(self, env_spec, **kwargs): 15 | super(LinearMemory, self).__init__(env_spec) 16 | self.exp_keys = [ 17 | 'states', 'actions', 'rewards', 'next_states', 'terminals'] 18 | self.exp = {k: [] for k in self.exp_keys} 19 | log_self(self) 20 | 21 | def encode_action(self, action): 22 | '''encode action based on continuous/discrete before adding''' 23 | if self.agent.env_spec['actions'] == 'continuous': 24 | return action 25 | else: # do one-hot encoding 26 | action_arr = np.zeros(self.agent.env_spec['action_dim']) 27 | action_arr[action] = 1 28 | return action_arr 29 | 30 | def add_exp(self, action, reward, next_state, terminal): 31 | ''' 32 | after the env.step(a) that returns s', r, 33 | using the previously stored state for the s, 34 | form an experience tuple 35 | ''' 36 | self.exp['states'].append(self.state) 37 | self.exp['actions'].append(self.encode_action(action)) 38 | self.exp['rewards'].append(reward) 39 | self.exp['next_states'].append(next_state) 40 | self.exp['terminals'].append(int(terminal)) 41 | self.state = next_state 42 | 43 | def _get_exp(self, exp_name, inds): 44 | return np.array([self.exp[exp_name][i] for i in inds]) 45 | 46 | def get_exp(self, inds): 47 | return {k: 
self._get_exp(k, inds) for k in self.exp_keys} 48 | 49 | def pop(self): 50 | '''convenient method to get exp at [last_ind]''' 51 | assert self.size() > 0, 'memory is empty, cannot pop' 52 | return self.get_exp([self.size() - 1]) 53 | 54 | def size(self): 55 | return len(self.exp['rewards']) 56 | 57 | def rand_minibatch(self, size): 58 | '''plain random sampling''' 59 | memory_size = self.size() 60 | rand_inds = np.random.randint(memory_size, size=size) 61 | minibatch = self.get_exp(rand_inds) 62 | return minibatch 63 | 64 | def update(self, updates): 65 | pass 66 | 67 | 68 | class LinearMemoryWithForgetting(LinearMemory): 69 | 70 | ''' 71 | Linear memory with uniform sampling, retaining last 50k experiences 72 | ''' 73 | 74 | def __init__(self, env_spec, max_mem_len=50000, 75 | **kwargs): # absorb generic param without breaking 76 | super(LinearMemoryWithForgetting, self).__init__(env_spec) 77 | self.max_mem_len = max_mem_len 78 | 79 | def trim_exp(self): 80 | '''The forgetting mechanism''' 81 | if (self.size() > self.max_mem_len): 82 | for k in self.exp_keys: 83 | del self.exp[k][0] 84 | 85 | def add_exp(self, action, reward, next_state, terminal): 86 | ''' 87 | add exp as usual, but preserve only the recent episodes 88 | ''' 89 | super(LinearMemoryWithForgetting, self).add_exp( 90 | action, reward, next_state, terminal) 91 | self.trim_exp() 92 | 93 | 94 | class LeftTailMemory(LinearMemory): 95 | 96 | ''' 97 | Memory with sampling via a left-tail distribution 98 | ''' 99 | 100 | def rand_minibatch(self, size): 101 | ''' 102 | get a minibatch of random exp for training 103 | use simple memory decay, i.e. sample with a left tail 104 | distribution to draw more from latest memory 105 | then append with the most recent, untrained experience 106 | ''' 107 | memory_size = self.size() 108 | new_exp_size = self.agent.train_per_n_new_exp 109 | if memory_size <= size or memory_size <= new_exp_size: 110 | inds = np.random.randint(memory_size, size=size) 111 | else: 112 | new_memory_ind = max(0, memory_size - new_exp_size) 113 | old_memory_ind = max(0, new_memory_ind - 1) 114 | latest_inds = np.arange(new_memory_ind, memory_size) 115 | random_batch_size = size - new_exp_size 116 | rand_inds = (old_memory_ind - halfnorm.rvs( 117 | size=random_batch_size, 118 | scale=float(old_memory_ind)*0.80).astype(int)) 119 | inds = np.concatenate([rand_inds, latest_inds]).clip(0) 120 | minibatch = self.get_exp(inds) 121 | return minibatch 122 | -------------------------------------------------------------------------------- /rl/memory/prioritized_exp_replay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.memory.linear import LinearMemoryWithForgetting 3 | 4 | 5 | class PrioritizedExperienceReplay(LinearMemoryWithForgetting): 6 | 7 | ''' 8 | Replay memory with random sampling weighted by the absolute 9 | size of the value function error 10 | 11 | Adapted from https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py 12 | memory unit 13 | ''' 14 | 15 | def __init__(self, env_spec, max_mem_len=None, e=0.01, alpha=0.6, 16 | **kwargs): 17 | if max_mem_len is None: # auto calculate mem len 18 | max_timestep = env_spec['timestep_limit'] 19 | max_epis = env_spec['problem']['MAX_EPISODES'] 20 | memory_epi = np.ceil(max_epis / 3.).astype(int) 21 | max_mem_len = max(10**6, max_timestep * memory_epi) 22 | super(PrioritizedExperienceReplay, self).__init__( 23 | env_spec, max_mem_len) 24 | self.exp_keys.append('error') 25 | self.exp = {k: [] for k 
in self.exp_keys} # reinit with added mem key 26 | # Prevents experiences with error of 0 from being replayed 27 | self.e = e 28 | # Controls how spiked the distribution is. alpha = 0 means uniform 29 | self.alpha = alpha 30 | self.curr_data_inds = None 31 | self.curr_tree_inds = None 32 | self.prio_tree = SumTree(self.max_mem_len) 33 | self.head = 0 34 | 35 | def get_priority(self, error): 36 | # add min_priority to prevent root of negative = complex 37 | p = (error + self.e) ** self.alpha 38 | assert np.isfinite(p) 39 | return p 40 | 41 | def add_exp(self, action, reward, next_state, terminal): 42 | '''Round robin memory updating''' 43 | # init error to reward first, update later 44 | error = abs(reward) 45 | p = self.get_priority(error) 46 | 47 | if self.size() < self.max_mem_len: # add as usual 48 | super(PrioritizedExperienceReplay, self).add_exp( 49 | action, reward, next_state, terminal) 50 | self.exp['error'].append(error) 51 | else: # replace round robin 52 | self.exp['states'][self.head] = self.state 53 | self.exp['actions'][self.head] = self.encode_action(action) 54 | self.exp['rewards'][self.head] = reward 55 | self.exp['next_states'][self.head] = next_state 56 | self.exp['terminals'][self.head] = int(terminal) 57 | self.exp['error'][self.head] = error 58 | self.state = next_state 59 | 60 | self.head += 1 61 | if self.head >= self.max_mem_len: 62 | self.head = 0 # reset for round robin 63 | 64 | self.prio_tree.add(p) 65 | 66 | assert self.head == self.prio_tree.head, 'prio_tree head is wrong' 67 | 68 | def rand_minibatch(self, size): 69 | '''random sampling weighted by priority''' 70 | self.curr_tree_inds, self.curr_data_inds = self.select_prio_inds(size) 71 | minibatch = self.get_exp(self.curr_data_inds) 72 | return minibatch 73 | 74 | def select_prio_inds(self, size): 75 | tree_inds = [] 76 | data_inds = [] 77 | segment = self.prio_tree.total() / size 78 | 79 | for i in range(size): 80 | a = segment * i 81 | b = segment * (i + 1) 82 | 83 | s = np.random.uniform(a, b) 84 | t_idx, d_idx = self.prio_tree.get(s) 85 | tree_inds.append(t_idx) 86 | data_inds.append(d_idx) 87 | 88 | return tree_inds, data_inds 89 | 90 | def update(self, updates): 91 | for i, u in enumerate(updates): 92 | t_idx = self.curr_tree_inds[i] 93 | d_idx = self.curr_data_inds[i] 94 | p = self.get_priority(u) 95 | self.prio_tree.update(t_idx, p) 96 | self.exp['error'][d_idx] = u 97 | 98 | 99 | class SumTree(object): 100 | 101 | ''' 102 | Adapted from https://github.com/jaara/AI-blog/blob/master/SumTree.py 103 | See https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/ 104 | for a good introduction to PER 105 | ''' 106 | 107 | def __init__(self, capacity): 108 | self.capacity = capacity 109 | self.tree = np.zeros(2*capacity - 1) 110 | self.head = 0 111 | 112 | def _propagate(self, idx, change): 113 | parent = (idx - 1) // 2 114 | self.tree[parent] += change 115 | if parent != 0: 116 | self._propagate(parent, change) 117 | 118 | def _retrieve(self, idx, s): 119 | left = 2 * idx + 1 120 | right = left + 1 121 | 122 | if left >= len(self.tree): 123 | return idx 124 | 125 | if s <= self.tree[left]: 126 | return self._retrieve(left, s) 127 | else: 128 | return self._retrieve(right, s-self.tree[left]) 129 | 130 | def total(self): 131 | return self.tree[0] 132 | 133 | def add(self, p): 134 | idx = self.head + self.capacity - 1 135 | self.update(idx, p) 136 | self.head += 1 137 | if self.head >= self.capacity: 138 | self.head = 0 139 | 140 | def update(self, idx, p): 141 | 
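        # [Editor's note -- illustrative sketch, not part of the original file]
        # update() and get() give O(log capacity) proportional sampling: leaves
        # hold priorities, internal nodes hold subtree sums, and _retrieve()
        # descends by comparing s against the left-subtree sum, so a uniform
        # s in [0, total()) selects leaf i with probability p_i / total().
        # A tiny run with toy priorities (assumed values, not Lab defaults):
        #
        #   tree = SumTree(4)
        #   for p in (1., 2., 3., 4.):    # priorities of 4 stored experiences
        #       tree.add(p)
        #   tree.total()                  # -> 10.0
        #   tree.get(0.5)                 # -> (tree index 3, data index 0), the p=1. leaf
        #   tree.get(9.5)                 # -> (tree index 6, data index 3), the p=4. leaf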
change = p - self.tree[idx] 142 | self.tree[idx] = p 143 | self._propagate(idx, change) 144 | 145 | def get(self, s): 146 | idx = self._retrieve(0, s) 147 | data_idx = idx - self.capacity + 1 148 | return idx, data_idx 149 | -------------------------------------------------------------------------------- /rl/memory/ranked.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.memory.linear import LinearMemory 3 | from rl.util import log_self 4 | import math 5 | 6 | 7 | class HighLowMemory(LinearMemory): 8 | 9 | ''' 10 | Memory divided into two: good and bad experiences 11 | As with RankedMemory experiences are grouped episodically 12 | Episodes with a total reward > threshold are assigned to good memory 13 | The threshold is recomputed every n episodes and 14 | episodes are reassigned accordingly. 15 | Memories are sampled from good experiences with a self.prob_high 16 | Memories are sampled from bad experiences with a 1 - self.prob_high 17 | Experiences are sampled from a maximum of 3 randomly selected episodes, 18 | per minibatch for each of the high and low memories 19 | TODO improvement: do a more natural continuous range to sort high low 20 | by self.epi_memory.sort(key=lambda epi_exp: epi_exp['total_rewards']) 21 | ''' 22 | 23 | # absorb generic param without breaking 24 | def __init__(self, env_spec, **kwargs): 25 | super(HighLowMemory, self).__init__(env_spec) 26 | # use the old self.exp as buffer, remember to clear 27 | self.last_exp = self.exp 28 | self.epi_memory_high = [] 29 | self.epi_memory_low = [] 30 | self.max_reward = -math.inf 31 | self.min_reward = math.inf 32 | # 1st 5 epis goes into bad half, recompute every 5 epis 33 | self.threshold = math.inf 34 | self.threshold_history = [] 35 | self.epi_num = 0 36 | self.prob_high = 0.66 37 | self.num_epis_to_sample = 3 38 | self.max_epis_in_mem = 15 39 | self.recompute_freq = 10 40 | log_self(self) 41 | 42 | def reassign_episodes(self): 43 | new_high, new_low = [] 44 | 45 | for mem in (self.epi_memory_high, self.epi_memory_low): 46 | for epi_exp in mem: 47 | if (epi_exp['total_rewards'] > self.threshold): 48 | new_high.append(epi_exp) 49 | else: 50 | new_low.append(epi_exp) 51 | 52 | self.epi_memory_high = new_high 53 | self.epi_memory_low = new_low 54 | 55 | def compute_threshold(self): 56 | self.threshold_history.append([self.threshold, 57 | self.max_reward, 58 | self.min_reward]) 59 | if (len(self.threshold_history) > 1): 60 | # Scaled because this threshold seems too severe based on trial 61 | # runs 62 | self.threshold = \ 63 | max(self.threshold, 64 | (self.max_reward + self.min_reward) / 2.0 * 0.75) 65 | else: 66 | self.threshold = (self.max_reward + self.min_reward) / 2.0 * 0.75 67 | self.reassign_episodes() 68 | self.max_reward = -math.inf 69 | self.min_reward = math.inf 70 | 71 | def add_exp(self, action, reward, next_state, terminal): 72 | super(HighLowMemory, self).add_exp( 73 | action, reward, next_state, terminal) 74 | if terminal: 75 | epi_exp = { 76 | 'exp': self.exp, 77 | 'total_rewards': np.sum(self.exp['rewards']), 78 | 'epi_num': self.epi_num 79 | } 80 | if (epi_exp['total_rewards'] <= self.threshold): 81 | self.epi_memory_low.append(epi_exp) 82 | else: 83 | self.epi_memory_high.append(epi_exp) 84 | if (self.epi_num > 0 and self.epi_num % self.recompute_freq == 0): 85 | self.compute_threshold() 86 | if (epi_exp['total_rewards'] > self.max_reward): 87 | self.max_reward = epi_exp['total_rewards'] 88 | if (epi_exp['total_rewards'] < 
self.min_reward): 89 | self.min_reward = epi_exp['total_rewards'] 90 | self.last_exp = self.exp 91 | self.exp = {k: [] for k in self.exp_keys} 92 | self.epi_num += 1 93 | # print("THRESHOLD HISTORY") 94 | # print(self.threshold_history) 95 | # print("HIGH MEM") 96 | # for epi in self.epi_memory_high: 97 | # print(str(epi['total_rewards'])+ " ,", end=" ") 98 | # print() 99 | # print("LOW MEM") 100 | # for epi in self.epi_memory_low: 101 | # print(str(epi['total_rewards'] )+ " ,", end=" ") 102 | # print() 103 | 104 | def pop(self): 105 | '''convenient method to get exp at [last_ind]''' 106 | buffer_exp = self.exp # store for restore later 107 | self.exp = self.last_exp 108 | res = super(HighLowMemory, self).pop() 109 | self.exp = buffer_exp 110 | return res 111 | 112 | def rand_minibatch(self, size): 113 | # base case, early exit 114 | high_samples = np.int(np.ceil(size * self.prob_high)) 115 | low_samples = size - high_samples 116 | 117 | if (len(self.epi_memory_high) == 0 and 118 | len(self.epi_memory_low) == 0): 119 | return super(HighLowMemory, self).rand_minibatch(size) 120 | 121 | if (len(self.epi_memory_high) == 0): 122 | high_samples = 0 123 | low_samples = size 124 | 125 | high_samples_per_epi = np.int( 126 | np.ceil(high_samples / self.num_epis_to_sample)) 127 | low_samples_per_epi = np.int( 128 | np.ceil(low_samples / self.num_epis_to_sample)) 129 | 130 | buffer_exp = self.exp 131 | minibatch_as_list = [] 132 | if high_samples > 0: 133 | for _i in range(4): 134 | idx = np.random.randint(0, len(self.epi_memory_high)) 135 | epi_exp = self.epi_memory_high[idx]['exp'] 136 | self.exp = epi_exp 137 | epi_minibatch = super(HighLowMemory, self).rand_minibatch( 138 | high_samples_per_epi) 139 | minibatch_as_list.append(epi_minibatch) 140 | 141 | if low_samples > 0: 142 | for _i in range(4): 143 | idx = np.random.randint(0, len(self.epi_memory_low)) 144 | epi_exp = self.epi_memory_low[idx]['exp'] 145 | self.exp = epi_exp 146 | epi_minibatch = super(HighLowMemory, self).rand_minibatch( 147 | low_samples_per_epi) 148 | minibatch_as_list.append(epi_minibatch) 149 | 150 | # set buffer back to original 151 | self.exp = buffer_exp 152 | 153 | # merge all minibatches from best_epi_memory into a minibatch 154 | minibatch = {} 155 | for k in self.exp_keys: 156 | k_exp = np.concatenate( 157 | [epi_exp[k] for epi_exp in minibatch_as_list] 158 | )[-size:] 159 | minibatch[k] = k_exp 160 | assert len( 161 | minibatch['rewards']) == size, 'minibatch has the wrong size' 162 | 163 | return minibatch 164 | 165 | def update(self, updates): 166 | pass 167 | 168 | 169 | class HighLowMemoryWithForgetting(HighLowMemory): 170 | 171 | ''' 172 | Like HighLowMemory but also has forgetting capability 173 | Controlled by max_epis_in_mem param 174 | ''' 175 | 176 | # absorb generic param without breaking 177 | def __init__(self, env_spec, **kwargs): 178 | super(HighLowMemoryWithForgetting, self).__init__(env_spec) 179 | self.max_epis_in_mem = 250 180 | log_self(self) 181 | 182 | def reassign_episodes(self): 183 | new_high, new_low = [] 184 | 185 | for mem in (self.epi_memory_high, self.epi_memory_low): 186 | for epi_exp in mem: 187 | if (self.epi_num - epi_exp['epi_num'] <= self.max_epis_in_mem): 188 | if (epi_exp['total_rewards'] > self.threshold): 189 | new_high.append(epi_exp) 190 | else: 191 | new_low.append(epi_exp) 192 | 193 | self.epi_memory_high = new_high 194 | self.epi_memory_low = new_low 195 | -------------------------------------------------------------------------------- /rl/model/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kengz/openai_lab/d0669d89268f2dc01c1cf878e4879775c7b6eb3c/rl/model/.gitkeep -------------------------------------------------------------------------------- /rl/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/optimizer/adam.py: -------------------------------------------------------------------------------- 1 | from rl.optimizer.base_optimizer import Optimizer 2 | 3 | 4 | class AdamOptimizer(Optimizer): 5 | 6 | ''' 7 | Adam optimizer 8 | Potential param: 9 | lr (learning rate) 10 | beta_1 11 | beta_2 12 | epsilon 13 | decay 14 | Suggested to leave at default param with the expected of lr 15 | ''' 16 | 17 | def __init__(self, **kwargs): 18 | from keras.optimizers import Adam 19 | self.Adam = Adam 20 | 21 | self.optim_param_keys = ['lr', 'beta_1', 'beta_2', 'epsilon', 'decay'] 22 | super(AdamOptimizer, self).__init__(**kwargs) 23 | 24 | def init_optimizer(self): 25 | self.keras_optimizer = self.Adam(**self.optim_param) 26 | -------------------------------------------------------------------------------- /rl/optimizer/base_optimizer.py: -------------------------------------------------------------------------------- 1 | from rl.util import log_self, logger 2 | 3 | 4 | class Optimizer(object): 5 | 6 | ''' 7 | The base class of Optimizer, with the core methods 8 | ''' 9 | 10 | def __init__(self, **kwargs): 11 | '''Construct externally, and set at Agent.compile()''' 12 | self.agent = None 13 | self.keras_optimizer = None 14 | self.optim_param = {} 15 | self.update_optim_param(**kwargs) 16 | self.init_optimizer() 17 | log_self(self) 18 | 19 | def update_optim_param(self, **kwargs): 20 | o_param = { 21 | k: kwargs.get(k) for k in self.optim_param_keys 22 | if kwargs.get(k) is not None} 23 | self.optim_param.update(o_param) 24 | 25 | def init_optimizer(self): 26 | raise NotImplementedError() 27 | 28 | def change_optim_param(self, **new_param): 29 | self.update_optim_param(**new_param) 30 | self.init_optimizer() 31 | logger.info("Optimizer param changed") 32 | log_self(self) 33 | -------------------------------------------------------------------------------- /rl/optimizer/rmsprop.py: -------------------------------------------------------------------------------- 1 | from rl.optimizer.base_optimizer import Optimizer 2 | 3 | 4 | class RMSpropOptimizer(Optimizer): 5 | 6 | ''' 7 | RMS prop 8 | Potential param: 9 | lr (learning rate) 10 | rho 11 | decay 12 | epsilon 13 | ''' 14 | 15 | def __init__(self, **kwargs): 16 | from keras.optimizers import RMSprop 17 | self.RMSprop = RMSprop 18 | 19 | self.optim_param_keys = ['lr', 'rho', 'decay', 'epsilon'] 20 | super(RMSpropOptimizer, self).__init__(**kwargs) 21 | 22 | def init_optimizer(self): 23 | self.keras_optimizer = self.RMSprop(**self.optim_param) 24 | -------------------------------------------------------------------------------- /rl/optimizer/sgd.py: -------------------------------------------------------------------------------- 1 | from rl.optimizer.base_optimizer import Optimizer 2 | 3 | 4 | class SGDOptimizer(Optimizer): 5 | 6 | ''' 7 | Stochastic gradient descent 8 | Potential param: 9 | lr (learning rate) 10 | momentum 11 | decay 12 | nesterov 13 | ''' 14 | 15 | 
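# For illustration only: update_optim_param() in base_optimizer.py keeps just
# the keys listed in optim_param_keys and drops None values, so a hypothetical
# SGDOptimizer(lr=0.01, momentum=0.9, gamma=0.99) would end up calling
# keras.optimizers.SGD(lr=0.01, momentum=0.9) and silently ignore gamma.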
def __init__(self, **kwargs): 16 | from keras.optimizers import SGD 17 | self.SGD = SGD 18 | 19 | self.optim_param_keys = ['lr', 'momentum', 'decay', 'nesterov'] 20 | super(SGDOptimizer, self).__init__(**kwargs) 21 | 22 | def init_optimizer(self): 23 | self.keras_optimizer = self.SGD(**self.optim_param) 24 | -------------------------------------------------------------------------------- /rl/policy/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/policy/actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.policy.base_policy import Policy 3 | from rl.util import log_self 4 | 5 | 6 | class ArgmaxPolicy(Policy): 7 | 8 | ''' 9 | The argmax policy for actor critic agents 10 | Agent takes the action with the highest 11 | action score 12 | ''' 13 | 14 | def __init__(self, env_spec, 15 | **kwargs): # absorb generic param without breaking 16 | super(ArgmaxPolicy, self).__init__(env_spec) 17 | log_self(self) 18 | 19 | def select_action(self, state): 20 | agent = self.agent 21 | state = np.expand_dims(state, axis=0) 22 | A_score = agent.actor.predict(state)[0] # extract from batch predict 23 | assert A_score.ndim == 1 24 | action = np.argmax(A_score) 25 | return action 26 | 27 | def update(self, sys_vars): 28 | pass 29 | 30 | 31 | class SoftmaxPolicy(Policy): 32 | 33 | ''' 34 | The softmax policy for actor critic agents 35 | Action is drawn from the prob dist generated 36 | by softmax(acion_scores) 37 | ''' 38 | 39 | def __init__(self, env_spec, 40 | **kwargs): # absorb generic param without breaking 41 | super(SoftmaxPolicy, self).__init__(env_spec) 42 | self.clip_val = 500. 43 | log_self(self) 44 | 45 | def select_action(self, state): 46 | agent = self.agent 47 | state = np.expand_dims(state, axis=0) 48 | A_score = agent.actor.predict(state)[0] # extract from batch predict 49 | assert A_score.ndim == 1 50 | A_score = A_score.astype('float64') # fix precision overflow 51 | exp_values = np.exp( 52 | np.clip(A_score, -self.clip_val, self.clip_val)) 53 | assert np.isfinite(exp_values).all() 54 | probs = np.array(exp_values / np.sum(exp_values)) 55 | probs /= probs.sum() # renormalize to prevent floating pt error 56 | action = np.random.choice(agent.env_spec['actions'], p=probs) 57 | return action 58 | 59 | def update(self, sys_vars): 60 | pass 61 | 62 | 63 | class GaussianPolicy(Policy): 64 | 65 | ''' 66 | Continuous policy for actor critic models 67 | Output of the actor network is the mean action 68 | along each dimension. 
Action chosen is the mean 69 | plus Gaussian noise whose scale (std dev) is self.variance 70 | ''' 71 | 72 | def __init__(self, env_spec, 73 | variance=1.0, 74 | **kwargs): # absorb generic param without breaking 75 | super(GaussianPolicy, self).__init__(env_spec) 76 | self.variance = variance 77 | log_self(self) 78 | 79 | def select_action(self, state): 80 | agent = self.agent 81 | state = np.expand_dims(state, axis=0) 82 | a_mean = agent.actor.predict(state)[0] # extract from batch predict 83 | action = a_mean + np.random.normal( 84 | loc=0.0, scale=self.variance, size=a_mean.shape) 85 | action = np.clip(action, 86 | self.env_spec['action_bound_low'], 87 | self.env_spec['action_bound_high']) 88 | return action 89 | 90 | def update(self, sys_vars): 91 | pass 92 | 93 | 94 | class BoundedPolicy(Policy): 95 | 96 | ''' 97 | The bounded policy for actor critic agents 98 | and continuous, bounded action spaces 99 | Action bounded above and below by 100 | - action_bound, + action_bound 101 | ''' 102 | 103 | def __init__(self, env_spec, 104 | **kwargs): # absorb generic param without breaking 105 | super(BoundedPolicy, self).__init__(env_spec) 106 | self.action_bound = env_spec['action_bound_high'] 107 | assert env_spec['action_bound_high'] == -env_spec['action_bound_low'] 108 | log_self(self) 109 | 110 | def select_action(self, state): 111 | agent = self.agent 112 | state = np.expand_dims(state, axis=0) 113 | A_score = agent.actor.predict(state)[0] # extract from batch predict 114 | action = np.tanh(A_score) * self.action_bound 115 | return action 116 | 117 | def update(self, sys_vars): 118 | pass 119 | -------------------------------------------------------------------------------- /rl/policy/base_policy.py: -------------------------------------------------------------------------------- 1 | class Policy(object): 2 | 3 | ''' 4 | The base class of Policy, with the core methods 5 | ''' 6 | 7 | def __init__(self, env_spec, 8 | **kwargs): # absorb generic param without breaking 9 | '''Construct externally, and set at Agent.compile()''' 10 | self.env_spec = env_spec 11 | self.agent = None 12 | 13 | def select_action(self, state): 14 | raise NotImplementedError() 15 | 16 | def update(self, sys_vars): 17 | raise NotImplementedError() 18 | -------------------------------------------------------------------------------- /rl/policy/boltzmann.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.policy.base_policy import Policy 3 | from rl.util import log_self 4 | 5 | 6 | class BoltzmannPolicy(Policy): 7 | 8 | ''' 9 | The Boltzmann policy, where the prob dist for selection is 10 | p(a) = exp(Q[a]/tau) / sum_a'(exp(Q[a']/tau)) 11 | ''' 12 | 13 | def __init__(self, env_spec, 14 | init_tau=5., final_tau=0.5, exploration_anneal_episodes=20, 15 | **kwargs): # absorb generic param without breaking 16 | super(BoltzmannPolicy, self).__init__(env_spec) 17 | self.init_tau = init_tau 18 | self.final_tau = final_tau 19 | self.tau = self.init_tau 20 | self.exploration_anneal_episodes = exploration_anneal_episodes 21 | self.clip_val = 500.
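# descriptive note: np.exp overflows float64 once its argument exceeds
# np.log(np.finfo(np.float64).max), roughly 709.78, so select_action clips
# Q/tau to [-clip_val, clip_val] before exponentiating, keeping the softmax
# weights finite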
22 | log_self(self) 23 | 24 | def select_action(self, state): 25 | agent = self.agent 26 | state = np.expand_dims(state, axis=0) 27 | Q_state = agent.model.predict(state)[0] # extract from batch predict 28 | assert Q_state.ndim == 1 29 | Q_state = Q_state.astype('float64') # fix precision overflow 30 | exp_values = np.exp( 31 | np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) 32 | assert np.isfinite(exp_values).all() 33 | probs = np.array(exp_values / np.sum(exp_values)) 34 | probs /= probs.sum() # renormalize to prevent floating pt error 35 | action = np.random.choice(agent.env_spec['actions'], p=probs) 36 | return action 37 | 38 | def update(self, sys_vars): 39 | '''strategy to update tau in agent''' 40 | epi = sys_vars['epi'] 41 | rise = self.final_tau - self.init_tau 42 | slope = rise / float(self.exploration_anneal_episodes) 43 | self.tau = max(slope * epi + self.init_tau, self.final_tau) 44 | return self.tau 45 | 46 | 47 | class DoubleDQNBoltzmannPolicy(BoltzmannPolicy): 48 | 49 | ''' 50 | Same as the Boltzmann policy but for a Double DQN agent 51 | ''' 52 | 53 | def __init__(self, env_spec, 54 | init_tau=5., final_tau=0.5, exploration_anneal_episodes=20, 55 | **kwargs): # absorb generic param without breaking 56 | super(DoubleDQNBoltzmannPolicy, self).__init__( 57 | env_spec, init_tau, final_tau, 58 | exploration_anneal_episodes) 59 | 60 | def select_action(self, state): 61 | agent = self.agent 62 | state = np.expand_dims(state, axis=0) 63 | # extract from batch predict 64 | Q_state1 = agent.model.predict(state)[0] 65 | Q_state2 = agent.model_2.predict(state)[0] 66 | Q_state = Q_state1 + Q_state2 67 | assert Q_state.ndim == 1 68 | Q_state = Q_state.astype('float64') # fix precision overflow 69 | exp_values = np.exp( 70 | np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) 71 | assert np.isfinite(exp_values).all() 72 | probs = np.array(exp_values / np.sum(exp_values)) 73 | probs /= probs.sum() # renormalize to prevent floating pt error 74 | action = np.random.choice(agent.env_spec['actions'], p=probs) 75 | return action 76 | -------------------------------------------------------------------------------- /rl/policy/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.policy.base_policy import Policy 3 | from rl.util import log_self 4 | 5 | 6 | class EpsilonGreedyPolicy(Policy): 7 | 8 | ''' 9 | The Epsilon-greedy policy 10 | ''' 11 | 12 | def __init__(self, env_spec, 13 | init_e=1.0, final_e=0.1, exploration_anneal_episodes=30, 14 | **kwargs): # absorb generic param without breaking 15 | super(EpsilonGreedyPolicy, self).__init__(env_spec) 16 | self.init_e = init_e 17 | self.final_e = final_e 18 | self.e = self.init_e 19 | self.exploration_anneal_episodes = exploration_anneal_episodes 20 | log_self(self) 21 | 22 | def select_action(self, state): 23 | '''epsilon-greedy method''' 24 | agent = self.agent 25 | if self.e > np.random.rand(): 26 | action = np.random.choice(agent.env_spec['actions']) 27 | else: 28 | state = np.expand_dims(state, axis=0) 29 | # extract from batch predict 30 | Q_state = agent.model.predict(state)[0] 31 | assert Q_state.ndim == 1 32 | action = np.argmax(Q_state) 33 | return action 34 | 35 | def update(self, sys_vars): 36 | '''strategy to update epsilon in agent''' 37 | epi = sys_vars['epi'] 38 | rise = self.final_e - self.init_e 39 | slope = rise / float(self.exploration_anneal_episodes) 40 | self.e = max(slope * epi + self.init_e, self.final_e) 41 | return self.e 
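# Worked example of the linear anneal above, using the default values
# init_e=1.0, final_e=0.1, exploration_anneal_episodes=30: the slope is
# (0.1 - 1.0) / 30 = -0.03, so e = 1.0 at epi 0, 0.7 at epi 10, 0.1 at epi 30,
# and max(slope * epi + init_e, final_e) keeps it floored at 0.1 afterwards.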
42 | 43 | 44 | class DoubleDQNEpsilonGreedyPolicy(EpsilonGreedyPolicy): 45 | 46 | ''' 47 | Policy to accompany double dqn agents 48 | When actions are not random this policy 49 | selects actions by symming the outputs from 50 | each of the two Q-state approximators 51 | before taking the max of the result 52 | ''' 53 | 54 | def __init__(self, env_spec, 55 | init_e=1.0, final_e=0.1, exploration_anneal_episodes=30, 56 | **kwargs): # absorb generic param without breaking 57 | super(DoubleDQNEpsilonGreedyPolicy, self).__init__( 58 | env_spec, init_e, final_e, 59 | exploration_anneal_episodes) 60 | 61 | def select_action(self, state): 62 | '''epsilon-greedy method''' 63 | agent = self.agent 64 | if self.e > np.random.rand(): 65 | action = np.random.choice(agent.env_spec['actions']) 66 | else: 67 | state = np.expand_dims(state, axis=0) 68 | # extract from batch predict 69 | Q_state1 = agent.model.predict(state)[0] 70 | Q_state2 = agent.model_2.predict(state)[0] 71 | Q_state = Q_state1 + Q_state2 72 | assert Q_state.ndim == 1 73 | action = np.argmax(Q_state) 74 | return action 75 | 76 | 77 | class DecayingEpsilonGreedyPolicy(EpsilonGreedyPolicy): 78 | 79 | ''' 80 | Inspired by alvacarce's solution to mountain car 81 | https://gym.openai.com/evaluations/eval_t3GN2Xb0R5KpyjkJUGsLw 82 | ''' 83 | 84 | def __init__(self, env_spec, 85 | init_e=1.0, final_e=0.1, exploration_anneal_episodes=30, 86 | **kwargs): # absorb generic param without breaking 87 | super(DecayingEpsilonGreedyPolicy, self).__init__( 88 | env_spec, init_e, final_e, exploration_anneal_episodes) 89 | self.e_decay = 0.9997 90 | 91 | def update(self, sys_vars): 92 | _epi = sys_vars['epi'] 93 | if self.e > self.final_e: 94 | self.e = self.e * self.e_decay 95 | return self.e 96 | 97 | 98 | class OscillatingEpsilonGreedyPolicy(EpsilonGreedyPolicy): 99 | 100 | ''' 101 | The epsilon-greedy policy with oscillating epsilon 102 | periodically agent.e will drop to a fraction of 103 | the current exploration rate 104 | ''' 105 | 106 | def update(self, sys_vars): 107 | '''strategy to update epsilon in agent''' 108 | super(OscillatingEpsilonGreedyPolicy, self).update( 109 | sys_vars) 110 | epi = sys_vars['epi'] 111 | if not (epi % 3) and epi > 15: 112 | # drop to 1/3 of the current exploration rate 113 | self.e = max(self.e/3., self.final_e) 114 | return self.e 115 | 116 | 117 | class TargetedEpsilonGreedyPolicy(EpsilonGreedyPolicy): 118 | 119 | ''' 120 | switch between active and inactive exploration cycles by 121 | partial mean rewards and its distance to the target mean rewards 122 | ''' 123 | 124 | def update(self, sys_vars): 125 | '''strategy to update epsilon in agent''' 126 | epi = sys_vars['epi'] 127 | assert sys_vars['SOLVED_MEAN_REWARD'] is not None, \ 128 | 'this policy needs an explicit target SOLVED_MEAN_REWARD' 129 | SOLVED_MEAN_REWARD = sys_vars['SOLVED_MEAN_REWARD'] 130 | REWARD_MEAN_LEN = sys_vars['REWARD_MEAN_LEN'] 131 | PARTIAL_MEAN_LEN = int(REWARD_MEAN_LEN * 0.20) 132 | if epi < 1: # corner case when no total_rewards_history to avg 133 | return 134 | # the partial mean for projection the entire mean 135 | partial_mean_reward = np.mean( 136 | sys_vars['total_rewards_history'][-PARTIAL_MEAN_LEN:]) 137 | # difference to target, and its ratio (1 if denominator is 0) 138 | min_reward = np.amin(sys_vars['total_rewards_history']) 139 | projection_gap = SOLVED_MEAN_REWARD - partial_mean_reward 140 | worst_gap = SOLVED_MEAN_REWARD - min_reward 141 | gap_ratio = projection_gap / worst_gap 142 | envelope = self.init_e + (self.final_e - 
self.init_e) / 2. * \ 143 | (float(epi)/float(self.exploration_anneal_episodes)) 144 | pessimistic_gap_ratio = envelope * min(2 * gap_ratio, 1) 145 | # if is in odd cycle, and diff is still big, actively explore 146 | active_exploration_cycle = not bool( 147 | int(epi/PARTIAL_MEAN_LEN) % 2) and ( 148 | projection_gap > abs(SOLVED_MEAN_REWARD * 0.05)) 149 | self.e = max(pessimistic_gap_ratio * self.init_e, self.final_e) 150 | 151 | if not active_exploration_cycle: 152 | self.e = max(self.e/2., self.final_e) 153 | return self.e 154 | -------------------------------------------------------------------------------- /rl/policy/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.util import log_self 3 | from rl.policy.base_policy import Policy 4 | from rl.policy.epsilon_greedy import EpsilonGreedyPolicy 5 | 6 | 7 | class NoNoisePolicy(Policy): 8 | 9 | ''' 10 | The base class for noise policy for DDPG 11 | default is no noise 12 | ''' 13 | 14 | def __init__(self, env_spec, 15 | **kwargs): # absorb generic param without breaking 16 | super(NoNoisePolicy, self).__init__(env_spec) 17 | log_self(self) 18 | 19 | def sample(self): 20 | '''implement noise here, default is none''' 21 | assert 'actions' in self.env_spec 22 | return 0 23 | 24 | def select_action(self, state): 25 | agent = self.agent 26 | state = np.expand_dims(state, axis=0) 27 | if self.env_spec['actions'] == 'continuous': 28 | action = agent.actor.predict(state)[0] + self.sample() 29 | action = np.clip(action, 30 | self.env_spec['action_bound_low'], 31 | self.env_spec['action_bound_high']) 32 | else: 33 | Q_state = agent.actor.predict(state)[0] 34 | assert Q_state.ndim == 1 35 | action = np.argmax(Q_state) 36 | return action 37 | 38 | def update(self, sys_vars): 39 | pass 40 | 41 | 42 | class LinearNoisePolicy(NoNoisePolicy): 43 | 44 | ''' 45 | policy with linearly decaying noise (1. / (1. + self.epi)) 46 | ''' 47 | 48 | def __init__(self, env_spec, exploration_anneal_episodes=20, 49 | **kwargs): # absorb generic param without breaking 50 | super(LinearNoisePolicy, self).__init__(env_spec) 51 | self.exploration_anneal_episodes = exploration_anneal_episodes 52 | self.n_step = 0 # init 53 | log_self(self) 54 | 55 | def sample(self): 56 | noise = (1. / (1. 
+ self.n_step)) 57 | return noise 58 | 59 | def update(self, sys_vars): 60 | epi = sys_vars['epi'] 61 | if epi >= self.exploration_anneal_episodes: 62 | self.n_step = np.inf # noise divide to zero 63 | else: 64 | self.n_step = sys_vars['epi'] 65 | 66 | 67 | class EpsilonGreedyNoisePolicy(EpsilonGreedyPolicy, NoNoisePolicy): 68 | 69 | ''' 70 | akin to epsilon greedy decay, 71 | but return random sample instead 72 | ''' 73 | 74 | def sample(self): 75 | if self.e > np.random.rand(): 76 | noise = np.random.uniform( 77 | 0.5 * self.env_spec['action_bound_low'], 78 | 0.5 * self.env_spec['action_bound_high']) 79 | else: 80 | noise = 0 81 | return noise 82 | 83 | def select_action(self, state): 84 | return NoNoisePolicy.select_action(self, state) 85 | 86 | 87 | class AnnealedGaussianPolicy(LinearNoisePolicy): 88 | 89 | ''' 90 | Base class of random noise policy for DDPG 91 | Adopted from 92 | https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py 93 | ''' 94 | 95 | def __init__(self, env_spec, exploration_anneal_episodes, 96 | mu, sigma, sigma_min, 97 | **kwargs): # absorb generic param without breaking 98 | super(AnnealedGaussianPolicy, self).__init__( 99 | env_spec, exploration_anneal_episodes) 100 | self.size = env_spec['action_dim'] 101 | self.mu = mu 102 | self.sigma = sigma 103 | 104 | if sigma_min is not None: 105 | self.m = -(sigma - sigma_min) / self.exploration_anneal_episodes 106 | self.c = sigma 107 | self.sigma_min = sigma_min 108 | else: 109 | self.m = 0. 110 | self.c = sigma 111 | self.sigma_min = sigma 112 | 113 | @property 114 | def current_sigma(self): 115 | sigma = max(self.sigma_min, self.m * self.n_step + self.c) 116 | return sigma 117 | 118 | 119 | class GaussianWhiteNoisePolicy(AnnealedGaussianPolicy): 120 | 121 | def __init__(self, env_spec, exploration_anneal_episodes=20, 122 | mu=0., sigma=.3, sigma_min=None, 123 | **kwargs): # absorb generic param without breaking 124 | super(GaussianWhiteNoisePolicy, self).__init__( 125 | env_spec, exploration_anneal_episodes, 126 | mu, sigma, sigma_min) 127 | 128 | def sample(self): 129 | sample = np.random.normal(self.mu, self.current_sigma, self.size) 130 | return sample 131 | 132 | 133 | class OUNoisePolicy(AnnealedGaussianPolicy): 134 | 135 | ''' 136 | Based on 137 | http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 138 | ''' 139 | 140 | def __init__(self, env_spec, exploration_anneal_episodes=20, 141 | theta=.15, mu=0., sigma=.3, dt=1e-2, x0=None, sigma_min=None, 142 | **kwargs): # absorb generic param without breaking 143 | super(OUNoisePolicy, self).__init__( 144 | env_spec, exploration_anneal_episodes, 145 | mu, sigma, sigma_min, 146 | **kwargs) 147 | self.theta = theta 148 | self.mu = mu 149 | self.dt = dt 150 | self.x0 = x0 151 | self.reset_states() 152 | 153 | def reset_states(self): 154 | self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size) 155 | 156 | def sample(self): 157 | x = self.x_prev + self.theta * \ 158 | (self.mu - self.x_prev) * self.dt + self.current_sigma * \ 159 | np.sqrt(self.dt) * np.random.normal(size=self.size) 160 | self.x_prev = x 161 | return x 162 | -------------------------------------------------------------------------------- /rl/preprocessor/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | 
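For reference, OUNoisePolicy.sample in rl/policy/noise.py above implements the Euler-Maruyama step of an Ornstein-Uhlenbeck process, x_next = x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I). A minimal standalone sketch of that update, using the class defaults and an assumed 2-dimensional action space:

import numpy as np

theta, mu, sigma, dt = 0.15, 0.0, 0.3, 1e-2  # defaults from OUNoisePolicy
x = np.zeros(2)  # x0=None resets to zeros; an action_dim of 2 is assumed here
for _ in range(5):
    x = x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=x.shape)
    # the theta * (mu - x) * dt term pulls x back toward mu, so successive noise
    # samples are temporally correlated rather than independent white noise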
-------------------------------------------------------------------------------- /rl/preprocessor/atari.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | from rl.preprocessor.base_preprocessor import PreProcessor 4 | 5 | 6 | # Util functions for state preprocessing 7 | 8 | def resize_image(im): 9 | return sp.misc.imresize(im, (110, 84)) 10 | 11 | 12 | def crop_image(im): 13 | return im[-84:, :] 14 | 15 | 16 | def process_image_atari(im): 17 | ''' 18 | Image preprocessing from the paper 19 | Playing Atari with Deep Reinforcement Learning, 2013 20 | Takes an RGB image and converts it to grayscale, 21 | downsizes to 110 x 84 22 | and crops to square 84 x 84, taking bottomost rows of image 23 | ''' 24 | im_gray = np.dot(im[..., :3], [0.299, 0.587, 0.114]) 25 | im_resized = resize_image(im_gray) 26 | im_cropped = crop_image(im_resized) 27 | return im_cropped 28 | 29 | 30 | class Atari(PreProcessor): 31 | 32 | ''' 33 | Convert images to greyscale, downsize, crop, then stack 4 states 34 | NOTE: Image order is cols * rows * channels to match openai gym format 35 | Input to model is rows * cols * channels (== states) 36 | ''' 37 | 38 | def __init__(self, **kwargs): # absorb generic param without breaking): 39 | super(Atari, self).__init__() 40 | 41 | def preprocess_state(self): 42 | processed_state_queue = ( 43 | process_image_atari(self.state), 44 | process_image_atari(self.previous_state), 45 | process_image_atari(self.pre_previous_state), 46 | process_image_atari(self.pre_pre_previous_state)) 47 | processed_state = np.stack(processed_state_queue, axis=-1) 48 | return processed_state 49 | 50 | def preprocess_memory(self, action, reward, next_state, done): 51 | self.add_raw_exp(action, reward, next_state, done) 52 | if (self.exp_queue_size() < self.MAX_QUEUE_SIZE): # insufficient queue 53 | return 54 | (_state, action, reward, next_state, done) = self.exp_queue[-1] 55 | processed_next_state_queue = ( 56 | process_image_atari(self.exp_queue[-1][3]), 57 | process_image_atari(self.exp_queue[-2][3]), 58 | process_image_atari(self.exp_queue[-3][3]), 59 | process_image_atari(self.exp_queue[-4][3])) 60 | processed_state = self.preprocess_state() 61 | processed_next_state = np.stack(processed_next_state_queue, axis=-1) 62 | self.debug_state(processed_state, processed_next_state) 63 | processed_exp = (action, reward, processed_next_state, done) 64 | return processed_exp 65 | -------------------------------------------------------------------------------- /rl/preprocessor/base_preprocessor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.util import logger, log_self 3 | 4 | 5 | def create_dummy_states(state): 6 | state_shape = state.shape 7 | previous_state = np.zeros(state_shape) 8 | pre_previous_state = np.zeros(state_shape) 9 | pre_pre_previous_state = np.zeros(state_shape) 10 | if (previous_state.ndim == 1): 11 | previous_state = np.zeros([state_shape[0]]) 12 | pre_previous_state = np.zeros([state_shape[0]]) 13 | pre_pre_previous_state = np.zeros([state_shape[0]]) 14 | return (previous_state, pre_previous_state, pre_pre_previous_state) 15 | 16 | 17 | class PreProcessor(object): 18 | 19 | ''' 20 | The Base class for state preprocessing 21 | ''' 22 | 23 | def __init__(self, max_queue_size=4, **kwargs): 24 | '''Construct externally, and set at Agent.compile()''' 25 | self.agent = None 26 | self.state = None 27 | self.exp_queue = [] 28 | 
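# descriptive note: exp_queue buffers the most recent raw
# [state, action, reward, next_state, done] entries; add_raw_exp below caps it
# at MAX_QUEUE_SIZE so subclasses can stack 4 frames (Atari) or pair 2
# consecutive states (StackStates, DiffStates)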
self.MAX_QUEUE_SIZE = max_queue_size 29 | self.never_debugged = True 30 | log_self(self) 31 | 32 | def reset_state(self, init_state): 33 | '''reset the preprocessor state per episode on env.reset()''' 34 | self.state = np.array(init_state) # cast into np for safety 35 | (previous_state, pre_previous_state, 36 | pre_pre_previous_state) = create_dummy_states(self.state) 37 | self.previous_state = previous_state 38 | self.pre_previous_state = pre_previous_state 39 | self.pre_pre_previous_state = pre_pre_previous_state 40 | return self.preprocess_state() 41 | 42 | def exp_queue_size(self): 43 | return len(self.exp_queue) 44 | 45 | def debug_state(self, processed_state, processed_next_state): 46 | if self.never_debugged: 47 | logger.debug("State shape: {}".format(processed_state.shape)) 48 | logger.debug( 49 | "Next state shape: {}".format(processed_next_state.shape)) 50 | self.never_debugged = False 51 | 52 | def preprocess_env_spec(self, env_spec): 53 | '''helper to tweak env_spec according to preprocessor''' 54 | class_name = self.__class__.__name__ 55 | if class_name == 'StackStates': 56 | env_spec['state_dim'] = env_spec['state_dim'] * 2 57 | elif class_name == 'Atari': 58 | env_spec['state_dim'] = (84, 84, 4) 59 | return env_spec 60 | 61 | def preprocess_state(self): 62 | raise NotImplementedError() 63 | 64 | def advance_state(self, next_state): 65 | self.pre_pre_previous_state = self.pre_previous_state 66 | self.pre_previous_state = self.previous_state 67 | self.previous_state = self.state 68 | self.state = next_state 69 | 70 | def add_raw_exp(self, action, reward, next_state, done): 71 | ''' 72 | Buffer holds only the last MAX_QUEUE_SIZE experiences 73 | (4 is the amount needed for Atari games preprocessing) 74 | ''' 75 | self.exp_queue.append([self.state, action, reward, next_state, done]) 76 | if (self.exp_queue_size() > self.MAX_QUEUE_SIZE): 77 | del self.exp_queue[0] 78 | self.advance_state(next_state) 79 | 80 | def preprocess_memory(self, action, reward, next_state, done): 81 | raise NotImplementedError() 82 | -------------------------------------------------------------------------------- /rl/preprocessor/linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.preprocessor.base_preprocessor import PreProcessor 3 | 4 | 5 | class NoPreProcessor(PreProcessor): 6 | 7 | ''' 8 | Default class, no preprocessing 9 | ''' 10 | 11 | def __init__(self, **kwargs): # absorb generic param without breaking): 12 | super(NoPreProcessor, self).__init__() 13 | 14 | def preprocess_state(self): 15 | return self.state 16 | 17 | def preprocess_memory(self, action, reward, next_state, done): 18 | '''No state processing''' 19 | self.add_raw_exp(action, reward, next_state, done) 20 | (_state, action, reward, next_state, done) = self.exp_queue[-1] 21 | processed_exp = (action, reward, next_state, done) 22 | return processed_exp 23 | 24 | 25 | class StackStates(PreProcessor): 26 | 27 | ''' 28 | Current and last state are concatenated to form input to model 29 | ''' 30 | 31 | def __init__(self, **kwargs): # absorb generic param without breaking): 32 | super(StackStates, self).__init__(max_queue_size=2) 33 | 34 | def preprocess_state(self): 35 | processed_state = np.concatenate([self.previous_state, self.state]) 36 | return processed_state 37 | 38 | def preprocess_memory(self, action, reward, next_state, done): 39 | '''Concatenate: previous + current states''' 40 | self.add_raw_exp(action, reward, next_state, done) 41 | if (self.exp_queue_size() <
self.MAX_QUEUE_SIZE): # insufficient queue 42 | return 43 | (state, action, reward, next_state, done) = self.exp_queue[-1] 44 | processed_state = self.preprocess_state() 45 | processed_next_state = np.concatenate([state, next_state]) 46 | self.debug_state(processed_state, processed_next_state) 47 | processed_exp = (action, reward, processed_next_state, done) 48 | return processed_exp 49 | 50 | 51 | class DiffStates(PreProcessor): 52 | 53 | ''' 54 | Different between current and last state is input to model 55 | ''' 56 | 57 | def __init__(self, **kwargs): # absorb generic param without breaking): 58 | super(DiffStates, self).__init__(max_queue_size=2) 59 | 60 | def preprocess_state(self): 61 | processed_state = self.state - self.previous_state 62 | return processed_state 63 | 64 | def preprocess_memory(self, action, reward, next_state, done): 65 | '''Change in state, curr_state - last_state''' 66 | self.add_raw_exp(action, reward, next_state, done) 67 | if (self.exp_queue_size() < self.MAX_QUEUE_SIZE): # insufficient queue 68 | return 69 | (state, action, reward, next_state, done) = self.exp_queue[-1] 70 | processed_state = self.preprocess_state() 71 | processed_next_state = next_state - state 72 | self.debug_state(processed_state, processed_next_state) 73 | processed_exp = (action, reward, processed_next_state, done) 74 | return processed_exp 75 | -------------------------------------------------------------------------------- /rl/spec/atari_experiment_specs.json: -------------------------------------------------------------------------------- 1 | { 2 | "dev_conv_dqn": { 3 | "problem": "DevBreakout-v0", 4 | "Agent": "ConvDQN", 5 | "HyperOptimizer": "GridSearch", 6 | "Memory": "LinearMemoryWithForgetting", 7 | "Optimizer": "AdamOptimizer", 8 | "Policy": "EpsilonGreedyPolicy", 9 | "PreProcessor": "Atari", 10 | "param": { 11 | "train_per_n_new_exp": 4, 12 | "lr": 0.001, 13 | "batch_size": 32, 14 | "gamma": 0.99, 15 | "hidden_layers": [ 16 | [16, 8, 8, [4, 4]], 17 | [32, 4, 4, [2, 2]] 18 | ], 19 | "hidden_layers_activation": "relu", 20 | "exploration_anneal_episodes": 3000, 21 | "epi_change_lr": 3000, 22 | "auto_architecture": true, 23 | "num_hidden_layers": 3, 24 | "num_initial_channels": 8, 25 | "max_mem_len": 500000 26 | 27 | }, 28 | "param_range": { 29 | "lr": [0.001, 0.0001], 30 | "hidden_layers": [ 31 | [ 32 | [16, 8, 8, [4, 4]], 33 | [32, 4, 4, [2, 2]] 34 | ] 35 | ] 36 | } 37 | }, 38 | "breakout_dqn": { 39 | "problem": "Breakout-v0", 40 | "Agent": "ConvDQN", 41 | "HyperOptimizer": "GridSearch", 42 | "Memory": "LinearMemoryWithForgetting", 43 | "Optimizer": "AdamOptimizer", 44 | "Policy": "EpsilonGreedyPolicy", 45 | "PreProcessor": "Atari", 46 | "param": { 47 | "train_per_n_new_exp": 4, 48 | "batch_size": 32, 49 | "lr": 0.001, 50 | "gamma": 0.99, 51 | "hidden_layers": [ 52 | [16, 8, 8, [4, 4]], 53 | [32, 4, 4, [2, 2]] 54 | ], 55 | "hidden_layers_activation": "relu", 56 | "exploration_anneal_episodes": 3000, 57 | "epi_change_lr": 3000, 58 | "max_mem_len": 500000 59 | }, 60 | "param_range": { 61 | "lr": [0.001, 0.01] 62 | } 63 | }, 64 | "breakout_double_dqn": { 65 | "problem": "Breakout-v0", 66 | "Agent": "DoubleConvDQN", 67 | "HyperOptimizer": "GridSearch", 68 | "Memory": "LinearMemoryWithForgetting", 69 | "Optimizer": "AdamOptimizer", 70 | "Policy": "DoubleDQNEpsilonGreedyPolicy", 71 | "PreProcessor": "Atari", 72 | "param": { 73 | "train_per_n_new_exp": 4, 74 | "batch_size": 32, 75 | "lr": 0.001, 76 | "gamma": 0.99, 77 | "hidden_layers": [ 78 | [16, 8, 8, [4, 4]], 79 | [32, 4, 4, [2, 2]] 
80 | ], 81 | "hidden_layers_activation": "relu", 82 | "exploration_anneal_episodes": 3000, 83 | "epi_change_lr": 3000, 84 | "max_mem_len": 500000 85 | }, 86 | "param_range": { 87 | "lr": [0.001, 0.0001], 88 | "gamma": [0.97, 0.99] 89 | } 90 | }, 91 | "air_raid_dqn": { 92 | "problem": "AirRaid-v0", 93 | "Agent": "ConvDQN", 94 | "HyperOptimizer": "GridSearch", 95 | "Memory": "LinearMemoryWithForgetting", 96 | "Optimizer": "AdamOptimizer", 97 | "Policy": "EpsilonGreedyPolicy", 98 | "PreProcessor": "Atari", 99 | "param": { 100 | "train_per_n_new_exp": 4, 101 | "batch_size": 32, 102 | "lr": 0.001, 103 | "gamma": 0.99, 104 | "hidden_layers": [ 105 | [16, 8, 8, [4, 4]], 106 | [32, 4, 4, [2, 2]] 107 | ], 108 | "hidden_layers_activation": "relu", 109 | "exploration_anneal_episodes": 10000, 110 | "epi_change_lr": 10000, 111 | "max_mem_len": 500000 112 | }, 113 | "param_range": { 114 | "lr": [0.001, 0.0001], 115 | "gamma": [0.97, 0.99] 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /rl/spec/box2d_experiment_specs.json: -------------------------------------------------------------------------------- 1 | { 2 | "lunar_dqn": { 3 | "problem": "LunarLander-v2", 4 | "Agent": "DQN", 5 | "HyperOptimizer": "GridSearch", 6 | "Memory": "LinearMemoryWithForgetting", 7 | "Optimizer": "AdamOptimizer", 8 | "Policy": "EpsilonGreedyPolicy", 9 | "PreProcessor": "StackStates", 10 | "param": { 11 | "train_per_n_new_exp": 5, 12 | "batch_size": 32, 13 | "lr": 0.005, 14 | "gamma": 0.99, 15 | "hidden_layers": [400, 200], 16 | "hidden_layers_activation": "sigmoid", 17 | "output_layer_activation": "linear", 18 | "exploration_anneal_episodes": 150, 19 | "epi_change_lr": 200 20 | }, 21 | "param_range": { 22 | "lr": [0.0005, 0.001, 0.005, 0.01, 0.02], 23 | "gamma": [0.95, 0.97, 0.99, 0.999], 24 | "hidden_layers": [ 25 | [400, 200], 26 | [800, 400], 27 | [400, 200, 100], 28 | [400, 200, 100, 50] 29 | ] 30 | } 31 | }, 32 | "rand_lunar_dqn": { 33 | "problem": "LunarLander-v2", 34 | "Agent": "DQN", 35 | "HyperOptimizer": "RandomSearch", 36 | "Memory": "LinearMemoryWithForgetting", 37 | "Optimizer": "AdamOptimizer", 38 | "Policy": "EpsilonGreedyPolicy", 39 | "PreProcessor": "NoPreProcessor", 40 | "param": { 41 | "max_evals": 100, 42 | "train_per_n_new_exp": 5, 43 | "batch_size": 32, 44 | "lr": 0.001, 45 | "gamma": 0.99, 46 | "hidden_layers": [300, 150, 75], 47 | "hidden_layers_activation": "relu", 48 | "output_layer_activation": "linear", 49 | "exploration_anneal_episodes": 150, 50 | "epi_change_lr": 200 51 | }, 52 | "param_range": { 53 | "lr": { 54 | "min": 0.0005, 55 | "max": 0.05 56 | }, 57 | "gamma": { 58 | "min": 0.97, 59 | "max": 0.9999 60 | }, 61 | "hidden_layers": [ 62 | [400, 200], 63 | [800, 400], 64 | [200, 100, 50], 65 | [400, 200, 100], 66 | [400, 200, 100, 50] 67 | ] 68 | } 69 | }, 70 | "lunar_double_dqn": { 71 | "problem": "LunarLander-v2", 72 | "Agent": "DoubleDQN", 73 | "HyperOptimizer": "GridSearch", 74 | "Memory": "LinearMemoryWithForgetting", 75 | "Optimizer": "AdamOptimizer", 76 | "Policy": "DoubleDQNBoltzmannPolicy", 77 | "PreProcessor": "StackStates", 78 | "param": { 79 | "train_per_n_new_exp": 5, 80 | "batch_size": 32, 81 | "lr": 0.005, 82 | "gamma": 0.99, 83 | "hidden_layers": [800, 400], 84 | "hidden_layers_activation": "sigmoid", 85 | "output_layer_activation": "linear", 86 | "exploration_anneal_episodes": 150, 87 | "epi_change_lr": 200 88 | }, 89 | "param_range": { 90 | "lr": [0.0005, 0.001, 0.005, 0.01, 0.02], 91 | "gamma": [0.95, 0.97, 0.99, 
0.999], 92 | "hidden_layers": [ 93 | [400, 200], 94 | [800, 400], 95 | [400, 200, 100], 96 | [400, 200, 100, 50] 97 | ] 98 | } 99 | }, 100 | "lunar_double_dqn_nopreprocess": { 101 | "problem": "LunarLander-v2", 102 | "Agent": "DoubleDQN", 103 | "HyperOptimizer": "GridSearch", 104 | "Memory": "LinearMemoryWithForgetting", 105 | "Optimizer": "AdamOptimizer", 106 | "Policy": "DoubleDQNBoltzmannPolicy", 107 | "PreProcessor": "NoPreProcessor", 108 | "param": { 109 | "train_per_n_new_exp": 5, 110 | "batch_size": 32, 111 | "lr": 0.005, 112 | "gamma": 0.99, 113 | "hidden_layers": [800, 400], 114 | "hidden_layers_activation": "sigmoid", 115 | "output_layer_activation": "linear", 116 | "exploration_anneal_episodes": 150, 117 | "epi_change_lr": 200 118 | }, 119 | "param_range": { 120 | "lr": [0.005, 0.01, 0.02], 121 | "gamma": [0.97, 0.99, 0.999], 122 | "hidden_layers": [ 123 | [400, 200], 124 | [800, 400] 125 | ] 126 | } 127 | }, 128 | "lunar_freeze": { 129 | "problem": "LunarLander-v2", 130 | "Agent": "FreezeDQN", 131 | "HyperOptimizer": "GridSearch", 132 | "Memory": "LinearMemoryWithForgetting", 133 | "Optimizer": "AdamOptimizer", 134 | "Policy": "BoltzmannPolicy", 135 | "PreProcessor": "StackStates", 136 | "param": { 137 | "train_per_n_new_exp": 5, 138 | "batch_size": 32, 139 | "lr": 0.001, 140 | "gamma": 0.99, 141 | "hidden_layers": [300, 150, 75], 142 | "hidden_layers_activation": "relu", 143 | "output_layer_activation": "linear", 144 | "exploration_anneal_episodes": 150, 145 | "epi_change_lr": 200 146 | }, 147 | "param_range": { 148 | "lr": [0.0001, 0.0005, 0.001, 0.005], 149 | "gamma": [0.97, 0.99, 0.999], 150 | "hidden_layers": [ 151 | [200, 100], 152 | [400, 200], 153 | [300, 150, 75], 154 | [400, 200, 100] 155 | ] 156 | } 157 | }, 158 | "lunar_sarsa": { 159 | "problem": "LunarLander-v2", 160 | "Agent": "DeepExpectedSarsa", 161 | "HyperOptimizer": "GridSearch", 162 | "Memory": "LinearMemoryWithForgetting", 163 | "Optimizer": "AdamOptimizer", 164 | "Policy": "EpsilonGreedyPolicy", 165 | "PreProcessor": "StackStates", 166 | "param": { 167 | "train_per_n_new_exp": 1, 168 | "lr": 0.001, 169 | "gamma": 0.99, 170 | "hidden_layers": [300, 150, 75], 171 | "hidden_layers_activation": "relu", 172 | "output_layer_activation": "linear", 173 | "exploration_anneal_episodes": 150, 174 | "epi_change_lr": 200 175 | }, 176 | "param_range": { 177 | "lr": [0.0001, 0.0005, 0.001, 0.005], 178 | "gamma": [0.97, 0.99, 0.999], 179 | "hidden_layers": [ 180 | [200, 100], 181 | [400, 200], 182 | [300, 150, 75], 183 | [400, 200, 100] 184 | ] 185 | } 186 | }, 187 | "lunar_offpol_sarsa": { 188 | "problem": "LunarLander-v2", 189 | "Agent": "OffPolicySarsa", 190 | "HyperOptimizer": "GridSearch", 191 | "Memory": "LinearMemoryWithForgetting", 192 | "Optimizer": "AdamOptimizer", 193 | "Policy": "EpsilonGreedyPolicy", 194 | "PreProcessor": "StackStates", 195 | "param": { 196 | "train_per_n_new_exp": 5, 197 | "batch_size": 32, 198 | "lr": 0.001, 199 | "gamma": 0.99, 200 | "hidden_layers": [800, 400], 201 | "hidden_layers_activation": "sigmoid", 202 | "output_layer_activation": "linear", 203 | "exploration_anneal_episodes": 150, 204 | "epi_change_lr": 200 205 | }, 206 | "param_range": { 207 | "lr": [0.001, 0.005, 0.01], 208 | "gamma": [0.97, 0.99, 0.999], 209 | "hidden_layers": [ 210 | [400, 200], 211 | [800, 400], 212 | [400, 200, 100] 213 | ] 214 | } 215 | }, 216 | "lunar_ac_softmax": { 217 | "problem": "LunarLander-v2", 218 | "Agent": "ActorCritic", 219 | "HyperOptimizer": "GridSearch", 220 | "Memory": 
"LinearMemoryWithForgetting", 221 | "Optimizer": "AdamOptimizer", 222 | "Policy": "SoftmaxPolicy", 223 | "PreProcessor": "NoPreProcessor", 224 | "param": { 225 | "lr": 0.02, 226 | "gamma": 0.99, 227 | "hidden_layers": [64], 228 | "hidden_layers_activation": "sigmoid" 229 | }, 230 | "param_range": { 231 | "lr": [0.001, 0.005, 0.01], 232 | "gamma": [0.99, 0.999], 233 | "hidden_layers": [ 234 | [400, 300], 235 | [800, 400], 236 | [800, 600] 237 | ] 238 | } 239 | }, 240 | "lunar_cont_ddpg_linearnoise": { 241 | "problem": "LunarLanderContinuous-v2", 242 | "Agent": "DDPG", 243 | "HyperOptimizer": "GridSearch", 244 | "Memory": "LinearMemoryWithForgetting", 245 | "Optimizer": "AdamOptimizer", 246 | "Policy": "LinearNoisePolicy", 247 | "PreProcessor": "NoPreProcessor", 248 | "param": { 249 | "batch_size": 64, 250 | "n_epoch": 1, 251 | "tau": 0.005, 252 | "lr": 0.001, 253 | "critic_lr": 0.001, 254 | "exploration_anneal_episodes": 100, 255 | "gamma": 0.99, 256 | "hidden_layers": [600, 300], 257 | "hidden_layers_activation": "relu", 258 | "output_layer_activation": "tanh" 259 | }, 260 | "param_range": { 261 | "lr": [0.0001, 0.0005, 0.001], 262 | "critic_lr": [0.001, 0.005, 0.01], 263 | "gamma": [0.97, 0.99, 0.999], 264 | "hidden_layers": [ 265 | [400, 300], 266 | [600, 300], 267 | [800, 400, 200] 268 | ] 269 | } 270 | }, 271 | "lunar_cont_ddpg_per_linearnoise": { 272 | "problem": "LunarLanderContinuous-v2", 273 | "Agent": "DDPG", 274 | "HyperOptimizer": "GridSearch", 275 | "Memory": "PrioritizedExperienceReplay", 276 | "Optimizer": "AdamOptimizer", 277 | "Policy": "LinearNoisePolicy", 278 | "PreProcessor": "NoPreProcessor", 279 | "param": { 280 | "batch_size": 64, 281 | "n_epoch": 1, 282 | "tau": 0.005, 283 | "lr": 0.001, 284 | "critic_lr": 0.001, 285 | "exploration_anneal_episodes": 100, 286 | "gamma": 0.97, 287 | "hidden_layers": [400, 300], 288 | "hidden_layers_activation": "relu", 289 | "output_layer_activation": "tanh" 290 | }, 291 | "param_range": { 292 | "lr": [0.0001, 0.0005, 0.001], 293 | "critic_lr": [0.001, 0.005, 0.01], 294 | "gamma": [0.97, 0.99, 0.999], 295 | "hidden_layers": [ 296 | [400, 300], 297 | [600, 300], 298 | [800, 400, 200] 299 | ] 300 | } 301 | }, 302 | "walker_ddpg_linearnoise": { 303 | "problem": "BipedalWalker-v2", 304 | "Agent": "DDPG", 305 | "HyperOptimizer": "GridSearch", 306 | "Memory": "LinearMemoryWithForgetting", 307 | "Optimizer": "AdamOptimizer", 308 | "Policy": "LinearNoisePolicy", 309 | "PreProcessor": "NoPreProcessor", 310 | "param": { 311 | "batch_size": 64, 312 | "n_epoch": 1, 313 | "tau": 0.005, 314 | "lr": 0.001, 315 | "critic_lr": 0.001, 316 | "exploration_anneal_episodes": 100, 317 | "gamma": 0.97, 318 | "hidden_layers": [400, 300], 319 | "hidden_layers_activation": "relu", 320 | "output_layer_activation": "tanh" 321 | }, 322 | "param_range": { 323 | "lr": [0.0001, 0.0005, 0.001], 324 | "critic_lr": [0.001, 0.005, 0.01], 325 | "gamma": [0.97, 0.99, 0.999], 326 | "hidden_layers": [ 327 | [400, 300], 328 | [600, 300], 329 | [800, 400, 200] 330 | ] 331 | } 332 | }, 333 | "walker_ddpg_per_linearnoise": { 334 | "problem": "BipedalWalker-v2", 335 | "Agent": "DDPG", 336 | "HyperOptimizer": "GridSearch", 337 | "Memory": "PrioritizedExperienceReplay", 338 | "Optimizer": "AdamOptimizer", 339 | "Policy": "LinearNoisePolicy", 340 | "PreProcessor": "NoPreProcessor", 341 | "param": { 342 | "batch_size": 64, 343 | "n_epoch": 1, 344 | "tau": 0.005, 345 | "lr": 0.0005, 346 | "critic_lr": 0.001, 347 | "gamma": 0.97, 348 | "hidden_layers": [400, 200], 349 | 
"hidden_layers_activation": "relu", 350 | "output_layer_activation": "tanh" 351 | }, 352 | "param_range": { 353 | "lr": [0.0001, 0.0005], 354 | "critic_lr": [0.001, 0.005], 355 | "gamma": [0.95, 0.97, 0.99], 356 | "hidden_layers": [ 357 | [200, 100], 358 | [400, 300], 359 | [800, 400] 360 | ] 361 | } 362 | } 363 | } 364 | -------------------------------------------------------------------------------- /rl/spec/component_locks.json: -------------------------------------------------------------------------------- 1 | { 2 | "double_network": { 3 | "type": "mutex", 4 | "details": "double_network agents need policies that invokes both networks properly", 5 | "head": "Agent", 6 | "Agent": [ 7 | "DoubleConvDQN", 8 | "DoubleDQN" 9 | ], 10 | "Policy": [ 11 | "DoubleDQNBoltzmannPolicy", 12 | "DoubleDQNEpsilonGreedyPolicy" 13 | ] 14 | }, 15 | "ddpg": { 16 | "type": "mutex", 17 | "details": "ddpg uses white-noise policy", 18 | "head": "Agent", 19 | "Agent": [ 20 | "DDPG" 21 | ], 22 | "Policy": [ 23 | "GaussianWhiteNoisePolicy", 24 | "LinearNoisePolicy", 25 | "NoNoisePolicy", 26 | "OUNoisePolicy" 27 | ] 28 | }, 29 | "actor_critic": { 30 | "type": "mutex", 31 | "details": "actor critic uses custom Q computation in its policy", 32 | "head": "Agent", 33 | "Agent": [ 34 | "ActorCritic" 35 | ], 36 | "Policy": [ 37 | "ArgmaxPolicy", 38 | "BoundedPolicy", 39 | "GaussianPolicy", 40 | "SoftmaxPolicy" 41 | ] 42 | }, 43 | "actor_critic_discrete": { 44 | "type": "subset", 45 | "details": "actor critic discrete components cannot work in continuous action space", 46 | "head": "problem", 47 | "problem": [ 48 | "Acrobot-v1", 49 | "AirRaid-v0", 50 | "Alien-v0", 51 | "Assault-v0", 52 | "Breakout-v0", 53 | "CartPole-v0", 54 | "CartPole-v1", 55 | "DevBreakout-v0", 56 | "DevCartPole-v0", 57 | "FlappyBird-v0", 58 | "LunarLander-v2", 59 | "MountainCar-v0", 60 | "MsPacman-v0", 61 | "Pong-v0", 62 | "Qbert-v0", 63 | "Snake-v0", 64 | "SpaceInvader-v0", 65 | "TestPassCartPole-v0" 66 | ], 67 | "Policy": [ 68 | "ArgmaxPolicy", 69 | "SoftmaxPolicy" 70 | ] 71 | }, 72 | "discrete_action": { 73 | "type": "subset", 74 | "details": "discrete components cannot work in continuous action space", 75 | "head": "problem", 76 | "problem": [ 77 | "Acrobot-v1", 78 | "AirRaid-v0", 79 | "Alien-v0", 80 | "Assault-v0", 81 | "Breakout-v0", 82 | "CartPole-v0", 83 | "CartPole-v1", 84 | "DevBreakout-v0", 85 | "DevCartPole-v0", 86 | "FlappyBird-v0", 87 | "LunarLander-v2", 88 | "MountainCar-v0", 89 | "MsPacman-v0", 90 | "Pong-v0", 91 | "Qbert-v0", 92 | "Snake-v0", 93 | "SpaceInvader-v0", 94 | "TestPassCartPole-v0" 95 | ], 96 | "Agent": [ 97 | "ConvDQN", 98 | "DeepExpectedSarsa", 99 | "DeepSarsa", 100 | "DoubleConvDQN", 101 | "DoubleDQN", 102 | "DQN", 103 | "Dummy", 104 | "FreezeDQN", 105 | "OffPolicySarsa", 106 | "QTable" 107 | ], 108 | "Policy": [ 109 | "BoltzmannPolicy", 110 | "DecayingEpsilonGreedyPolicy", 111 | "DoubleDQNBoltzmannPolicy", 112 | "DoubleDQNEpsilonGreedyPolicy", 113 | "EpsilonGreedyPolicy", 114 | "OscillatingEpsilonGreedyPolicy", 115 | "TargetedEpsilonGreedyPolicy" 116 | ] 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /rl/spec/dev_experiment_specs.json: -------------------------------------------------------------------------------- 1 | { 2 | "dummy": { 3 | "problem": "CartPole-v0", 4 | "Agent": "Dummy", 5 | "HyperOptimizer": "GridSearch", 6 | "Memory": "LinearMemory", 7 | "Optimizer": "SGDOptimizer", 8 | "Policy": "EpsilonGreedyPolicy", 9 | "PreProcessor": "NoPreProcessor", 10 | 
"param": {} 11 | }, 12 | "q_table": { 13 | "problem": "CartPole-v0", 14 | "Agent": "QTable", 15 | "HyperOptimizer": "GridSearch", 16 | "Memory": "LinearMemory", 17 | "Optimizer": "SGDOptimizer", 18 | "Policy": "EpsilonGreedyPolicy", 19 | "PreProcessor": "NoPreProcessor", 20 | "param": { 21 | "lr": 0.01, 22 | "gamma": 0.99, 23 | "exploration_anneal_episodes": 100 24 | } 25 | }, 26 | "test_dqn_pass": { 27 | "problem": "TestPassCartPole-v0", 28 | "Agent": "DQN", 29 | "HyperOptimizer": "GridSearch", 30 | "Memory": "LinearMemoryWithForgetting", 31 | "Optimizer": "AdamOptimizer", 32 | "Policy": "BoltzmannPolicy", 33 | "PreProcessor": "NoPreProcessor", 34 | "param": { 35 | "lr": 0.01, 36 | "decay": 0.0, 37 | "gamma": 0.99, 38 | "hidden_layers": [16], 39 | "hidden_layers_activation": "sigmoid", 40 | "exploration_anneal_episodes": 10 41 | }, 42 | "param_range": { 43 | "lr": [0.0001, 0.0005], 44 | "gamma": [0.97, 0.99] 45 | } 46 | }, 47 | "test_dqn_grid_search": { 48 | "problem": "DevCartPole-v0", 49 | "Agent": "DQN", 50 | "HyperOptimizer": "GridSearch", 51 | "Memory": "LinearMemoryWithForgetting", 52 | "Optimizer": "AdamOptimizer", 53 | "Policy": "BoltzmannPolicy", 54 | "PreProcessor": "NoPreProcessor", 55 | "param": { 56 | "lr": 0.01, 57 | "decay": 0.0, 58 | "gamma": 0.99, 59 | "hidden_layers": [16], 60 | "hidden_layers_activation": "sigmoid", 61 | "exploration_anneal_episodes": 10 62 | }, 63 | "param_range": { 64 | "lr": [0.0001, 0.0005], 65 | "gamma": [0.97, 0.99] 66 | } 67 | }, 68 | "test_dqn_random_search": { 69 | "problem": "DevCartPole-v0", 70 | "Agent": "DQN", 71 | "HyperOptimizer": "RandomSearch", 72 | "Memory": "LinearMemoryWithForgetting", 73 | "Optimizer": "AdamOptimizer", 74 | "Policy": "BoltzmannPolicy", 75 | "PreProcessor": "NoPreProcessor", 76 | "param": { 77 | "max_evals": 3, 78 | "lr": 0.01, 79 | "decay": 0.0, 80 | "gamma": 0.99, 81 | "hidden_layers": [16], 82 | "hidden_layers_activation": "sigmoid", 83 | "exploration_anneal_episodes": 10 84 | }, 85 | "param_range": { 86 | "lr": { 87 | "min": 0.0001, 88 | "max": 0.005 89 | }, 90 | "gamma": { 91 | "min": 0.90, 92 | "max": 0.999 93 | } 94 | } 95 | }, 96 | "dev_dqn": { 97 | "problem": "DevCartPole-v0", 98 | "Agent": "DQN", 99 | "HyperOptimizer": "GridSearch", 100 | "Memory": "PrioritizedExperienceReplay", 101 | "Optimizer": "AdamOptimizer", 102 | "Policy": "BoltzmannPolicy", 103 | "PreProcessor": "NoPreProcessor", 104 | "param": { 105 | "lr": 0.01, 106 | "decay": 0.0, 107 | "gamma": 0.99, 108 | "n_epoch": 1, 109 | "hidden_layers": [32], 110 | "hidden_layers_activation": "sigmoid", 111 | "exploration_anneal_episodes": 10, 112 | "auto_architecture": false, 113 | "num_hidden_layers": 3, 114 | "first_hidden_layer_size": 512, 115 | "e": 0.01, 116 | "alpha": 0.6, 117 | "max_mem_len": 7 118 | }, 119 | "param_range": { 120 | "gamma": [0.97, 0.99], 121 | "lr": [0.01, 0.1] 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /rl/spec/problems.json: -------------------------------------------------------------------------------- 1 | { 2 | "DevCartPole-v0": { 3 | "GYM_ENV_NAME": "CartPole-v0", 4 | "SOLVED_MEAN_REWARD": 195.0, 5 | "MAX_EPISODES": 4, 6 | "REWARD_MEAN_LEN": 100 7 | }, 8 | "TestPassCartPole-v0": { 9 | "GYM_ENV_NAME": "CartPole-v0", 10 | "SOLVED_MEAN_REWARD": 50.0, 11 | "MAX_EPISODES": 20, 12 | "REWARD_MEAN_LEN": 100 13 | }, 14 | "CartPole-v0": { 15 | "GYM_ENV_NAME": "CartPole-v0", 16 | "SOLVED_MEAN_REWARD": 195.0, 17 | "MAX_EPISODES": 250, 18 | "REWARD_MEAN_LEN": 100 19 | }, 
20 | "CartPole-v1": { 21 | "GYM_ENV_NAME": "CartPole-v1", 22 | "SOLVED_MEAN_REWARD": 475.0, 23 | "MAX_EPISODES": 500, 24 | "REWARD_MEAN_LEN": 100 25 | }, 26 | "Acrobot-v1": { 27 | "GYM_ENV_NAME": "Acrobot-v1", 28 | "SOLVED_MEAN_REWARD": null, 29 | "MAX_EPISODES": 600, 30 | "REWARD_MEAN_LEN": 100 31 | }, 32 | "MountainCar-v0": { 33 | "GYM_ENV_NAME": "MountainCar-v0", 34 | "SOLVED_MEAN_REWARD": -110.0, 35 | "MAX_EPISODES": 1400, 36 | "REWARD_MEAN_LEN": 100 37 | }, 38 | "MountainCarContinuous-v0": { 39 | "GYM_ENV_NAME": "MountainCarContinuous-v0", 40 | "SOLVED_MEAN_REWARD": 90.0, 41 | "MAX_EPISODES": 5000, 42 | "REWARD_MEAN_LEN": 100 43 | }, 44 | "Pendulum-v0": { 45 | "GYM_ENV_NAME": "Pendulum-v0", 46 | "SOLVED_MEAN_REWARD": null, 47 | "MAX_EPISODES": 300, 48 | "REWARD_MEAN_LEN": 100 49 | }, 50 | "LunarLander-v2": { 51 | "GYM_ENV_NAME": "LunarLander-v2", 52 | "SOLVED_MEAN_REWARD": 200.0, 53 | "MAX_EPISODES": 600, 54 | "REWARD_MEAN_LEN": 100 55 | }, 56 | "LunarLanderContinuous-v2": { 57 | "GYM_ENV_NAME": "LunarLanderContinuous-v2", 58 | "SOLVED_MEAN_REWARD": 200.0, 59 | "MAX_EPISODES": 800, 60 | "REWARD_MEAN_LEN": 100 61 | }, 62 | "BipedalWalker-v2": { 63 | "GYM_ENV_NAME": "BipedalWalker-v2", 64 | "SOLVED_MEAN_REWARD": 300.0, 65 | "MAX_EPISODES": 5000, 66 | "REWARD_MEAN_LEN": 100 67 | }, 68 | "BipedalWalkerHardcore-v2": { 69 | "GYM_ENV_NAME": "BipedalWalkerHardcore-v2", 70 | "SOLVED_MEAN_REWARD": 300.0, 71 | "MAX_EPISODES": 5000, 72 | "REWARD_MEAN_LEN": 100 73 | }, 74 | "CarRacing-v0": { 75 | "GYM_ENV_NAME": "CarRacing-v0", 76 | "SOLVED_MEAN_REWARD": 900.0, 77 | "MAX_EPISODES": 5000, 78 | "REWARD_MEAN_LEN": 100 79 | }, 80 | "AirRaid-v0": { 81 | "GYM_ENV_NAME": "AirRaid-v0", 82 | "SOLVED_MEAN_REWARD": null, 83 | "MAX_EPISODES": 5000, 84 | "REWARD_MEAN_LEN": 100 85 | }, 86 | "Alien-v0": { 87 | "GYM_ENV_NAME": "Alien-v0", 88 | "SOLVED_MEAN_REWARD": null, 89 | "MAX_EPISODES": 5000, 90 | "REWARD_MEAN_LEN": 100 91 | }, 92 | "Assault-v0": { 93 | "GYM_ENV_NAME": "Assault-v0", 94 | "SOLVED_MEAN_REWARD": null, 95 | "MAX_EPISODES": 5000, 96 | "REWARD_MEAN_LEN": 100 97 | }, 98 | "DevBreakout-v0": { 99 | "GYM_ENV_NAME": "Breakout-v0", 100 | "SOLVED_MEAN_REWARD": null, 101 | "MAX_EPISODES": 1, 102 | "REWARD_MEAN_LEN": 100 103 | }, 104 | "Breakout-v0": { 105 | "GYM_ENV_NAME": "Breakout-v0", 106 | "SOLVED_MEAN_REWARD": null, 107 | "MAX_EPISODES": 5000, 108 | "REWARD_MEAN_LEN": 100 109 | }, 110 | "MsPacman-v0": { 111 | "GYM_ENV_NAME": "MsPacman-v0", 112 | "SOLVED_MEAN_REWARD": null, 113 | "MAX_EPISODES": 5000, 114 | "REWARD_MEAN_LEN": 100 115 | }, 116 | "Pong-v0": { 117 | "GYM_ENV_NAME": "Pong-v0", 118 | "SOLVED_MEAN_REWARD": null, 119 | "MAX_EPISODES": 5000, 120 | "REWARD_MEAN_LEN": 100 121 | }, 122 | "Qbert-v0": { 123 | "GYM_ENV_NAME": "Qbert-v0", 124 | "SOLVED_MEAN_REWARD": null, 125 | "MAX_EPISODES": 5000, 126 | "REWARD_MEAN_LEN": 100 127 | }, 128 | "SpaceInvader-v0": { 129 | "GYM_ENV_NAME": "SpaceInvader-v0", 130 | "SOLVED_MEAN_REWARD": null, 131 | "MAX_EPISODES": 5000, 132 | "REWARD_MEAN_LEN": 100 133 | }, 134 | "FlappyBird-v0": { 135 | "GYM_ENV_NAME": "FlappyBird-v0", 136 | "SOLVED_MEAN_REWARD": null, 137 | "MAX_EPISODES": 1000, 138 | "REWARD_MEAN_LEN": 100 139 | }, 140 | "Snake-v0": { 141 | "GYM_ENV_NAME": "Snake-v0", 142 | "SOLVED_MEAN_REWARD": null, 143 | "MAX_EPISODES": 1000, 144 | "REWARD_MEAN_LEN": 100 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /rl/spec/pygame_experiment_specs.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "flappy": { 3 | "problem": "FlappyBird-v0", 4 | "Agent": "ConvDQN", 5 | "HyperOptimizer": "GridSearch", 6 | "Memory": "LinearMemoryWithForgetting", 7 | "Optimizer": "AdamOptimizer", 8 | "Policy": "EpsilonGreedyPolicy", 9 | "PreProcessor": "NoPreProcessor", 10 | "param": { 11 | "train_per_n_new_exp": 4, 12 | "batch_size": 32, 13 | "lr": 0.001, 14 | "gamma": 0.99, 15 | "hidden_layers": [ 16 | [16, 8, 8, [4, 4]], 17 | [32, 4, 4, [2, 2]] 18 | ], 19 | "hidden_layers_activation": "relu", 20 | "exploration_anneal_episodes": 5000, 21 | "epi_change_lr": 5000 22 | }, 23 | "param_range": { 24 | "lr": [0.001, 0.0001], 25 | "gamma": [0.97, 0.99] 26 | } 27 | }, 28 | "snake": { 29 | "problem": "Snake-v0", 30 | "Agent": "ConvDQN", 31 | "HyperOptimizer": "GridSearch", 32 | "Memory": "LinearMemoryWithForgetting", 33 | "Optimizer": "AdamOptimizer", 34 | "Policy": "EpsilonGreedyPolicy", 35 | "PreProcessor": "NoPreProcessor", 36 | "param": { 37 | "train_per_n_new_exp": 4, 38 | "batch_size": 32, 39 | "lr": 0.001, 40 | "gamma": 0.99, 41 | "hidden_layers": [ 42 | [16, 8, 8, [4, 4]], 43 | [32, 4, 4, [2, 2]] 44 | ], 45 | "hidden_layers_activation": "relu", 46 | "exploration_anneal_episodes": 5000, 47 | "epi_change_lr": 5000 48 | }, 49 | "param_range": { 50 | "lr": [0.001, 0.0001], 51 | "gamma": [0.97, 0.99] 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /rl/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import inspect 4 | import json 5 | import logging 6 | import multiprocessing as mp 7 | import numpy as np 8 | import re 9 | import sys 10 | import zipfile 11 | from datetime import datetime, timedelta 12 | from os import path, listdir, environ, getpid 13 | from textwrap import wrap 14 | 15 | PARALLEL_PROCESS_NUM = mp.cpu_count() 16 | TIMESTAMP_REGEX = r'(\d{4}_\d{2}_\d{2}_\d{6})' 17 | SPEC_PATH = path.join(path.dirname(__file__), 'spec') 18 | COMPONENT_LOCKS = json.loads( 19 | open(path.join(SPEC_PATH, 'component_locks.json')).read()) 20 | LOCK_HEAD_REST_SIG = { 21 | # signature list of [head, rest] in component lock 22 | 'mutex': [[0, 0], [1, 1]], 23 | 'subset': [[0, 0], [1, 0], [1, 1]], 24 | } 25 | 26 | 27 | # parse_args to add flag 28 | parser = argparse.ArgumentParser(description='Set flags for functions') 29 | parser.add_argument("-b", "--blind", 30 | help="dont render graphics", 31 | action="store_const", 32 | dest="render", 33 | const=False, 34 | default=True) 35 | parser.add_argument("-d", "--debug", 36 | help="activate debug log", 37 | action="store_const", 38 | dest="loglevel", 39 | const=logging.DEBUG, 40 | default=logging.INFO) 41 | parser.add_argument("-e", "--experiment", 42 | help="specify experiment to run", 43 | action="store", 44 | type=str, 45 | nargs='?', 46 | dest="experiment", 47 | default="dev_dqn") 48 | parser.add_argument("-p", "--param_selection", 49 | help="run parameter selection if present", 50 | action="store_true", 51 | dest="param_selection", 52 | default=False) 53 | parser.add_argument("-q", "--quiet", 54 | help="change log to warning level", 55 | action="store_const", 56 | dest="loglevel", 57 | const=logging.WARNING, 58 | default=logging.INFO) 59 | parser.add_argument("-t", "--times", 60 | help="number of times session is run", 61 | action="store", 62 | nargs='?', 63 | type=int, 64 | dest="times", 65 | default=1) 66 | parser.add_argument("-x", 
"--max_episodes", 67 | help="manually set environment max episodes", 68 | action="store", 69 | nargs='?', 70 | type=int, 71 | dest="max_epis", 72 | default=-1) 73 | args = parser.parse_args([]) if environ.get('CI') else parser.parse_args() 74 | 75 | # Goddam python logger 76 | logger = logging.getLogger(__name__) 77 | handler = logging.StreamHandler(sys.stdout) 78 | handler.setFormatter( 79 | logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')) 80 | logger.setLevel(args.loglevel) 81 | logger.addHandler(handler) 82 | logger.propagate = False 83 | environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # mute tf warnings on optimized setup 84 | 85 | 86 | def check_equal(iterator): 87 | '''check if list contains all the same elements''' 88 | iterator = iter(iterator) 89 | try: 90 | first = next(iterator) 91 | except StopIteration: 92 | return True 93 | return all(first == rest for rest in iterator) 94 | 95 | 96 | def check_lock(lock_name, lock, experiment_spec): 97 | ''' 98 | refer to rl/spec/component_locks.json 99 | check a spec's component lock using binary signatures 100 | e.g. head = problem (discrete) 101 | rest = [Agent, Policy] (to be discrete too) 102 | first check if rest all has the same signature, i.e. same set 103 | then check pair [bin_head, bin_rest] in valid_lock_sig_list 104 | as specified by the lock's type 105 | ''' 106 | lock_type = lock['type'] 107 | valid_lock_sig_list = LOCK_HEAD_REST_SIG[lock_type] 108 | lock_head = lock['head'] 109 | bin_head = (experiment_spec[lock_head] in lock[lock_head]) 110 | bin_rest_list = [] 111 | for k, v_list in lock.items(): 112 | if k in experiment_spec and k != lock_head: 113 | bin_rest_list.append(experiment_spec[k] in v_list) 114 | # rest must all have the same signature 115 | rest_equal = check_equal(bin_rest_list) 116 | if not rest_equal: 117 | logger.warn( 118 | 'All components need to be of the same set, ' 119 | 'check component lock "{}" and your spec "{}"'.format( 120 | lock_name, experiment_spec['experiment_name'])) 121 | 122 | bin_rest = bin_rest_list[0] 123 | lock_sig = [bin_head, bin_rest] 124 | lock_valid = lock_sig in valid_lock_sig_list 125 | if not lock_valid: 126 | logger.warn( 127 | 'Component lock violated: "{}", spec: "{}"'.format( 128 | lock_name, experiment_spec['experiment_name'])) 129 | return lock_valid 130 | 131 | 132 | def check_component_locks(experiment_spec): 133 | ''' 134 | check the spec components for all locks 135 | to ensure no lock is violated 136 | refer to rl/spec/component_locks.json 137 | ''' 138 | for lock_name, lock in COMPONENT_LOCKS.items(): 139 | check_lock(lock_name, lock, experiment_spec) 140 | return 141 | 142 | 143 | # import and safeguard the PROBLEMS, EXPERIMENT_SPECS with checks 144 | def import_guard_asset(): 145 | PROBLEMS = json.loads(open(path.join(SPEC_PATH, 'problems.json')).read()) 146 | EXPERIMENT_SPECS = {} 147 | spec_files = [spec_json for spec_json in listdir( 148 | SPEC_PATH) if spec_json.endswith('experiment_specs.json')] 149 | for filename in spec_files: 150 | specs = json.loads(open(path.join(SPEC_PATH, filename)).read()) 151 | EXPERIMENT_SPECS.update(specs) 152 | 153 | REQUIRED_PROBLEM_KEYS = [ 154 | 'GYM_ENV_NAME', 'SOLVED_MEAN_REWARD', 155 | 'MAX_EPISODES', 'REWARD_MEAN_LEN'] 156 | REQUIRED_SPEC_KEYS = [ 157 | 'problem', 'Agent', 'HyperOptimizer', 158 | 'Memory', 'Optimizer', 'Policy', 'PreProcessor', 'param'] 159 | 160 | for problem_name, problem in PROBLEMS.items(): 161 | assert all(k in problem for k in REQUIRED_PROBLEM_KEYS), \ 162 | '{} needs all 
REQUIRED_PROBLEM_KEYS'.format( 163 | problem_name) 164 | 165 | for experiment_name, spec in EXPERIMENT_SPECS.items(): 166 | assert all(k in spec for k in REQUIRED_SPEC_KEYS), \ 167 | '{} needs all REQUIRED_SPEC_KEYS'.format(experiment_name) 168 | EXPERIMENT_SPECS[experiment_name]['experiment_name'] = experiment_name 169 | check_component_locks(spec) # check component_locks.json 170 | if 'param_range' not in EXPERIMENT_SPECS[experiment_name]: 171 | continue 172 | 173 | param_range = EXPERIMENT_SPECS[experiment_name]['param_range'] 174 | for param_key, param_val in param_range.items(): 175 | if isinstance(param_val, list): 176 | param_range[param_key] = sorted(param_val) 177 | elif isinstance(param_val, dict): 178 | pass 179 | else: 180 | assert False, \ 181 | 'param_range value must be list or dict: {}.{}:{}'.format( 182 | experiment_name, param_key, param_val) 183 | 184 | EXPERIMENT_SPECS[experiment_name]['param_range'] = param_range 185 | return PROBLEMS, EXPERIMENT_SPECS 186 | 187 | PROBLEMS, EXPERIMENT_SPECS = import_guard_asset() 188 | 189 | 190 | def log_self(subject): 191 | max_info_len = 300 192 | info = '{}, param: {}'.format( 193 | subject.__class__.__name__, 194 | to_json(subject.__dict__)) 195 | trunc_info = ( 196 | info[:max_info_len] + '...' if len(info) > max_info_len else info) 197 | logger.debug(trunc_info) 198 | 199 | 200 | def wrap_text(text): 201 | return '\n'.join(wrap(text, 60)) 202 | 203 | 204 | def make_line(line='-'): 205 | if environ.get('CI'): 206 | return 207 | columns = 80 208 | line_str = line*int(columns) 209 | return line_str 210 | 211 | 212 | def log_delimiter(msg, line='-'): 213 | delim_msg = '''\n{0}\n{1}\n{0}\n\n'''.format( 214 | make_line(line), msg) 215 | logger.info(delim_msg) 216 | 217 | 218 | def log_trial_delimiter(trial, action): 219 | log_delimiter('{} Trial #{}/{} on PID {}:\n{}'.format( 220 | action, trial.trial_num, trial.num_of_trials, 221 | getpid(), trial.trial_id), '=') 222 | 223 | 224 | def log_session_delimiter(sess, action): 225 | log_delimiter( 226 | '{} Session #{}/{} of Trial #{}/{} on PID {}:\n{}'.format( 227 | action, sess.session_num, sess.num_of_sessions, 228 | sess.trial.trial_num, sess.trial.num_of_trials, 229 | getpid(), sess.session_id)) 230 | 231 | 232 | def timestamp(): 233 | '''timestamp used for filename''' 234 | timestamp_str = '{:%Y_%m_%d_%H%M%S}'.format(datetime.now()) 235 | assert re.search(TIMESTAMP_REGEX, timestamp_str) 236 | return timestamp_str 237 | 238 | 239 | def timestamp_elapse(s1, s2): 240 | '''calculate the time elapsed between timestamps from s1 to s2''' 241 | FMT = '%Y_%m_%d_%H%M%S' 242 | delta_t = datetime.strptime(s2, FMT) - datetime.strptime(s1, FMT) 243 | return str(delta_t) 244 | 245 | 246 | def timestamp_elapse_to_seconds(s1): 247 | a = datetime.strptime(s1, '%H:%M:%S') 248 | secs = timedelta(hours=a.hour, minutes=a.minute, seconds=a.second).seconds 249 | return secs 250 | 251 | 252 | # own custom sorted json serializer, cuz python 253 | def to_json(o, level=0): 254 | INDENT = 2 255 | SPACE = " " 256 | NEWLINE = "\n" 257 | ret = "" 258 | if isinstance(o, dict): 259 | ret += "{" + NEWLINE 260 | comma = "" 261 | for k in sorted(o.keys()): 262 | v = o[k] 263 | ret += comma 264 | comma = ",\n" 265 | ret += SPACE * INDENT * (level+1) 266 | ret += '"' + str(k) + '":' + SPACE 267 | ret += to_json(v, level + 1) 268 | 269 | ret += NEWLINE + SPACE * INDENT * level + "}" 270 | elif isinstance(o, str): 271 | ret += '"' + o + '"' 272 | elif isinstance(o, list) or isinstance(o, tuple): 273 | ret += "[" + 
",".join([to_json(e, level+1) for e in o]) + "]" 274 | elif isinstance(o, bool): 275 | ret += "true" if o else "false" 276 | elif isinstance(o, int): 277 | ret += str(o) 278 | elif isinstance(o, float): 279 | ret += '%.7g' % o 280 | elif isinstance(o, np.ndarray) and np.issubdtype(o.dtype, np.integer): 281 | ret += "[" + ','.join(map(str, o.flatten().tolist())) + "]" 282 | elif isinstance(o, np.ndarray) and np.issubdtype(o.dtype, np.inexact): 283 | ret += "[" + \ 284 | ','.join(map(lambda x: '%.7g' % x, o.flatten().tolist())) + "]" 285 | elif o is None: 286 | ret += 'null' 287 | elif hasattr(o, '__class__'): 288 | ret += '"' + o.__class__.__name__ + '"' 289 | else: 290 | raise TypeError( 291 | "Unknown type '%s' for json serialization" % str(type(o))) 292 | return ret 293 | 294 | 295 | # format object and its properties into printable dict 296 | def format_obj_dict(obj, keys): 297 | if isinstance(obj, dict): 298 | return to_json( 299 | {k: obj.get(k) for k in keys if obj.get(k) is not None}) 300 | else: 301 | return to_json( 302 | {k: getattr(obj, k, None) for k in keys 303 | if getattr(obj, k, None) is not None}) 304 | 305 | 306 | # cast dict to have flat values (int, float, str) 307 | def flat_cast_dict(d): 308 | for k in d: 309 | v = d[k] 310 | if not isinstance(v, (int, float)): 311 | d[k] = str(v) 312 | return d 313 | 314 | 315 | def flatten_dict(d, parent_key='', sep='_'): 316 | items = [] 317 | for k, v in d.items(): 318 | new_key = parent_key + sep + k if parent_key else k 319 | if isinstance(v, collections.MutableMapping): 320 | items.extend(flatten_dict(v, new_key, sep=sep).items()) 321 | else: 322 | items.append((new_key, v)) 323 | return dict(items) 324 | 325 | 326 | def get_module(GREF, dot_path): 327 | # get module from globals() by string dot_path 328 | path_arr = dot_path.split('.') 329 | # base level from globals 330 | mod = GREF.get(path_arr.pop(0)) 331 | for deeper_path in path_arr: 332 | mod = getattr(mod, deeper_path) 333 | return mod 334 | 335 | 336 | def import_package_files(globals_, locals_, __file__): 337 | ''' 338 | Dynamically import all the public attributes of the python modules in this 339 | file's directory (the package directory) and return a list of their names. 
340 | ''' 341 | exports = [] 342 | # globals_, locals_ = globals(), locals() 343 | package_path = path.dirname(__file__) 344 | package_name = path.basename(package_path) 345 | 346 | for filename in listdir(package_path): 347 | modulename, ext = path.splitext(filename) 348 | if modulename[0] != '_' and ext in ('.py', '.pyw'): 349 | subpackage = '{}.{}'.format( 350 | package_name, modulename) # pkg relative 351 | module = __import__(subpackage, globals_, locals_, [modulename]) 352 | modict = module.__dict__ 353 | names = (modict['__all__'] if '__all__' in modict else 354 | [name for name in 355 | modict if inspect.isclass(modict[name])]) # all public 356 | exports.extend(names) 357 | globals_.update((name, modict[name]) for name in names) 358 | 359 | return exports 360 | 361 | 362 | def clean_id_str(id_str): 363 | return id_str.split('/').pop().split('.').pop(0) 364 | 365 | 366 | def parse_trial_id(id_str): 367 | c_id_str = clean_id_str(id_str) 368 | if re.search(TIMESTAMP_REGEX, c_id_str): 369 | name_time_trial = re.split(TIMESTAMP_REGEX, c_id_str) 370 | if len(name_time_trial) == 3: 371 | return c_id_str 372 | else: 373 | return None 374 | else: 375 | return None 376 | 377 | 378 | def parse_experiment_id(id_str): 379 | c_id_str = clean_id_str(id_str) 380 | if re.search(TIMESTAMP_REGEX, c_id_str): 381 | name_time_trial = re.split(TIMESTAMP_REGEX, c_id_str) 382 | name_time_trial.pop() 383 | experiment_id = ''.join(name_time_trial) 384 | return experiment_id 385 | else: 386 | return None 387 | 388 | 389 | def parse_experiment_name(id_str): 390 | c_id_str = clean_id_str(id_str) 391 | experiment_id = parse_experiment_id(c_id_str) 392 | if experiment_id is None: 393 | experiment_name = c_id_str 394 | else: 395 | experiment_name = re.sub(TIMESTAMP_REGEX, '', experiment_id).strip('-') 396 | assert experiment_name in EXPERIMENT_SPECS, \ 397 | '{} not in EXPERIMENT_SPECS'.format(experiment_name) 398 | return experiment_name 399 | 400 | 401 | def load_data_from_trial_id(id_str): 402 | experiment_id = parse_experiment_id(id_str) 403 | trial_id = parse_trial_id(id_str) 404 | data_filename = './data/{}/{}.json'.format(experiment_id, trial_id) 405 | try: 406 | data = json.loads(open(data_filename).read()) 407 | except (FileNotFoundError, json.JSONDecodeError): 408 | data = None 409 | return data 410 | 411 | 412 | def load_data_array_from_experiment_id(id_str): 413 | # to load all ./data files for a series of trials 414 | experiment_id = parse_experiment_id(id_str) 415 | data_path = './data/{}'.format(experiment_id) 416 | trial_id_array = [ 417 | f for f in listdir(data_path) 418 | if (path.isfile(path.join(data_path, f)) and 419 | f.startswith(experiment_id) and 420 | f.endswith('.json')) 421 | ] 422 | return list(filter(None, [load_data_from_trial_id(trial_id) 423 | for trial_id in trial_id_array])) 424 | 425 | 426 | def save_experiment_data(data_df, trial_id): 427 | experiment_id = parse_experiment_id(trial_id) 428 | filedir = './data/{0}'.format(experiment_id) 429 | filename = '{0}_analysis_data.csv'.format(experiment_id) 430 | filepath = '{}/{}'.format(filedir, filename) 431 | data_df.round(6).to_csv(filepath, index=False) 432 | 433 | # zip the csv and best trial json for upload to PR 434 | zipfile.ZipFile(filepath+'.zip', mode='w').write( 435 | filepath, arcname=filename) 436 | trial_filename = data_df.loc[0, 'trial_id'] + '.json' 437 | trial_filepath = '{}/{}'.format(filedir, trial_filename) 438 | zipfile.ZipFile(trial_filepath+'.zip', mode='w').write( 439 | trial_filepath, arcname=trial_filename) 
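# --- illustrative aside, not part of the original module ---
# For a hypothetical trial id string 'dqn-2017_01_01_123456_t0', the id helpers
# above resolve as follows:
#   parse_experiment_id('dqn-2017_01_01_123456_t0')   -> 'dqn-2017_01_01_123456'
#   parse_trial_id('dqn-2017_01_01_123456_t0')        -> 'dqn-2017_01_01_123456_t0'
#   parse_experiment_name('dqn-2017_01_01_123456_t0') -> 'dqn'  (asserted to be in EXPERIMENT_SPECS)
# load_data_from_trial_id then resolves './data/<experiment_id>/<trial_id>.json',
# and save_experiment_data (above) writes, under ./data/dqn-2017_01_01_123456/,
# the <experiment_id>_analysis_data.csv plus its .zip, and a .zip of the first
# (best) trial's .json for upload to a PR.
# --- end aside ---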
440 | 441 | logger.info( 442 | 'experiment data saved to {}'.format(filepath)) 443 | 444 | 445 | def configure_hardware(RAND_SEED): 446 | '''configure rand seed, GPU''' 447 | from keras import backend as K 448 | if K.backend() == 'tensorflow': 449 | K.tf.set_random_seed(RAND_SEED) 450 | else: 451 | K.theano.tensor.shared_randomstreams.RandomStreams(seed=RAND_SEED) 452 | 453 | if K.backend() != 'tensorflow': 454 | # GPU config for tf only 455 | return 456 | 457 | process_num = PARALLEL_PROCESS_NUM if args.param_selection else 1 458 | tf = K.tf 459 | gpu_options = tf.GPUOptions( 460 | allow_growth=True, 461 | per_process_gpu_memory_fraction=1./float(process_num)) 462 | config = tf.ConfigProto( 463 | gpu_options=gpu_options, 464 | allow_soft_placement=True) 465 | sess = tf.Session(config=config) 466 | K.set_session(sess) 467 | return sess 468 | 469 | 470 | def debug_mem_usage(): 471 | import psutil 472 | from mem_top import mem_top 473 | pid = getpid() 474 | logger.debug( 475 | 'MEM USAGE for PID {}, MEM_INFO: {}\n{}'.format( 476 | pid, psutil.Process().memory_info(), mem_top())) 477 | 478 | 479 | def del_self_attr(subject): 480 | self_attrs = list(subject.__dict__.keys()) 481 | for attr in self_attrs: 482 | delattr(subject, attr) 483 | import gc 484 | gc.collect() 485 | 486 | 487 | # clone a keras model without file I/O 488 | def clone_model(model, custom_objects=None): 489 | from keras.models import model_from_config 490 | custom_objects = custom_objects or {} 491 | config = { 492 | 'class_name': model.__class__.__name__, 493 | 'config': model.get_config(), 494 | } 495 | clone = model_from_config(config, custom_objects=custom_objects) 496 | clone.set_weights(model.get_weights()) 497 | return clone 498 | 499 | 500 | # clone a keras optimizer without file I/O 501 | def clone_optimizer(optimizer): 502 | from keras.optimizers import optimizer_from_config 503 | if isinstance(optimizer, str): 504 | return get(optimizer) 505 | params = dict([(k, v) for k, v in optimizer.get_config().items()]) 506 | config = { 507 | 'class_name': optimizer.__class__.__name__, 508 | 'config': params, 509 | } 510 | clone = optimizer_from_config(config) 511 | return clone 512 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup 4 | from setuptools.command.test import test as TestCommand 5 | 6 | 7 | # explicitly config 8 | test_args = [ 9 | '-n 4', 10 | '--cov-report=term', 11 | '--cov-report=html', 12 | '--cov=rl', 13 | 'test' 14 | ] 15 | 16 | 17 | class PyTest(TestCommand): 18 | user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] 19 | 20 | def initialize_options(self): 21 | TestCommand.initialize_options(self) 22 | self.pytest_args = test_args 23 | 24 | def run_tests(self): 25 | # import here, cause outside the eggs aren't loaded 26 | import pytest 27 | errno = pytest.main(self.pytest_args) 28 | sys.exit(errno) 29 | 30 | 31 | # Utility function to read the README file. 
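# (illustrative aside) the PyTest command class above wires `python setup.py test`
# to py.test with test_args: 4 parallel workers (-n, a pytest-xdist flag),
# terminal + HTML coverage reports scoped to the rl package, run against test/.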
32 | def read(fname): 33 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 34 | 35 | 36 | # the setup 37 | setup( 38 | name='openai_lab', 39 | version='1.0.0', 40 | description='An experimentation system for Reinforcement Learning using OpenAI and Keras', 41 | long_description=read('README.md'), 42 | keywords='openai gym', 43 | url='https://github.com/kengz/openai_lab', 44 | author='kengz,lgraesser', 45 | author_email='kengzwl@gmail.com', 46 | license='MIT', 47 | packages=[], 48 | zip_safe=False, 49 | include_package_data=True, 50 | install_requires=[], 51 | dependency_links=[], 52 | extras_require={ 53 | 'dev': [], 54 | 'docs': [], 55 | 'testing': [] 56 | }, 57 | classifiers=[], 58 | tests_require=['pytest', 'pytest-cov'], 59 | test_suite='test', 60 | cmdclass={'test': PyTest} 61 | ) 62 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kengz/openai_lab/d0669d89268f2dc01c1cf878e4879775c7b6eb3c/test/__init__.py -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import rl 3 | from os import environ 4 | 5 | environ['CI'] = environ.get('CI') or 'true' 6 | 7 | 8 | def pytest_runtest_setup(item): 9 | for problem in rl.util.PROBLEMS: 10 | if problem == 'TestPassCartPole-v0': 11 | pass 12 | else: 13 | rl.util.PROBLEMS[problem]['MAX_EPISODES'] = 3 14 | -------------------------------------------------------------------------------- /test/test_atari.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pytest 3 | from os import environ 4 | from rl.experiment import run 5 | from . import conftest 6 | import pandas as pd 7 | 8 | 9 | class AtariTest(unittest.TestCase): 10 | 11 | @unittest.skipIf(environ.get('CI'), "Delay CI test until dev stable") 12 | @classmethod 13 | def test_breakout_dqn(cls): 14 | data_df = run('breakout_dqn') 15 | assert isinstance(data_df, pd.DataFrame) 16 | 17 | @unittest.skipIf(environ.get('CI'), "Delay CI test until dev stable") 18 | @classmethod 19 | def test_breakout_double_dqn(cls): 20 | data_df = run('breakout_double_dqn') 21 | assert isinstance(data_df, pd.DataFrame) 22 | -------------------------------------------------------------------------------- /test/test_box2d.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pytest 3 | from os import environ 4 | from rl.experiment import run 5 | from . 
import conftest 6 | import pandas as pd 7 | 8 | 9 | class Box2DTest(unittest.TestCase): 10 | 11 | @classmethod 12 | def test_lunar_dqn(cls): 13 | data_df = run('lunar_dqn') 14 | assert isinstance(data_df, pd.DataFrame) 15 | 16 | @classmethod 17 | def test_lunar_double_dqn(cls): 18 | data_df = run('lunar_double_dqn') 19 | assert isinstance(data_df, pd.DataFrame) 20 | 21 | @classmethod 22 | def test_lunar_freeze(cls): 23 | data_df = run('lunar_freeze') 24 | assert isinstance(data_df, pd.DataFrame) 25 | 26 | @classmethod 27 | def test_walker_ddpg_linearnoise(cls): 28 | data_df = run('walker_ddpg_linearnoise') 29 | assert isinstance(data_df, pd.DataFrame) 30 | -------------------------------------------------------------------------------- /test/test_classic.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pytest 3 | from os import environ 4 | from rl.experiment import run 5 | from . import conftest 6 | import pandas as pd 7 | 8 | 9 | class ClassicTest(unittest.TestCase): 10 | 11 | @classmethod 12 | def test_quickstart_dqn(cls): 13 | data_df = run('quickstart_dqn') 14 | assert isinstance(data_df, pd.DataFrame) 15 | 16 | @classmethod 17 | def test_dqn_epsilon(cls): 18 | data_df = run('dqn_epsilon') 19 | assert isinstance(data_df, pd.DataFrame) 20 | 21 | @classmethod 22 | def test_dqn(cls): 23 | data_df = run('dqn') 24 | assert isinstance(data_df, pd.DataFrame) 25 | 26 | @classmethod 27 | def test_dqn_per(cls): 28 | data_df = run('dqn_per') 29 | assert isinstance(data_df, pd.DataFrame) 30 | 31 | @classmethod 32 | def test_double_dqn(cls): 33 | data_df = run('double_dqn') 34 | assert isinstance(data_df, pd.DataFrame) 35 | 36 | @classmethod 37 | def test_sarsa(cls): 38 | data_df = run('sarsa') 39 | assert isinstance(data_df, pd.DataFrame) 40 | 41 | @classmethod 42 | def test_exp_sarsa(cls): 43 | data_df = run('exp_sarsa') 44 | assert isinstance(data_df, pd.DataFrame) 45 | 46 | @classmethod 47 | def test_offpol_sarsa(cls): 48 | data_df = run('offpol_sarsa') 49 | assert isinstance(data_df, pd.DataFrame) 50 | 51 | @classmethod 52 | def test_cartpole_ac_argmax(cls): 53 | data_df = run('cartpole_ac_argmax') 54 | assert isinstance(data_df, pd.DataFrame) 55 | 56 | @classmethod 57 | def test_dqn_v1(cls): 58 | data_df = run('dqn_v1') 59 | assert isinstance(data_df, pd.DataFrame) 60 | 61 | @classmethod 62 | def test_acrobot(cls): 63 | data_df = run('acrobot') 64 | assert isinstance(data_df, pd.DataFrame) 65 | 66 | @classmethod 67 | def test_pendulum_ddpg_linearnoise(cls): 68 | data_df = run('pendulum_ddpg_linearnoise') 69 | assert isinstance(data_df, pd.DataFrame) 70 | 71 | @classmethod 72 | def test_mountain_dqn(cls): 73 | data_df = run('mountain_dqn') 74 | assert isinstance(data_df, pd.DataFrame) 75 | -------------------------------------------------------------------------------- /test/test_dev.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pytest 3 | from os import environ 4 | from rl.experiment import run 5 | from . 
import conftest 6 | import pandas as pd 7 | 8 | 9 | class DevTest(unittest.TestCase): 10 | 11 | @classmethod 12 | def test_clean_import(cls): 13 | print(dir()) 14 | assert 'keras' not in dir( 15 | ), 'keras import should be contained within classes' 16 | assert 'matplotlib' not in dir( 17 | ), 'matplotlib import should be contained within classes' 18 | 19 | @classmethod 20 | def test_gym_tour(cls): 21 | data_df = run('dummy') 22 | assert isinstance(data_df, pd.DataFrame) 23 | 24 | @classmethod 25 | def test_q_table(cls): 26 | data_df = run('q_table') 27 | assert isinstance(data_df, pd.DataFrame) 28 | 29 | @unittest.skipIf(environ.get('CI'), 30 | "Causing build to crash since it's unstable.") 31 | @classmethod 32 | def test_dqn_pass(cls): 33 | data_df = run('test_dqn_pass') 34 | max_total_rewards = data_df['max_total_rewards_stats_mean'][0] 35 | print(max_total_rewards) 36 | assert max_total_rewards > 50, 'dqn failed to hit max_total_rewards' 37 | 38 | # TODO running this grid search together with the other tests hangs the suite; re-enable once stable 39 | # @classmethod 40 | # def test_dqn_grid_search(cls): 41 | # data_df = run('test_dqn_grid_search', param_selection=True) 42 | # assert isinstance(data_df, pd.DataFrame) 43 | 44 | # TODO running this random search together with the other tests hangs the suite; re-enable once stable 45 | # @classmethod 46 | # def test_dqn_random_search(cls): 47 | # data_df = run('test_dqn_random_search', param_selection=True) 48 | # assert isinstance(data_df, pd.DataFrame) 49 | --------------------------------------------------------------------------------
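Taken together, the tests above exercise the lab's single entry point: rl.experiment.run(experiment_name) resolves a spec from rl/spec/*_experiment_specs.json, runs it, and returns a pandas DataFrame of analysis data. A minimal usage sketch, assuming the dependencies from environment.yml/requirements.txt are installed (the flags parsed in rl/util.py, such as -e to pick an experiment, -t for repeat sessions, -b to skip rendering and -p for parameter selection, presumably feed main.py for command-line runs):

from rl.experiment import run

# run the 'dqn' experiment spec once, as test_classic.py does
data_df = run('dqn')
print(data_df.shape)  # analysis stats, e.g. 'max_total_rewards_stats_mean' per test_dev.py

# hyperparameter search over a spec's param_range uses the same entry point:
# run('test_dqn_grid_search', param_selection=True)  # see the commented-out dev tests above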