├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── .snyk ├── Gruntfile.js ├── LICENSE ├── README.md ├── bin ├── copy-config ├── setup ├── setup_macOS └── setup_ubuntu ├── circle.yml ├── config ├── .theanorc ├── example-default.json └── keras.json ├── data └── .gitkeep ├── environment.yml ├── main.py ├── package.json ├── requirements.txt ├── rl ├── __init__.py ├── agent │ ├── __init__.py │ ├── actor_critic.py │ ├── base_agent.py │ ├── conv_dqn.py │ ├── ddpg.py │ ├── deep_exp_sarsa.py │ ├── deep_sarsa.py │ ├── double_conv_dqn.py │ ├── double_dqn.py │ ├── dqn.py │ ├── freeze_dqn.py │ ├── offpol_sarsa.py │ └── q_table.py ├── analytics.py ├── experiment.py ├── hyperoptimizer │ ├── __init__.py │ ├── base_hyperoptimizer.py │ ├── grid_search.py │ ├── line_search.py │ └── random_search.py ├── memory │ ├── __init__.py │ ├── base_memory.py │ ├── linear.py │ ├── prioritized_exp_replay.py │ └── ranked.py ├── model │ └── .gitkeep ├── optimizer │ ├── __init__.py │ ├── adam.py │ ├── base_optimizer.py │ ├── rmsprop.py │ └── sgd.py ├── policy │ ├── __init__.py │ ├── actor_critic.py │ ├── base_policy.py │ ├── boltzmann.py │ ├── epsilon_greedy.py │ └── noise.py ├── preprocessor │ ├── __init__.py │ ├── atari.py │ ├── base_preprocessor.py │ └── linear.py ├── spec │ ├── atari_experiment_specs.json │ ├── box2d_experiment_specs.json │ ├── classic_experiment_specs.json │ ├── component_locks.json │ ├── dev_experiment_specs.json │ ├── problems.json │ └── pygame_experiment_specs.json └── util.py ├── setup.py └── test ├── __init__.py ├── conftest.py ├── test_atari.py ├── test_box2d.py ├── test_classic.py └── test_dev.py /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | _If you're contributing new algorithms and its solutions to OpenAI Lab, please follow below. Otherwise, clear below and concisely describe your commits._ 2 | 3 | ### Solution Submission 4 | 5 | Once accepted, we will add the following to the OpenAI Lab [Best Solutions](http://kengz.me/openai_lab/#problems) and the code. 
6 | 7 | - name the Pull Request title like `Solution: CartPole-v0 with DQN` 8 | - add PR label `solution` 9 | 10 | Then, submit the following: 11 | 12 | - [ ] problem: CartPole-v0 13 | - [ ] algorithm (commit code if new): DQN 14 | - [ ] best `fitness_score`: _the highest_ 15 | - [ ] author: _your name_ 16 | - [ ] commit `experiment_spec`: dqn 17 | - _attach (not commit)_ the experiment files: 18 | - [ ] `_analysis_data.csv` (zip) 19 | - [ ] `.json` (zip) 20 | - [ ] `.png` 21 | - [ ] `_analysis.png` 22 | - [ ] `_analysis_correlation.png` 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | coverage 6 | InstalledFiles 7 | lib/bundler/man 8 | pkg 9 | rdoc 10 | spec/reports 11 | test/tmp 12 | test/version_tmp 13 | tmp 14 | *.DS_STORE 15 | build/ 16 | .cache 17 | .vagrant 18 | .sass-cache 19 | 20 | # YARD artifacts 21 | .yardoc 22 | _yardoc 23 | doc/ 24 | .idea/ 25 | 26 | # Python ignores 27 | __pycache__/ 28 | *.py[cod] 29 | *$py.class 30 | *.egg* 31 | *.manifest 32 | .cache/ 33 | htmlcov/ 34 | .coverage 35 | venv/ 36 | ENV/ 37 | .env/ 38 | openai_lab/ 39 | src/ 40 | 41 | node_modules/ 42 | 43 | .DS_Store 44 | 45 | *checkpoint* 46 | *.tfl* 47 | *.meta 48 | data/ 49 | model/ 50 | *.png 51 | *.txt 52 | *.json 53 | *.csv 54 | *.log 55 | *.h5 56 | *.xml 57 | -------------------------------------------------------------------------------- /.snyk: -------------------------------------------------------------------------------- 1 | # Snyk (https://snyk.io) policy file, patches or ignores known vulnerabilities. 2 | version: v1.7.1 3 | ignore: {} 4 | # patches apply the minimum changes required to fix a vulnerability 5 | patch: 6 | 'npm:debug:20170905': 7 | - grunt-contrib-watch > tiny-lr > debug: 8 | patched: '2017-09-30T01:17:03.873Z' 9 | - grunt-contrib-watch > tiny-lr > body-parser > debug: 10 | patched: '2017-09-30T01:17:03.873Z' 11 | -------------------------------------------------------------------------------- /Gruntfile.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash') 2 | const fs = require('fs') 3 | const resolve = require('resolve-dir') 4 | 5 | 6 | // generic experimentId matcher. index 2: experimentId, 3 or 4: experimentName 7 | const expIdRegex = /(\-e\s+)?(([a-zA-Z0-9_]+)\-\d{4}_\d{2}_\d{2}_\d{6}|([a-zA-Z0-9_]+))/ 8 | const historyPath = './config/history.json' 9 | const finishMsg = ` 10 | =========================================== 11 | Experiments complete. Press Ctrl+C to exit. 12 | =========================================== 13 | ` 14 | 15 | 16 | module.exports = function(grunt) { 17 | process.env.NODE_ENV = grunt.option('prod') ? 
'production' : 'development' 18 | 19 | const config = require('config') 20 | const dataSrc = 'data' 21 | const dataDest = resolve(config.data_sync_destination) 22 | const experiments = config.experiments 23 | const experimentTasks = _.map(experiments, function(name) { 24 | return `shell:experiment:${name}` 25 | }) 26 | 27 | function writeHistory(history) { 28 | grunt.log.ok(`Writing updated lab history ${JSON.stringify(history, null, 2)}`) 29 | fs.writeFileSync(historyPath, JSON.stringify(history, null, 2)) 30 | return history 31 | } 32 | 33 | function readHistory() { 34 | if (grunt.option('resume')) { 35 | try { 36 | return JSON.parse(fs.readFileSync(historyPath, 'utf8')) 37 | } catch (err) { 38 | grunt.log.ok(`No existing ${historyPath} to resume, creating new`) 39 | return writeHistory({}) 40 | } 41 | } else { 42 | return {} 43 | } 44 | } 45 | 46 | let history = readHistory() 47 | 48 | function getExpId(filepath) { 49 | if (!fs.lstatSync(filepath).isFile()) { 50 | // write history on folder being created 51 | return filepath 52 | } else if (_.endsWith(filepath, '.json')) { 53 | // write history on json written (fallback guard) 54 | let expIdPath = _.join(_.initial(filepath.split('_')), '_') 55 | return expIdPath.split('/').pop() 56 | } else { 57 | return false 58 | } 59 | } 60 | 61 | function updateHistory(filepath) { 62 | let expId = getExpId(filepath) 63 | if (!expId) { 64 | return 65 | } 66 | const matchedPath = expId.split('/').pop().match(expIdRegex) 67 | if (matchedPath) { 68 | const experimentId = matchedPath[2] 69 | const experimentName = matchedPath[3] || matchedPath[4] 70 | history[experimentName] = experimentId 71 | writeHistory(history) 72 | } 73 | } 74 | 75 | function remoteCmd() { 76 | return grunt.option('remote') ? 'xvfb-run -a -s "-screen 0 1400x900x24" --' : '' 77 | } 78 | 79 | function bestCmd() { 80 | return grunt.option('best') ? '' : ' -bp' 81 | } 82 | 83 | function debugCmd() { 84 | return grunt.option('debug') ? ' -d' : '' 85 | } 86 | 87 | function quietCmd() { 88 | return grunt.option('quiet') ? ' -q' : '' 89 | } 90 | 91 | function notiCmd(experiment) { 92 | return (grunt.option('prod') && !grunt.option('analyze')) ? `NOTI_SLACK_DEST='${config.NOTI_SLACK_DEST}' NOTI_SLACK_TOK='${config.NOTI_SLACK_TOK}' noti -k -t 'Experiment completed' -m '[${new Date().toISOString()}] ${experiment} on ${process.env.USER}'` : '' 93 | } 94 | 95 | function resumeExperimentStr(eStr) { 96 | const matchedExp = eStr.match(expIdRegex) 97 | if (matchedExp) { 98 | const experimentIdOrName = matchedExp[2] 99 | const experimentName = matchedExp[3] || matchedExp[4] 100 | if (history[experimentName]) { 101 | return eStr.replace(experimentIdOrName, history[experimentName]) 102 | } 103 | } 104 | return eStr 105 | } 106 | 107 | function composeCommand(experimentStr) { 108 | var eStr = experimentStr 109 | if (grunt.option('resume') || grunt.option('analyze')) { 110 | eStr = resumeExperimentStr(eStr) 111 | } 112 | 113 | const envCmd = 'if (conda env list | grep --quiet "openai_lab"); then echo "activating conda"; source activate openai_lab; elif [ -d ./.env ]; then echo "activating virtualenv"; source .env/bin/activate; else echo "using system python"; fi; ' 114 | 115 | // override with custom command if has 'python' 116 | const pyCmd = _.includes(eStr, 'python') ? 
eStr : `python3 main.py${bestCmd()}${debugCmd()}${quietCmd()} -t 5 -e ${eStr}` 117 | const cmd = `${envCmd}${remoteCmd()} ${pyCmd} | tee ./data/terminal.log; ${notiCmd(eStr)}` 118 | grunt.log.ok(`Composed command: ${cmd}`) 119 | return cmd 120 | } 121 | 122 | 123 | require('load-grunt-tasks')(grunt) 124 | 125 | grunt.initConfig({ 126 | sync: { 127 | main: { 128 | files: [{ 129 | cwd: dataSrc, 130 | src: ['**'], 131 | dest: dataDest, 132 | }], 133 | pretend: !grunt.option('prod'), // Don't do real IO 134 | } 135 | }, 136 | 137 | watch: { 138 | data: { 139 | files: `${dataSrc}/**`, 140 | tasks: ['sync'], 141 | options: { 142 | debounceDelay: 20 * 60 * 1000, 143 | interval: 60000, 144 | }, 145 | } 146 | }, 147 | 148 | shell: { 149 | options: { 150 | execOptions: { 151 | killSignal: 'SIGINT', 152 | env: process.env 153 | } 154 | }, 155 | experiment: { 156 | command(experimentStr) { 157 | return composeCommand(experimentStr) 158 | }, 159 | options: { 160 | stdout: true 161 | } 162 | }, 163 | finish: `echo "${finishMsg}"`, 164 | clear: 'rm -rf .cache __pycache__ */__pycache__ *egg-info htmlcov .coverage* *.xml data/**/ data/*.log config/history.json', 165 | }, 166 | 167 | concurrent: { 168 | default: ['watch', ['lab', 'shell:finish']], 169 | options: { 170 | logConcurrentOutput: true 171 | } 172 | }, 173 | }) 174 | 175 | grunt.event.on('watch', function(action, filepath, target) { 176 | updateHistory(filepath) 177 | }) 178 | 179 | grunt.registerTask('lab', 'run all the experiments', experimentTasks) 180 | grunt.registerTask('lab_sync', 'run lab with auto file syncing', ['concurrent:default']) 181 | grunt.registerTask('default', ['lab_sync']) 182 | 183 | grunt.registerTask('analyze', function() { 184 | grunt.option('analyze', true) 185 | grunt.option('resume', true) 186 | grunt.task.run('default') 187 | }) 188 | grunt.registerTask('clear', ['shell:clear']) 189 | } 190 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Wah Loon Keng, Laura Graesser 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenAI Lab [![GitHub release](https://img.shields.io/github/release/kengz/openai_lab.svg)](https://github.com/kengz/openai_lab) [![CircleCI](https://circleci.com/gh/kengz/openai_lab.svg?style=shield)](https://circleci.com/gh/kengz/openai_lab) [![Codacy Badge](https://api.codacy.com/project/badge/Grade/9e55f845b10b4b51b213620bfb98e4b3)](https://www.codacy.com/app/kengzwl/openai_lab?utm_source=github.com&utm_medium=referral&utm_content=kengz/openai_lab&utm_campaign=Badge_Grade) [![Codacy Badge](https://api.codacy.com/project/badge/Coverage/9e55f845b10b4b51b213620bfb98e4b3)](https://www.codacy.com/app/kengzwl/openai_lab?utm_source=github.com&utm_medium=referral&utm_content=kengz/openai_lab&utm_campaign=Badge_Coverage) [![GitHub stars](https://img.shields.io/github/stars/kengz/openai_lab.svg?style=social&label=Star)](https://github.com/kengz/openai_lab) [![GitHub forks](https://img.shields.io/github/forks/kengz/openai_lab.svg?style=social&label=Fork)](https://github.com/kengz/openai_lab) 2 | 3 | --- 4 | 5 |

**NOTICE: Please use the next version, [SLM-Lab](https://github.com/kengz/SLM-Lab).**

6 | 7 | --- 8 | 9 |

[OpenAI Lab Documentation](http://kengz.me/openai_lab/)

10 | 11 | --- 12 | 13 | _An experimentation framework for Reinforcement Learning using OpenAI Gym, Tensorflow, and Keras._ 14 | 15 | _OpenAI Lab_ is created to do Reinforcement Learning (RL) like science - _theorize, experiment_. It provides an easy interface to [OpenAI Gym](https://gym.openai.com/) and [Keras](https://keras.io/), with an automated experimentation and evaluation framework. 16 | 17 | ### Features 18 | 19 | 1. **Unified RL environment and agent interface** using OpenAI Gym, Tensorflow, Keras, so you can focus on developing the algorithms. 20 | 2. **[Core RL algorithms implementations](http://kengz.me/openai_lab/#agents-matrix), with reusable modular components** for developing deep RL algorithms. 21 | 3. **[An experimentation framework](http://kengz.me/openai_lab/#experiments)** for running hundreds of trials of hyperparameter optimizations, with logs, plots and analytics for testing new RL algorithms. Experimental settings are stored in standardized JSONs for reproducibility and comparisons. 22 | 4. **[Automated analytics of the experiments](http://kengz.me/openai_lab/#analysis)** for evaluating the RL agents and environments, and to help pick the best solution. 23 | 5. **The [Fitness Matrix](http://kengz.me/openai_lab/#fitness-matrix)**, a table of the best scores of RL algorithms v.s. the environments; useful for research. 24 | 25 | 26 | With OpenAI Lab, we could focus on researching the essential elements of reinforcement learning such as the algorithm, policy, memory, and parameter tuning. It allows us to build agents efficiently using existing components with the implementations from research ideas. We could then test the research hypotheses systematically by running experiments. 27 | 28 | *Read more about the research problems the Lab addresses in [Motivations](http://kengz.me/openai_lab/#motivations). Ultimately, the Lab is a generalized framework for doing reinforcement learning, agnostic of OpenAI Gym and Keras. E.g. Pytorch-based implementations are on the roadmap.* 29 | 30 | 31 | ### Implemented Algorithms 32 | 33 | A list of the core RL algorithms implemented/planned. 34 | 35 | To see their scores against OpenAI gym environments, go to **[Fitness Matrix](http://kengz.me/openai_lab/#fitness-matrix)**. 
36 | 37 | 38 | |algorithm|implementation|eval score (pending)| 39 | |:---|:---|:---| 40 | |[DQN](https://arxiv.org/abs/1312.5602)|[DQN](https://github.com/kengz/openai_lab/blob/master/rl/agent/dqn.py)|-| 41 | |[Double DQN](https://arxiv.org/abs/1509.06461)|[DoubleDQN](https://github.com/kengz/openai_lab/blob/master/rl/agent/double_dqn.py)|-| 42 | |[Dueling DQN](https://arxiv.org/abs/1511.06581)|-|-| 43 | |Sarsa|[DeepSarsa](https://github.com/kengz/openai_lab/blob/master/rl/agent/deep_sarsa.py)|-| 44 | |Off-Policy Sarsa|[OffPolicySarsa](https://github.com/kengz/openai_lab/blob/master/rl/agent/offpol_sarsa.py)|-| 45 | |[PER (Prioritized Experience Replay)](https://arxiv.org/abs/1511.05952)|[PrioritizedExperienceReplay](https://github.com/kengz/openai_lab/blob/master/rl/memory/prioritized_exp_replay.py)|-| 46 | |[CEM (Cross Entropy Method)](https://en.wikipedia.org/wiki/Cross-entropy_method)|next|-| 47 | |[REINFORCE](http://incompleteideas.net/sutton/williams-92.pdf)|-|-| 48 | |[DPG (Deterministic Policy Gradient) off-policy actor-critic](http://jmlr.org/proceedings/papers/v32/silver14.pdf)|[ActorCritic](https://github.com/kengz/openai_lab/blob/master/rl/agent/actor_critic.py)|-| 49 | |[DDPG (Deep-DPG) actor-critic with target networks](https://arxiv.org/abs/1509.02971)|[DDPG](https://github.com/kengz/openai_lab/blob/master/rl/agent/ddpg.py)|-| 50 | |[A3C (asynchronous advantage actor-critic)](https://arxiv.org/pdf/1602.01783.pdf)|-|-| 51 | |Dyna|next|-| 52 | |[TRPO](https://arxiv.org/abs/1502.05477)|-|-| 53 | |Q*(lambda)|-|-| 54 | |Retrace(lambda)|-|-| 55 | |[Neural Episodic Control (NEC)](https://arxiv.org/abs/1703.01988)|-|-| 56 | |[EWC (Elastic Weight Consolidation)](https://arxiv.org/abs/1612.00796)|-|-| 57 | 58 | 59 | ### Run the Lab 60 | 61 | Next, see [Installation](http://kengz.me/openai_lab/#installation) and jump to [Quickstart](http://kengz.me/openai_lab/#quickstart). 62 | 63 | 64 |
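As a quick orientation, a first run looks roughly like the sketch below. Treat it as a sketch rather than a reference: the flags simply mirror the command that `Gruntfile.js` composes, their exact semantics come from the argument parser in `rl/util.py` (not included in this listing), and `quickstart_dqn` is the experiment named in the shipped `config/example-default.json`.

```bash
# one-time setup: copies config/example-default.json to config/default.json
# and installs the system and python dependencies (see bin/setup, bin/copy-config)
./bin/setup

# run every experiment listed under "experiments" in config/default.json;
# this is what `npm start` invokes
grunt

# or run a single experiment directly, mirroring the command Gruntfile.js builds
python3 main.py -bp -t 5 -e quickstart_dqn
```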
65 | 66 | *Timelapse of OpenAI Lab, solving CartPole-v0.* 67 | -------------------------------------------------------------------------------- /bin/copy-config: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # copy keys file if not already exist 4 | CONFIG_DIR=`pwd`/config 5 | EXAMPLE_CONFIG="$CONFIG_DIR/example-default.json" 6 | DEV_CONFIG="$CONFIG_DIR/default.json" 7 | PROD_CONFIG="$CONFIG_DIR/production.json" 8 | 9 | if [ ! -e "$DEV_CONFIG" ]; then 10 | cp $EXAMPLE_CONFIG $DEV_CONFIG 11 | echo "[ --- Created $DEV_CONFIG --- ]" 12 | fi 13 | 14 | if [ ! -e "$PROD_CONFIG" ]; then 15 | cp $EXAMPLE_CONFIG $PROD_CONFIG 16 | echo "[ --- Created $PROD_CONFIG --- ]" 17 | fi 18 | -------------------------------------------------------------------------------- /bin/setup: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script runs the same sequence as the CircleCI build 3 | # Run this as: 4 | # bin/setup 5 | 6 | 7 | # Fail on the first error; killable by SIGINT 8 | set -e 9 | trap "exit" INT 10 | 11 | 12 | read -p " 13 | ================================================ 14 | 15 | Welcome to the OpenAI Lab setup script; 16 | This will invoke sudo; alternatively, 17 | inspect bin/setup_ubuntu or bin/setup_macOS and run the lines manually. 18 | 19 | Press enter to continue, Ctrl+c to quit: 20 | 21 | ================================================ 22 | " 23 | 24 | # copy keys file if not already exist 25 | BIN_DIR=`pwd`/bin 26 | $BIN_DIR/copy-config 27 | 28 | # determine if is Mac OSX, or Linux; then run accordingly 29 | if [ $(uname) == "Darwin" ]; 30 | # Mac runs below 31 | then ( 32 | $BIN_DIR/setup_macOS; 33 | ); 34 | else ( 35 | $BIN_DIR/setup_ubuntu; 36 | ); 37 | fi 38 | 39 | 40 | echo " 41 | ================================================ 42 | 43 | Setup done. 44 | Running basic installation checks. 45 | 46 | ================================================ 47 | " 48 | 49 | # post-installation checks 50 | python3 -c "import tensorflow; print('tensorflow version:'); print(tensorflow.__version__)" 51 | python3 -c "import gym; gym.make('LunarLander-v2')" 52 | python3 -c "import gym; gym.make('SpaceInvaders-v0')" 53 | 54 | 55 | echo " 56 | ================================================ 57 | 58 | Installation complete. 
59 | 60 | ================================================ 61 | " -------------------------------------------------------------------------------- /bin/setup_macOS: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up OpenAI Lab for macOS 3 | 4 | # Fail on the first error; killable by SIGINT 5 | set -e 6 | trap "exit" INT 7 | 8 | # install system dependencies 9 | if which brew >/dev/null; then 10 | echo "Brew is already installed" 11 | else 12 | ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" 13 | fi 14 | 15 | # system dependencies for full openai gym 16 | hb_list=(cmake boost boost-python sdl2 swig wget) 17 | for item in "${hb_list[@]}"; do 18 | brew info "${item}" | grep --quiet 'Not installed' && brew install "${item}" 19 | done 20 | 21 | # install noti for auto-notification 22 | if which noti >/dev/null; then 23 | echo "Noti is already installed" 24 | else 25 | curl -L https://github.com/variadico/noti/releases/download/v2.5.0/noti2.5.0.darwin-amd64.tar.gz | tar -xz 26 | sudo mv noti /usr/local/bin/ 27 | fi 28 | 29 | # install nodejs (for npm and file watcher) 30 | if which node >/dev/null; then 31 | echo "Nodejs is already installed" 32 | else 33 | brew install node 34 | fi 35 | # install npm modules 36 | if [ -d ./node_modules ]; then 37 | echo "Npm modules already installed" 38 | else 39 | npm install; sudo npm i -g grunt-cli 40 | fi 41 | 42 | # install python3 43 | if which python3 >/dev/null; then 44 | echo "Python3 is already installed" 45 | else 46 | brew install python3 47 | fi 48 | 49 | # install python dependencies 50 | sudo python3 -m pip install -r requirements.txt 51 | -------------------------------------------------------------------------------- /bin/setup_ubuntu: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up OpenAI Lab for Linux Ubuntu 3 | 4 | # Fail on the first error; killable by SIGINT 5 | set -e 6 | trap "exit" INT 7 | 8 | # install system dependencies 9 | sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test && sudo apt-get update 10 | sudo apt-get install -y gcc-4.9 g++-4.9 libhdf5-dev libopenblas-dev git 11 | 12 | # system dependencies for full openai gym 13 | sudo apt-get install -y cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 14 | 15 | # install noti for auto-notification 16 | if which noti >/dev/null; then 17 | echo "Noti is already installed" 18 | else 19 | curl -L https://github.com/variadico/noti/releases/download/v2.5.0/noti2.5.0.linux-amd64.tar.gz | tar -xz 20 | sudo mv noti /usr/bin/ 21 | fi 22 | 23 | # install nodejs (for npm and file watcher) 24 | if which node >/dev/null; then 25 | echo "Nodejs is already installed" 26 | else 27 | curl -sL https://deb.nodesource.com/setup_7.x | sudo -E bash - 28 | sudo apt-get install -y nodejs 29 | fi 30 | # install npm modules 31 | if [ -d ./node_modules ]; then 32 | echo "Npm modules already installed" 33 | else 34 | npm install; sudo npm i -g grunt-cli 35 | fi 36 | 37 | # install python3 38 | if which python3 >/dev/null; then 39 | echo "Python3 is already installed" 40 | else 41 | sudo apt-get -y install python3-dev python3-pip python3-setuptools 42 | fi 43 | 44 | # install python dependencies 45 | sudo python3 -m pip install -r requirements.txt 46 | -------------------------------------------------------------------------------- /circle.yml: 
-------------------------------------------------------------------------------- 1 | machine: 2 | python: 3 | version: 3.5.2 4 | 5 | dependencies: 6 | pre: 7 | - sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test && sudo apt-get update 8 | - sudo apt-get install -y gcc-4.9 g++-4.9 libhdf5-dev libopenblas-dev git python3-tk tk-dev python3-dev python3-setuptools 9 | - sudo apt-get install -y cmake zlib1g-dev libjpeg-dev xvfb libav-tools xorg-dev python-opengl libboost-all-dev libsdl2-dev swig 10 | - pip install -U pip 11 | override: 12 | - pip install -r requirements.txt 13 | - mkdir ~/.keras && cp ./config/keras.json ~/.keras/ 14 | test: 15 | override: 16 | - xvfb-run -a -s "-screen 0 1400x900x24" -- python setup.py test 17 | - coverage xml && python-codacy-coverage -r coverage.xml 18 | 19 | general: 20 | branches: 21 | ignore: 22 | - doc 23 | - gh-pages 24 | -------------------------------------------------------------------------------- /config/.theanorc: -------------------------------------------------------------------------------- 1 | [global] 2 | floatX = float32 3 | device = gpu0 4 | 5 | [lib] 6 | cnmem = 0.2 7 | -------------------------------------------------------------------------------- /config/example-default.json: -------------------------------------------------------------------------------- 1 | { 2 | "data_sync_destination": "~/Dropbox/openai_lab/data", 3 | "NOTI_SLACK_DEST": "#rl-monitor", 4 | "NOTI_SLACK_TOK": "GET_SLACK_BOT_TOKEN_FROM_https://my.slack.com/services/new/bot", 5 | "experiments": [ 6 | "quickstart_dqn" 7 | ] 8 | } 9 | -------------------------------------------------------------------------------- /config/keras.json: -------------------------------------------------------------------------------- 1 | { 2 | "epsilon": 1e-07, 3 | "image_dim_ordering": "tf", 4 | "floatx": "float32", 5 | "backend": "tensorflow" 6 | } 7 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kengz/openai_lab/d0669d89268f2dc01c1cf878e4879775c7b6eb3c/data/.gitkeep -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: openai_lab 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python>=3.5 6 | - anaconda 7 | - six 8 | - h5py 9 | - matplotlib==1.4.3 10 | - seaborn>=0.7.1 11 | - Pillow>=3.3.1 12 | - PyOpenGL>=3.1.0 13 | - glances>=2.6.2 14 | - pytest-cov>=2.3.1 15 | - pytest-xdist>=1.15.0 16 | - pip: 17 | - codacy-coverage>=1.3.3 18 | - mem_top==0.1.5 19 | - atari_py>=0.0.18 20 | - cmake==0.6.0 21 | - tensorflow>=1.0.0 22 | - Keras>=1.2.2,<2.0.0 23 | - "--editable=git+https://github.com/openai/gym.git#egg=gym[all]" 24 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from rl.experiment import run 2 | from rl.util import args 3 | 4 | if __name__ == '__main__': 5 | run(args.experiment, **vars(args)) 6 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "openai_lab", 3 | "version": "1.0.0", 4 | "description": "An experimentation system for Reinforcement Learning using OpenAI and Keras", 5 | "main": "index.js", 6 | 
"directories": { 7 | "test": "test" 8 | }, 9 | "scripts": { 10 | "start": "grunt", 11 | "test": "python3 setup.py test", 12 | "posttest": "rm -rf .cache __pycache__ */__pycache__ *egg-info htmlcov", 13 | "snyk-protect": "snyk protect", 14 | "prepublish": "npm run snyk-protect" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/kengz/openai_lab.git" 19 | }, 20 | "keywords": [ 21 | "openai", 22 | "gym", 23 | "lab", 24 | "reinforcement", 25 | "learning" 26 | ], 27 | "author": "keng, laura", 28 | "license": "MIT", 29 | "bugs": { 30 | "url": "https://github.com/kengz/openai_lab/issues" 31 | }, 32 | "homepage": "https://github.com/kengz/openai_lab#readme", 33 | "dependencies": { 34 | "config": "^1.25.1", 35 | "grunt": "^1.0.1", 36 | "grunt-concurrent": "^2.3.1", 37 | "grunt-contrib-watch": "^1.0.0", 38 | "grunt-shell": "^2.1.0", 39 | "grunt-sync": "^0.6.2", 40 | "load-grunt-tasks": "^3.5.2", 41 | "lodash": "^4.17.4", 42 | "resolve-dir": "^1.0.0", 43 | "snyk": "^1.41.1" 44 | }, 45 | "snyk": true 46 | } 47 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | six 2 | h5py 3 | numpy>=1.12 4 | scipy>=0.18 5 | matplotlib==1.4.3 6 | seaborn>=0.7.1 7 | pandas>=0.18.1 8 | atari_py>=0.0.18 9 | Pillow>=3.3.1 10 | PyOpenGL>=3.1.0 11 | glances>=2.6.2 12 | mem_top==0.1.5 13 | pytest-cov>=2.3.1 14 | pytest-xdist>=1.15.0 15 | codacy-coverage>=1.3.3 16 | tensorflow>=1.0.0 17 | Keras>=1.2.2,<2.0.0 18 | -e git+https://github.com/openai/gym.git#egg=gym[all] 19 | -------------------------------------------------------------------------------- /rl/__init__.py: -------------------------------------------------------------------------------- 1 | # curse of python pathing, hack to solve rel import 2 | import glob 3 | import sys 4 | from os import path 5 | from os.path import dirname, basename, isfile 6 | file_path = path.normpath(path.join(path.dirname(__file__))) 7 | sys.path.insert(0, file_path) 8 | 9 | # another py curse, expose to prevent 'agent.' call 10 | pattern = "/*.py" 11 | modules = glob.glob(dirname(__file__) + pattern) 12 | __all__ = [basename(f)[:-3] for f in modules if isfile(f)] 13 | -------------------------------------------------------------------------------- /rl/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/agent/actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | from rl.util import logger 4 | 5 | 6 | class ActorCritic(DQN): 7 | 8 | ''' 9 | Actor Critic algorithm. 
The actor's policy 10 | is adjusted in the direction that will lead to 11 | better actions, guided by the critic 12 | Implementation adapted from 13 | http://www.rage.net/~greg/2016-07-05-ActorCritic-with-OpenAI-Gym.html 14 | 15 | Assumes one of the policies in actor_critic.py are being used 16 | ''' 17 | 18 | def __init__(self, env_spec, 19 | train_per_n_new_exp=1, 20 | gamma=0.95, lr=0.1, 21 | epi_change_lr=None, 22 | batch_size=16, n_epoch=5, hidden_layers=None, 23 | hidden_layers_activation='sigmoid', 24 | output_layer_activation='linear', 25 | auto_architecture=False, 26 | num_hidden_layers=3, 27 | first_hidden_layer_size=256, 28 | num_initial_channels=16, 29 | **kwargs): # absorb generic param without breaking 30 | # import only when needed to contain side-effects 31 | from keras.layers.core import Dense 32 | from keras.models import Sequential, load_model 33 | self.Dense = Dense 34 | self.Sequential = Sequential 35 | self.load_model = load_model 36 | 37 | super(ActorCritic, self).__init__(env_spec, 38 | train_per_n_new_exp, 39 | gamma, lr, 40 | epi_change_lr, 41 | batch_size, n_epoch, hidden_layers, 42 | hidden_layers_activation, 43 | output_layer_activation, 44 | auto_architecture, 45 | num_hidden_layers, 46 | first_hidden_layer_size, 47 | num_initial_channels, 48 | **kwargs) 49 | 50 | def build_model(self): 51 | self.build_actor() 52 | self.build_critic() 53 | logger.info("Actor and critic models built") 54 | 55 | def build_actor(self): 56 | actor = self.Sequential() 57 | super(ActorCritic, self).build_hidden_layers(actor) 58 | actor.add(self.Dense(self.env_spec['action_dim'], 59 | init='lecun_uniform', 60 | activation=self.output_layer_activation)) 61 | logger.info("Actor summary") 62 | actor.summary() 63 | self.actor = actor 64 | 65 | def build_critic(self): 66 | critic = self.Sequential() 67 | super(ActorCritic, self).build_hidden_layers(critic) 68 | critic.add(self.Dense(1, 69 | init='lecun_uniform', 70 | activation=self.output_layer_activation)) 71 | logger.info("Critic summary") 72 | critic.summary() 73 | self.critic = critic 74 | 75 | def compile_model(self): 76 | self.actor.compile( 77 | loss='mse', 78 | optimizer=self.optimizer.keras_optimizer) 79 | self.critic.compile( 80 | loss='mse', 81 | optimizer=self.optimizer.keras_optimizer) 82 | logger.info("Actor and critic compiled") 83 | 84 | def recompile_model(self, sys_vars): 85 | ''' 86 | Option to change model optimizer settings 87 | Currently only used for changing the learning rate 88 | Compiling does not affect the model weights 89 | ''' 90 | if self.epi_change_lr is not None: 91 | if (sys_vars['epi'] == self.epi_change_lr and 92 | sys_vars['t'] == 0): 93 | self.lr = self.lr / 10.0 94 | self.optimizer.change_optim_param(**{'lr': self.lr}) 95 | self.actor.compile( 96 | loss='mse', 97 | optimizer=self.optimizer.keras_optimizer) 98 | self.critic.compile( 99 | loss='mse', 100 | optimizer=self.optimizer.keras_optimizer) 101 | logger.info( 102 | 'Actor and critic models recompiled with new settings: ' 103 | 'Learning rate: {}'.format(self.lr)) 104 | 105 | def train_critic(self, minibatch): 106 | Q_vals = np.clip(self.critic.predict(minibatch['states']), 107 | -self.clip_val, self.clip_val) 108 | Q_next_vals = np.clip(self.critic.predict(minibatch['next_states']), 109 | -self.clip_val, self.clip_val) 110 | Q_targets = minibatch['rewards'] + self.gamma * \ 111 | (1 - minibatch['terminals']) * Q_next_vals.squeeze() 112 | Q_targets = np.expand_dims(Q_targets, axis=1) 113 | 114 | actor_delta = Q_next_vals - Q_vals 115 | loss = 
self.critic.train_on_batch(minibatch['states'], Q_targets) 116 | 117 | # update memory, needed for PER 118 | errors = abs(np.sum(Q_vals - Q_targets, axis=1)) 119 | # Q size is only 1, from critic 120 | assert Q_targets.shape == (self.batch_size, 1) 121 | assert errors.shape == (self.batch_size, ) 122 | self.memory.update(errors) 123 | return loss, actor_delta 124 | 125 | def train_actor(self, minibatch, actor_delta): 126 | old_vals = self.actor.predict(minibatch['states']) 127 | if self.env_spec['actions'] == 'continuous': 128 | A_targets = np.zeros( 129 | (actor_delta.shape[0], self.env_spec['action_dim'])) 130 | for j in range(A_targets.shape[1]): 131 | A_targets[:, j] = actor_delta.squeeze() 132 | else: 133 | A_targets = minibatch['actions'] * actor_delta + \ 134 | (1 - minibatch['actions']) * old_vals 135 | 136 | loss = self.actor.train_on_batch(minibatch['states'], A_targets) 137 | return loss 138 | 139 | def train_an_epoch(self): 140 | minibatch = self.memory.rand_minibatch(self.batch_size) 141 | critic_loss, actor_delta = self.train_critic(minibatch) 142 | actor_loss = self.train_actor(minibatch, actor_delta) 143 | return critic_loss + actor_loss 144 | -------------------------------------------------------------------------------- /rl/agent/base_agent.py: -------------------------------------------------------------------------------- 1 | from rl.util import logger 2 | 3 | 4 | class Agent(object): 5 | 6 | ''' 7 | The base class of Agent, with the core methods 8 | ''' 9 | 10 | def __init__(self, env_spec, 11 | **kwargs): # absorb generic param without breaking 12 | self.env_spec = env_spec 13 | 14 | def compile(self, memory, optimizer, policy, preprocessor): 15 | # set 2 way references 16 | self.memory = memory 17 | self.optimizer = optimizer 18 | self.policy = policy 19 | self.preprocessor = preprocessor 20 | # back references 21 | setattr(memory, 'agent', self) 22 | setattr(optimizer, 'agent', self) 23 | setattr(policy, 'agent', self) 24 | setattr(preprocessor, 'agent', self) 25 | self.compile_model() 26 | logger.info( 27 | 'Compiled:\nAgent, Memory, Optimizer, Policy, ' 28 | 'Preprocessor:\n{}'.format( 29 | ', '.join([comp.__class__.__name__ for comp in 30 | [self, memory, optimizer, policy, preprocessor]]) 31 | )) 32 | 33 | def build_model(self): 34 | raise NotImplementedError() 35 | 36 | def compile_model(self): 37 | raise NotImplementedError() 38 | 39 | def select_action(self, state): 40 | self.policy.select_action(state) 41 | raise NotImplementedError() 42 | 43 | def update(self, sys_vars): 44 | '''Agent update apart from training the Q function''' 45 | self.policy.update(sys_vars) 46 | raise NotImplementedError() 47 | 48 | def to_train(self, sys_vars): 49 | raise NotImplementedError() 50 | 51 | def train(self, sys_vars): 52 | raise NotImplementedError() 53 | -------------------------------------------------------------------------------- /rl/agent/conv_dqn.py: -------------------------------------------------------------------------------- 1 | import math 2 | from rl.agent.dqn import DQN 3 | 4 | 5 | class ConvDQN(DQN): 6 | 7 | def __init__(self, *args, **kwargs): 8 | from keras.layers.core import Dense, Flatten 9 | from keras.layers.convolutional import Convolution2D 10 | from keras import backend as K 11 | if K.backend() == 'theano': 12 | K.set_image_dim_ordering('tf') 13 | self.Dense = Dense 14 | self.Flatten = Flatten 15 | self.Convolution2D = Convolution2D 16 | 17 | self.kernel = 4 18 | self.stride = (2, 2) 19 | super(ConvDQN, self).__init__(*args, **kwargs) 20 | 21 | 
def build_hidden_layers(self, model): 22 | ''' 23 | build the hidden layers into model using parameter self.hidden_layers 24 | Auto architecture infers the size of the hidden layers from the number 25 | of channels in the first hidden layer and number of layers 26 | With each successive layer the number of channels is doubled 27 | Kernel size is fixed at 4, and stride at (2, 2) 28 | No new layers are added if the cols or rows have dim <= 5 29 | Enables hyperparameter optimization over network architecture 30 | ''' 31 | if self.auto_architecture: 32 | num_channels = self.num_initial_channels 33 | cols = self.env_spec['state_dim'][0] 34 | rows = self.env_spec['state_dim'][1] 35 | # input layer 36 | model.add( 37 | self.Convolution2D( 38 | num_channels, 39 | self.kernel, 40 | self.kernel, 41 | subsample=self.stride, 42 | input_shape=self.env_spec['state_dim'], 43 | activation=self.hidden_layers_activation, 44 | # border_mode='same', 45 | init='lecun_uniform')) 46 | 47 | for i in range(1, self.num_hidden_layers): 48 | num_channels *= 2 49 | cols = math.ceil( 50 | math.floor(cols - self.kernel - 1) / self.stride[0]) + 1 51 | rows = math.ceil( 52 | math.floor(rows - self.kernel - 1) / self.stride[1]) + 1 53 | if cols > 5 and rows > 5: 54 | model.add( 55 | self.Convolution2D( 56 | num_channels, 57 | self.kernel, 58 | self.kernel, 59 | subsample=self.stride, 60 | activation=self.hidden_layers_activation, 61 | # border_mode='same', 62 | init='lecun_uniform')) 63 | else: 64 | # stop addition of too many layers 65 | # and from breakage by cols, rows growing to 0 66 | break 67 | 68 | else: 69 | model.add( 70 | self.Convolution2D( 71 | self.hidden_layers[0][0], 72 | self.hidden_layers[0][1], 73 | self.hidden_layers[0][2], 74 | subsample=self.hidden_layers[0][3], 75 | input_shape=self.env_spec['state_dim'], 76 | activation=self.hidden_layers_activation, 77 | # border_mode='same', 78 | init='lecun_uniform')) 79 | 80 | if (len(self.hidden_layers) > 1): 81 | for i in range(1, len(self.hidden_layers)): 82 | model.add( 83 | self.Convolution2D( 84 | self.hidden_layers[i][0], 85 | self.hidden_layers[i][1], 86 | self.hidden_layers[i][2], 87 | subsample=self.hidden_layers[i][3], 88 | activation=self.hidden_layers_activation, 89 | # border_mode='same', 90 | init='lecun_uniform')) 91 | 92 | model.add(self.Flatten()) 93 | model.add(self.Dense(256, 94 | init='lecun_uniform', 95 | activation=self.hidden_layers_activation)) 96 | 97 | return model 98 | -------------------------------------------------------------------------------- /rl/agent/ddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | from rl.util import logger, clone_model 4 | 5 | 6 | class Actor(DQN): 7 | ''' 8 | Actor of DDPG, with its network and target network 9 | input is states, output is action 10 | very similar to DQN 11 | ''' 12 | 13 | def __init__(self, *args, tau=0.001, **kwargs): 14 | from keras import backend as K 15 | self.K = K 16 | self.tf = self.K.tf 17 | self.sess = self.K.get_session() 18 | self.tau = tau 19 | super(Actor, self).__init__(*args, **kwargs) 20 | 21 | def build_model(self): 22 | self.model = super(Actor, self).build_model() 23 | self.target_model = clone_model(self.model) 24 | 25 | self.actor_states = self.model.inputs[0] 26 | self.out = self.model.output 27 | self.scaled_out = self.tf.multiply( 28 | self.out, self.env_spec['action_bound_high']) 29 | self.network_params = self.model.trainable_weights 30 | 31 | 
self.target_actor_states = self.target_model.inputs[0] 32 | self.target_out = self.target_model.output 33 | self.target_scaled_out = self.tf.multiply( 34 | self.target_out, self.env_spec['action_bound_high']) 35 | self.target_network_params = self.target_model.trainable_weights 36 | 37 | # Op for updating target network 38 | self.update_target_network_op = [] 39 | for i, t_w in enumerate(self.target_network_params): 40 | op = t_w.assign( 41 | self.tf.multiply( 42 | self.tau, self.network_params[i] 43 | ) + self.tf.multiply(1. - self.tau, t_w)) 44 | self.update_target_network_op.append(op) 45 | 46 | # will be fed as self.action_gradient: critic_grads 47 | self.action_gradient = self.tf.placeholder( 48 | self.tf.float32, [None, self.env_spec['action_dim']]) 49 | 50 | # actor model gradient op, to be fed from critic 51 | self.actor_gradients = self.tf.gradients( 52 | self.scaled_out, self.network_params, -self.action_gradient) 53 | 54 | # Optimization op 55 | self.optimize = self.tf.train.AdamOptimizer(self.lr).apply_gradients( 56 | zip(self.actor_gradients, self.network_params)) 57 | return self.model 58 | 59 | def compile_model(self): 60 | pass 61 | 62 | def recompile_model(self, sys_vars): 63 | pass 64 | 65 | def update(self, sys_vars): 66 | self.sess.run(self.update_target_network_op) 67 | 68 | def predict(self, states): 69 | return self.sess.run(self.scaled_out, feed_dict={ 70 | self.actor_states: states 71 | }) 72 | 73 | def target_predict(self, next_states): 74 | return self.sess.run(self.target_scaled_out, feed_dict={ 75 | self.target_actor_states: next_states 76 | }) 77 | 78 | def train_tf(self, states, critic_action_gradient): 79 | return self.sess.run(self.optimize, feed_dict={ 80 | self.actor_states: states, 81 | self.action_gradient: critic_action_gradient 82 | }) 83 | 84 | 85 | class Critic(DQN): 86 | 87 | ''' 88 | Critic of DDPG, with its network and target network 89 | input is states and actions, output is Q value 90 | the action is from Actor 91 | ''' 92 | 93 | def __init__(self, *args, tau=0.001, critic_lr=0.001, **kwargs): 94 | from keras.layers import Dense, Merge 95 | from keras import backend as K 96 | self.Dense = Dense 97 | self.Merge = Merge 98 | self.K = K 99 | self.tf = self.K.tf 100 | self.sess = self.K.get_session() 101 | self.tau = tau 102 | self.critic_lr = critic_lr # suggestion: 10 x actor_lr 103 | super(Critic, self).__init__(*args, **kwargs) 104 | 105 | def build_critic_models(self): 106 | state_branch = self.Sequential() 107 | state_branch.add(self.Dense( 108 | self.hidden_layers[0], 109 | input_shape=(self.env_spec['state_dim'],), 110 | activation=self.hidden_layers_activation, 111 | init='lecun_uniform')) 112 | 113 | action_branch = self.Sequential() 114 | action_branch.add(self.Dense( 115 | self.hidden_layers[0], 116 | input_shape=(self.env_spec['action_dim'],), 117 | activation=self.hidden_layers_activation, 118 | init='lecun_uniform')) 119 | 120 | input_layer = self.Merge([state_branch, action_branch], mode='concat') 121 | 122 | model = self.Sequential() 123 | model.add(input_layer) 124 | 125 | if (len(self.hidden_layers) > 1): 126 | for i in range(1, len(self.hidden_layers)): 127 | model.add(self.Dense( 128 | self.hidden_layers[i], 129 | init='lecun_uniform', 130 | activation=self.hidden_layers_activation)) 131 | 132 | model.add(self.Dense(1, 133 | init='lecun_uniform', 134 | activation='linear')) # fixed 135 | logger.info('Critic model summary') 136 | model.summary() 137 | self.model = model 138 | 139 | logger.info("Model built") 140 | return 
self.model 141 | 142 | def build_model(self): 143 | self.model = self.build_critic_models() 144 | self.target_model = clone_model(self.model) 145 | 146 | self.critic_states = self.model.inputs[0] 147 | self.critic_actions = self.model.inputs[1] 148 | self.out = self.model.output 149 | self.network_params = self.model.trainable_weights 150 | 151 | self.target_critic_states = self.target_model.inputs[0] 152 | self.target_critic_actions = self.target_model.inputs[1] 153 | self.target_out = self.target_model.output 154 | self.target_network_params = self.target_model.trainable_weights 155 | 156 | # Op for updating target network 157 | self.update_target_network_op = [] 158 | for i, t_w in enumerate(self.target_network_params): 159 | op = t_w.assign( 160 | self.tf.multiply( 161 | self.tau, self.network_params[i] 162 | ) + self.tf.multiply(1. - self.tau, t_w)) 163 | self.update_target_network_op.append(op) 164 | 165 | # custom loss and optimization Op 166 | self.y = self.tf.placeholder(self.tf.float32, [None, 1]) 167 | self.loss = self.tf.losses.mean_squared_error(self.y, self.out) 168 | self.optimize = self.tf.train.AdamOptimizer( 169 | self.critic_lr).minimize(self.loss) 170 | 171 | self.action_gradient = self.tf.gradients(self.out, self.critic_actions) 172 | return self.model 173 | 174 | def update(self, sys_vars): 175 | self.sess.run(self.update_target_network_op) 176 | 177 | def get_action_gradient(self, states, actions): 178 | return self.sess.run(self.action_gradient, feed_dict={ 179 | self.critic_states: states, 180 | self.critic_actions: actions 181 | })[0] 182 | 183 | # def predict(self, inputs, action): 184 | # return self.sess.run(self.out, feed_dict={ 185 | # self.critic_states: inputs, 186 | # self.critic_actions: action 187 | # }) 188 | 189 | def target_predict(self, next_states, mu_prime): 190 | return self.sess.run(self.target_out, feed_dict={ 191 | self.target_critic_states: next_states, 192 | self.target_critic_actions: mu_prime 193 | }) 194 | 195 | def train_tf(self, states, actions, y): 196 | return self.sess.run([self.out, self.optimize, self.loss], feed_dict={ 197 | self.critic_states: states, 198 | self.critic_actions: actions, 199 | self.y: y 200 | }) 201 | 202 | 203 | class DDPG(DQN): 204 | 205 | ''' 206 | DDPG Algorithm, from https://arxiv.org/abs/1509.02971 207 | has Actor, Critic, and each has its own target network 208 | Implementation referred from https://github.com/pemami4911/deep-rl 209 | ''' 210 | 211 | def __init__(self, *args, **kwargs): 212 | # import only when needed to contain side-effects 213 | from keras import backend as K 214 | self.K = K 215 | self.sess = self.K.get_session() 216 | self.actor = Actor(*args, **kwargs) 217 | self.critic = Critic(*args, **kwargs) 218 | self.sess.run(self.K.tf.global_variables_initializer()) 219 | super(DDPG, self).__init__(*args, **kwargs) 220 | 221 | def build_model(self): 222 | pass 223 | 224 | def compile_model(self): 225 | pass 226 | 227 | def recompile_model(self, sys_vars): 228 | pass 229 | 230 | def select_action(self, state): 231 | return self.policy.select_action(state) 232 | 233 | def update(self, sys_vars): 234 | # Update target networks 235 | self.actor.update(sys_vars) 236 | self.critic.update(sys_vars) 237 | self.policy.update(sys_vars) 238 | self.update_n_epoch(sys_vars) 239 | 240 | def train_an_epoch(self): 241 | minibatch = self.memory.rand_minibatch(self.batch_size) 242 | 243 | # train critic 244 | mu_prime = self.actor.target_predict(minibatch['next_states']) 245 | q_val = 
self.critic.target_predict(minibatch['states'], mu_prime) 246 | q_prime = self.critic.target_predict( 247 | minibatch['next_states'], mu_prime) 248 | # reshape for element-wise multiplication 249 | # to feed into network, y shape needs to be (?, 1) 250 | y = minibatch['rewards'] + self.gamma * \ 251 | (1 - minibatch['terminals']) * np.reshape(q_prime, (-1)) 252 | y = np.reshape(y, (-1, 1)) 253 | 254 | # update memory, needed for PER 255 | errors = abs(np.sum(q_val - y, axis=1)) 256 | # Q size is only 1, from critic 257 | assert y.shape == (self.batch_size, 1) 258 | assert errors.shape == (self.batch_size, ) 259 | self.memory.update(errors) 260 | 261 | _, _, critic_loss = self.critic.train_tf( 262 | minibatch['states'], minibatch['actions'], y) 263 | 264 | # train actor 265 | # Update the actor policy using the sampled gradient 266 | actions = self.actor.predict(minibatch['states']) 267 | critic_action_gradient = self.critic.get_action_gradient( 268 | minibatch['states'], actions) 269 | # currently cant be gotten 270 | _actorloss = self.actor.train_tf( 271 | minibatch['states'], critic_action_gradient) 272 | 273 | loss = critic_loss 274 | return loss 275 | -------------------------------------------------------------------------------- /rl/agent/deep_exp_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.deep_sarsa import DeepSarsa 3 | 4 | 5 | class DeepExpectedSarsa(DeepSarsa): 6 | 7 | ''' 8 | Deep Expected Sarsa agent. 9 | On policy, with updates after each experience 10 | Policy = epsilonGreedyPolicy 11 | ''' 12 | 13 | def compute_Q_states(self, minibatch): 14 | (Q_states, Q_next_states, _max) = super( 15 | DeepExpectedSarsa, self).compute_Q_states(minibatch) 16 | 17 | curr_e = self.policy.e 18 | curr_e_per_a = curr_e / self.env_spec['action_dim'] 19 | 20 | Q_next_states_max = np.amax(Q_next_states, axis=1) 21 | Q_next_states_selected = (1 - curr_e) * Q_next_states_max + \ 22 | np.sum(Q_next_states * curr_e_per_a, axis=1) 23 | return (Q_states, Q_next_states, Q_next_states_selected) 24 | -------------------------------------------------------------------------------- /rl/agent/deep_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | 4 | 5 | class DeepSarsa(DQN): 6 | 7 | ''' 8 | Deep Sarsa agent. 
9 | On policy, with updates after each experience 10 | Policy = epsilonGreedyPolicy 11 | ''' 12 | 13 | def __init__(self, *args, **kwargs): 14 | super(DeepSarsa, self).__init__(*args, **kwargs) 15 | self.train_per_n_new_exp = 1 16 | self.batch_size = 1 17 | self.n_epoch = 1 18 | self.final_n_epoch = 1 19 | 20 | def compute_Q_states(self, minibatch): 21 | (Q_states, Q_next_states, _max) = super( 22 | DeepSarsa, self).compute_Q_states(minibatch) 23 | next_action = self.select_action(minibatch['next_states'][0]) 24 | Q_next_states_selected = Q_next_states[:, next_action] 25 | return (Q_states, Q_next_states, Q_next_states_selected) 26 | 27 | def train_an_epoch(self): 28 | minibatch = self.memory.pop() 29 | (Q_states, _next, Q_next_states_selected 30 | ) = self.compute_Q_states(minibatch) 31 | Q_targets = self.compute_Q_targets( 32 | minibatch, Q_states, Q_next_states_selected) 33 | loss = self.model.train_on_batch(minibatch['states'], Q_targets) 34 | 35 | errors = abs(np.sum(Q_states - Q_targets, axis=1)) 36 | assert Q_targets.shape == ( 37 | self.batch_size, self.env_spec['action_dim']) 38 | assert errors.shape == (self.batch_size, ) 39 | self.memory.update(errors) 40 | return loss 41 | -------------------------------------------------------------------------------- /rl/agent/double_conv_dqn.py: -------------------------------------------------------------------------------- 1 | from rl.agent.conv_dqn import ConvDQN 2 | from rl.agent.double_dqn import DoubleDQN 3 | 4 | 5 | class DoubleConvDQN(DoubleDQN, ConvDQN): 6 | 7 | ''' 8 | The base class of double convolutional DQNs 9 | extended from DoubleDQN and ConvDQN 10 | multiple inheritance will use the method from the first class 11 | if multiple ones exists 12 | ''' 13 | 14 | def build_hidden_layers(self, model): 15 | ConvDQN.build_hidden_layers(self, model) 16 | -------------------------------------------------------------------------------- /rl/agent/double_dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | from rl.util import logger, clone_model, clone_optimizer 4 | 5 | 6 | class DoubleDQN(DQN): 7 | 8 | ''' 9 | The base class of double DQNs 10 | ''' 11 | 12 | def build_model(self): 13 | super(DoubleDQN, self).build_model() 14 | 15 | model_2 = clone_model(self.model) 16 | logger.info("Model 2 summary") 17 | model_2.summary() 18 | self.model_2 = model_2 19 | 20 | logger.info("Models 1 and 2 built") 21 | return self.model, self.model_2 22 | 23 | def compile_model(self): 24 | self.optimizer.keras_optimizer_2 = clone_optimizer( 25 | self.optimizer.keras_optimizer) 26 | self.model.compile( 27 | loss='mse', 28 | optimizer=self.optimizer.keras_optimizer) 29 | self.model_2.compile( 30 | loss='mse', 31 | optimizer=self.optimizer.keras_optimizer_2) 32 | logger.info("Models 1 and 2 compiled") 33 | 34 | def switch_models(self): 35 | # Switch model 1 and model 2, also the optimizers 36 | temp = self.model 37 | self.model = self.model_2 38 | self.model_2 = temp 39 | 40 | temp_optimizer = self.optimizer.keras_optimizer 41 | self.optimizer.keras_optimizer = self.optimizer.keras_optimizer_2 42 | self.optimizer.keras_optimizer_2 = temp_optimizer 43 | 44 | # def recompile_model(self, sys_vars): 45 | # '''rotate and recompile both models''' 46 | # # TODO fix this, double recompile breaks solving power 47 | # if self.epi_change_lr is not None: 48 | # self.switch_models() # to model_2 49 | # super(DoubleDQN, self).recompile_model(sys_vars) 50 | # self.switch_models() 
# back to model 51 | # super(DoubleDQN, self).recompile_model(sys_vars) 52 | # return self.model 53 | 54 | def compute_Q_states(self, minibatch): 55 | (Q_states, Q_next_states_select, _max) = super( 56 | DoubleDQN, self).compute_Q_states(minibatch) 57 | # Different from (single) dqn: Select max using model 2 58 | Q_next_states_max_ind = np.argmax(Q_next_states_select, axis=1) 59 | # same as dqn again, but use Q_next_states_max_ind above 60 | Q_next_states = np.clip( 61 | self.model_2.predict(minibatch['next_states']), 62 | -self.clip_val, self.clip_val) 63 | rows = np.arange(Q_next_states_max_ind.shape[0]) 64 | Q_next_states_max = Q_next_states[rows, Q_next_states_max_ind] 65 | 66 | return (Q_states, Q_next_states, Q_next_states_max) 67 | 68 | def train_an_epoch(self): 69 | self.switch_models() 70 | return super(DoubleDQN, self).train_an_epoch() 71 | -------------------------------------------------------------------------------- /rl/agent/dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.base_agent import Agent 3 | from rl.util import logger, log_self 4 | 5 | 6 | class DQN(Agent): 7 | 8 | ''' 9 | The base class of DQNs, with the core methods 10 | The simplest deep Q network, 11 | with epsilon-greedy method and 12 | Bellman equation for value, using neural net. 13 | ''' 14 | 15 | def __init__(self, env_spec, 16 | train_per_n_new_exp=1, 17 | gamma=0.95, lr=0.1, 18 | epi_change_lr=None, 19 | batch_size=16, n_epoch=5, hidden_layers=None, 20 | hidden_layers_activation='sigmoid', 21 | output_layer_activation='linear', 22 | auto_architecture=False, 23 | num_hidden_layers=3, 24 | first_hidden_layer_size=256, 25 | num_initial_channels=16, 26 | **kwargs): # absorb generic param without breaking 27 | # import only when needed to contain side-effects 28 | from keras.layers.core import Dense 29 | from keras.models import Sequential, load_model 30 | self.Dense = Dense 31 | self.Sequential = Sequential 32 | self.load_model = load_model 33 | 34 | super(DQN, self).__init__(env_spec) 35 | 36 | self.train_per_n_new_exp = train_per_n_new_exp 37 | self.gamma = gamma 38 | self.lr = lr 39 | self.epi_change_lr = epi_change_lr 40 | self.batch_size = batch_size 41 | self.n_epoch = 1 42 | self.final_n_epoch = n_epoch 43 | self.hidden_layers = hidden_layers or [4] 44 | self.hidden_layers_activation = hidden_layers_activation 45 | self.output_layer_activation = output_layer_activation 46 | self.clip_val = 10000 47 | self.auto_architecture = auto_architecture 48 | self.num_hidden_layers = num_hidden_layers 49 | self.first_hidden_layer_size = first_hidden_layer_size 50 | self.num_initial_channels = num_initial_channels 51 | log_self(self) 52 | self.build_model() 53 | 54 | def build_hidden_layers(self, model): 55 | ''' 56 | build the hidden layers into model using parameter self.hidden_layers 57 | ''' 58 | 59 | # Auto architecture infers the size of the hidden layers from the size 60 | # of the first layer. 
Each successive hidden layer is half the size of the 61 | # previous layer 62 | # Enables hyperparameter optimization over network architecture 63 | if self.auto_architecture: 64 | curr_layer_size = self.first_hidden_layer_size 65 | model.add(self.Dense(curr_layer_size, 66 | input_shape=(self.env_spec['state_dim'],), 67 | activation=self.hidden_layers_activation, 68 | init='lecun_uniform')) 69 | 70 | curr_layer_size = int(curr_layer_size / 2) 71 | for i in range(1, self.num_hidden_layers): 72 | model.add(self.Dense(curr_layer_size, 73 | init='lecun_uniform', 74 | activation=self.hidden_layers_activation)) 75 | curr_layer_size = int(curr_layer_size / 2) 76 | 77 | else: 78 | model.add(self.Dense(self.hidden_layers[0], 79 | input_shape=(self.env_spec['state_dim'],), 80 | activation=self.hidden_layers_activation, 81 | init='lecun_uniform')) 82 | # inner hidden layer: no specification of input shape 83 | if (len(self.hidden_layers) > 1): 84 | for i in range(1, len(self.hidden_layers)): 85 | model.add(self.Dense( 86 | self.hidden_layers[i], 87 | init='lecun_uniform', 88 | activation=self.hidden_layers_activation)) 89 | 90 | return model 91 | 92 | def build_model(self): 93 | model = self.Sequential() 94 | self.build_hidden_layers(model) 95 | model.add(self.Dense(self.env_spec['action_dim'], 96 | init='lecun_uniform', 97 | activation=self.output_layer_activation)) 98 | logger.info("Model summary") 99 | model.summary() 100 | self.model = model 101 | 102 | logger.info("Model built") 103 | return self.model 104 | 105 | def compile_model(self): 106 | self.model.compile( 107 | loss='mse', 108 | optimizer=self.optimizer.keras_optimizer) 109 | logger.info("Model compiled") 110 | 111 | def recompile_model(self, sys_vars): 112 | ''' 113 | Option to change model optimizer settings 114 | Currently only used for changing the learning rate 115 | Compiling does not affect the model weights 116 | ''' 117 | if self.epi_change_lr is not None: 118 | if (sys_vars['epi'] == self.epi_change_lr and 119 | sys_vars['t'] == 0): 120 | self.lr = self.lr / 10.0 121 | self.optimizer.change_optim_param(**{'lr': self.lr}) 122 | self.model.compile( 123 | loss='mse', 124 | optimizer=self.optimizer.keras_optimizer) 125 | logger.info('Model recompiled with new settings: ' 126 | 'Learning rate: {}'.format(self.lr)) 127 | return self.model 128 | 129 | def update_n_epoch(self, sys_vars): 130 | ''' 131 | Increase epochs at the beginning of each session, 132 | for training for later episodes, 133 | once it has more experience 134 | Best so far, increment num epochs every 2 up to a max of 5 135 | ''' 136 | if (self.n_epoch < self.final_n_epoch and 137 | sys_vars['t'] == 0 and 138 | sys_vars['epi'] % 2 == 0): 139 | self.n_epoch += 1 140 | return self.n_epoch 141 | 142 | def select_action(self, state): 143 | '''epsilon-greedy method''' 144 | return self.policy.select_action(state) 145 | 146 | def update(self, sys_vars): 147 | ''' 148 | Agent update apart from training the Q function 149 | ''' 150 | self.policy.update(sys_vars) 151 | self.update_n_epoch(sys_vars) 152 | self.recompile_model(sys_vars) 153 | 154 | def to_train(self, sys_vars): 155 | ''' 156 | return boolean condition if agent should train 157 | get n NEW experiences before training model 158 | ''' 159 | t = sys_vars['t'] 160 | done = sys_vars['done'] 161 | timestep_limit = self.env_spec['timestep_limit'] 162 | return (t > 0) and bool( 163 | t % self.train_per_n_new_exp == 0 or 164 | t == (timestep_limit-1) or 165 | done) 166 | 167 | def compute_Q_states(self, minibatch): 168 | 
# note the computed values below are batched in array 169 | Q_states = np.clip(self.model.predict(minibatch['states']), 170 | -self.clip_val, self.clip_val) 171 | Q_next_states = np.clip(self.model.predict(minibatch['next_states']), 172 | -self.clip_val, self.clip_val) 173 | Q_next_states_max = np.amax(Q_next_states, axis=1) 174 | return (Q_states, Q_next_states, Q_next_states_max) 175 | 176 | def compute_Q_targets(self, minibatch, Q_states, Q_next_states_max): 177 | # make future reward 0 if exp is terminal 178 | Q_targets_a = minibatch['rewards'] + self.gamma * \ 179 | (1 - minibatch['terminals']) * Q_next_states_max 180 | # set batch Q_targets of a as above, the rest as is 181 | # minibatch['actions'] is one-hot encoded 182 | Q_targets = minibatch['actions'] * Q_targets_a[:, np.newaxis] + \ 183 | (1 - minibatch['actions']) * Q_states 184 | return Q_targets 185 | 186 | def train_an_epoch(self): 187 | minibatch = self.memory.rand_minibatch(self.batch_size) 188 | 189 | (Q_states, _states, Q_next_states_max) = self.compute_Q_states( 190 | minibatch) 191 | Q_targets = self.compute_Q_targets( 192 | minibatch, Q_states, Q_next_states_max) 193 | loss = self.model.train_on_batch(minibatch['states'], Q_targets) 194 | 195 | errors = abs(np.sum(Q_states - Q_targets, axis=1)) 196 | assert Q_targets.shape == ( 197 | self.batch_size, self.env_spec['action_dim']) 198 | assert errors.shape == (self.batch_size, ) 199 | self.memory.update(errors) 200 | return loss 201 | 202 | def train(self, sys_vars): 203 | ''' 204 | Training is for the Q function (NN) only 205 | otherwise (e.g. policy) see self.update() 206 | step 1,2,3,4 of algo. 207 | ''' 208 | loss_total = 0 209 | for _epoch in range(self.n_epoch): 210 | loss = self.train_an_epoch() 211 | loss_total += loss 212 | avg_loss = loss_total / self.n_epoch 213 | sys_vars['loss'].append(avg_loss) 214 | return avg_loss 215 | 216 | def save(self, model_path, global_step=None): 217 | logger.info('Saving model checkpoint') 218 | self.model.save_weights(model_path) 219 | 220 | def restore(self, model_path): 221 | self.model.load_weights(model_path, by_name=False) 222 | -------------------------------------------------------------------------------- /rl/agent/freeze_dqn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.double_dqn import DoubleDQN 3 | from rl.agent.dqn import DQN 4 | from rl.util import logger, clone_model 5 | 6 | 7 | class FreezeDQN(DoubleDQN): 8 | 9 | ''' 10 | Extends DQN agent to freeze target Q network 11 | and periodically update them to the weights of the 12 | exploration model 13 | Avoids oscillations and breaks correlation 14 | between Q-network and target 15 | http://www0.cs.ucl.ac.uk/staff/d.silver/web/Resources_files/deep_rl.pdf 16 | Exploration model periodically cloned into target Q network 17 | ''' 18 | 19 | def compute_Q_states(self, minibatch): 20 | Q_states = np.clip(self.model.predict(minibatch['states']), 21 | -self.clip_val, self.clip_val) 22 | Q_next_states = np.clip(self.model_2.predict(minibatch['next_states']), 23 | -self.clip_val, self.clip_val) 24 | Q_next_states_max = np.amax(Q_next_states, axis=1) 25 | return (Q_states, Q_next_states, Q_next_states_max) 26 | 27 | def train_an_epoch(self): 28 | # Should call DQN to train an epoch, not DoubleDQN 29 | return DQN.train_an_epoch(self) 30 | 31 | def update_target_model(self): 32 | # Also, loading logic seems off 33 | self.model_2 = clone_model(self.model) 34 | logger.debug("Updated target model 
weights") 35 | 36 | def update(self, sys_vars): 37 | ''' 38 | Agent update apart from training the Q function 39 | ''' 40 | done = sys_vars['done'] 41 | timestep_check = sys_vars['t'] == (self.env_spec['timestep_limit'] - 1) 42 | if done or timestep_check: 43 | self.update_target_model() 44 | super(FreezeDQN, self).update(sys_vars) 45 | -------------------------------------------------------------------------------- /rl/agent/offpol_sarsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.dqn import DQN 3 | 4 | 5 | class OffPolicySarsa(DQN): 6 | 7 | ''' 8 | Deep Sarsa agent. 9 | Off policy. Reduces to Q learning when eval_e = 0 10 | Evaluation policy = epsilonGreedyPolicy, eval_e = 0.05 11 | Experience generating policy = Boltzmann or 12 | EpsilonGreedy with annealing 13 | ''' 14 | 15 | def __init__(self, *args, **kwargs): 16 | super(OffPolicySarsa, self).__init__(*args, **kwargs) 17 | self.eval_e = 0.05 18 | 19 | def compute_Q_states(self, minibatch): 20 | (Q_states, Q_next_states, _max) = super( 21 | OffPolicySarsa, self).compute_Q_states(minibatch) 22 | 23 | e_per_action = self.eval_e / self.env_spec['action_dim'] 24 | 25 | Q_next_states_max = np.amax(Q_next_states, axis=1) 26 | Q_next_states_selected = (1 - self.eval_e) * Q_next_states_max + \ 27 | np.sum(Q_next_states * e_per_action, axis=1) 28 | return (Q_states, None, Q_next_states_selected) 29 | -------------------------------------------------------------------------------- /rl/agent/q_table.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.agent.base_agent import Agent 3 | 4 | 5 | class Dummy(Agent): 6 | 7 | ''' 8 | A dummy agent that does random actions, for demo 9 | ''' 10 | 11 | def select_action(self, state): 12 | '''epsilon-greedy method''' 13 | action = np.random.choice(self.env_spec['actions']) 14 | return action 15 | 16 | def update(self, sys_vars): 17 | return 18 | 19 | def to_train(self, sys_vars): 20 | return True 21 | 22 | def train(self, sys_vars): 23 | return 24 | 25 | def build_model(self): 26 | return 27 | 28 | def compile_model(self): 29 | return 30 | 31 | 32 | class QTable(Agent): 33 | 34 | ''' 35 | The simplest Q learner - a table, 36 | with epsilon-greedy method and 37 | Bellman equation for value. 
38 | ''' 39 | 40 | def __init__(self, env_spec, 41 | resolution=10, 42 | gamma=0.95, lr=0.1, 43 | init_e=1.0, final_e=0.1, exploration_anneal_episodes=1000, 44 | **kwargs): # absorb generic param without breaking 45 | super(QTable, self).__init__(env_spec) 46 | self.resolution = resolution 47 | self.gamma = gamma 48 | self.lr = lr 49 | self.init_e = init_e 50 | self.final_e = final_e 51 | self.e = self.init_e 52 | self.exploration_anneal_episodes = exploration_anneal_episodes 53 | self.build_model() 54 | 55 | def build_model(self): 56 | ''' 57 | init the 2D qtable by 58 | bijecting the state space into pixelated, flattened vector 59 | multiplied with 60 | list of possible discrete actions 61 | ''' 62 | self.pixelate_state_space(self.resolution) 63 | flat_state_size = self.resolution ** self.env_spec['state_dim'] 64 | self.qtable = np.random.uniform( 65 | low=-1, high=1, 66 | size=(flat_state_size, self.env_spec['action_dim'])) 67 | return self.qtable 68 | 69 | def compile_model(self): 70 | return 71 | 72 | def pixelate_state_space(self, resolution=10): 73 | '''chunk up the state space hypercube to specified resolution''' 74 | state_bounds = np.transpose( 75 | [self.env_spec['state_bound_low'], 76 | self.env_spec['state_bound_high']]) 77 | self.state_pixels = [np.linspace(*sb, num=resolution+1) 78 | for sb in state_bounds] 79 | return self.state_pixels 80 | 81 | def flatten_state(self, state): 82 | ''' 83 | collapse a hyperdim state by binning into state_pixels 84 | then flattening the pixel_state into 1-dim bijection 85 | ''' 86 | val_space_pairs = list(zip(state, self.state_pixels)) 87 | pixel_state = [np.digitize(*val_space) 88 | for val_space in val_space_pairs] # binning 89 | flat_state = int("".join([str(ps) for ps in pixel_state])) 90 | return flat_state 91 | 92 | def select_action(self, state): 93 | '''epsilon-greedy method''' 94 | if self.e > np.random.rand(): 95 | action = np.random.choice(self.env_spec['actions']) 96 | else: 97 | flat_state = self.flatten_state(state) 98 | action = np.argmax(self.qtable[flat_state, :]) 99 | return action 100 | 101 | def update_e(self): 102 | '''strategy to update epsilon''' 103 | self.e = max(self.e - 104 | (self.init_e - self.final_e) / 105 | float(self.exploration_anneal_episodes), 106 | self.final_e) 107 | return self.e 108 | 109 | def update(self, sys_vars): 110 | self.update_e() 111 | 112 | def to_train(self, sys_vars): 113 | return True 114 | 115 | def train(self, sys_vars): 116 | ''' 117 | run the basic bellman equation update 118 | ''' 119 | last_exp = self.memory.pop() 120 | state = last_exp['states'][0] 121 | flat_state = self.flatten_state(state) 122 | next_state = last_exp['next_states'][0] 123 | next_flat_state = self.flatten_state(next_state) 124 | action = np.argmax(last_exp['actions'][0]) # from one-hot 125 | reward = last_exp['rewards'][0] 126 | Q_state_action = self.qtable[flat_state, action] 127 | Q_next_state = self.qtable[next_flat_state, :] 128 | Q_next_state_max = np.amax(Q_next_state) 129 | loss = (reward + self.gamma * Q_next_state_max - Q_state_action) 130 | sys_vars['loss'].append(loss) 131 | 132 | self.qtable[flat_state, action] = Q_state_action + \ 133 | self.lr * loss 134 | return self.qtable 135 | -------------------------------------------------------------------------------- /rl/analytics.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import pandas as pd 4 | import platform 5 | import warnings 6 | from os import environ 7 | from 
rl.util import * 8 | 9 | warnings.filterwarnings("ignore", module="matplotlib") 10 | 11 | if platform.system() == 'Darwin': 12 | MPL_BACKEND = 'agg' if args.param_selection else 'macosx' 13 | else: 14 | MPL_BACKEND = 'TkAgg' 15 | 16 | STATS_COLS = [ 17 | 'best_session_epi', 18 | 'best_session_id', 19 | 'best_session_mean_rewards', 20 | 'best_session_stability', 21 | 'fitness_score', 22 | 'mean_rewards_per_epi_stats_mean', 23 | 'mean_rewards_stats_mean', 24 | 'mean_rewards_stats_max', 25 | 'epi_stats_mean', 26 | 'epi_stats_min', 27 | 'solved_ratio_of_sessions', 28 | 'max_total_rewards_stats_mean', 29 | 'trial_id', 30 | ] 31 | 32 | EXPERIMENT_DATA_Y_COLS = [ 33 | 'fitness_score', 34 | 'mean_rewards_stats_max', 35 | 'max_total_rewards_stats_mean', 36 | 'epi_stats_min', 37 | ] 38 | 39 | 40 | # import matplotlib scoped to the class for gc in multiprocessing 41 | def scoped_mpl_import(): 42 | import matplotlib 43 | matplotlib.rcParams['backend'] = MPL_BACKEND 44 | 45 | import matplotlib.pyplot as plt 46 | plt.rcParams['toolbar'] = 'None' # mute matplotlib toolbar 47 | 48 | import seaborn as sns 49 | sns.set(style="whitegrid", color_codes=True, font_scale=1.0, 50 | rc={'lines.linewidth': 1.0, 51 | 'backend': matplotlib.rcParams['backend']}) 52 | palette = sns.color_palette("Blues_d") 53 | palette.reverse() 54 | sns.set_palette(palette) 55 | 56 | return (matplotlib, plt, sns) 57 | 58 | 59 | class Grapher(object): 60 | 61 | ''' 62 | Grapher object that belongs to a Session 63 | to draw graphs from its data 64 | ''' 65 | 66 | def __init__(self, session): 67 | if environ.get('CI'): 68 | return 69 | (_mpl, self.plt, _sns) = scoped_mpl_import() 70 | self.session = session 71 | self.graph_filename = self.session.graph_filename 72 | self.subgraphs = {} 73 | self.figure = self.plt.figure(facecolor='white', figsize=(8, 9)) 74 | self.figure.suptitle(wrap_text(self.session.session_id)) 75 | self.init_figure() 76 | 77 | def init_figure(self): 78 | # graph 1 79 | ax1 = self.figure.add_subplot( 80 | 311, 81 | frame_on=False, 82 | title="\n\ntotal rewards per episode", 83 | ylabel='total rewards') 84 | p1, = ax1.plot([], []) 85 | self.subgraphs['total rewards'] = (ax1, p1) 86 | 87 | ax1e = ax1.twinx() 88 | ax1e.set_ylabel('exploration rate').set_color('r') 89 | ax1e.set_frame_on(False) 90 | ax1e.grid(False) 91 | p1e, = ax1e.plot([], [], 'r') 92 | self.subgraphs['e'] = (ax1e, p1e) 93 | 94 | # graph 2 95 | ax2 = self.figure.add_subplot( 96 | 312, 97 | frame_on=False, 98 | title='mean rewards over last 100 episodes', 99 | ylabel='mean rewards') 100 | p2, = ax2.plot([], [], 'g') 101 | self.subgraphs['mean rewards'] = (ax2, p2) 102 | 103 | # graph 3 104 | ax3 = self.figure.add_subplot( 105 | 313, 106 | frame_on=False, 107 | title='loss over time, episode', 108 | ylabel='loss') 109 | p3, = ax3.plot([], []) 110 | self.subgraphs['loss'] = (ax3, p3) 111 | 112 | self.plt.tight_layout() # auto-fix spacing 113 | self.plt.ion() # for live plot 114 | 115 | def plot(self): 116 | '''do live plotting''' 117 | if environ.get('CI'): 118 | return 119 | sys_vars = self.session.sys_vars 120 | ax1, p1 = self.subgraphs['total rewards'] 121 | p1.set_ydata(sys_vars['total_rewards_history']) 122 | p1.set_xdata(np.arange(len(p1.get_ydata()))) 123 | ax1.relim() 124 | ax1.autoscale_view(tight=True, scalex=True, scaley=True) 125 | 126 | ax1e, p1e = self.subgraphs['e'] 127 | p1e.set_ydata(sys_vars['explore_history']) 128 | p1e.set_xdata(np.arange(len(p1e.get_ydata()))) 129 | ax1e.relim() 130 | ax1e.autoscale_view(tight=True, scalex=True, 
scaley=True) 131 | 132 | ax2, p2 = self.subgraphs['mean rewards'] 133 | p2.set_ydata(sys_vars['mean_rewards_history']) 134 | p2.set_xdata(np.arange(len(p2.get_ydata()))) 135 | ax2.relim() 136 | ax2.autoscale_view(tight=True, scalex=True, scaley=True) 137 | 138 | ax3, p3 = self.subgraphs['loss'] 139 | p3.set_ydata(sys_vars['loss']) 140 | p3.set_xdata(np.arange(len(p3.get_ydata()))) 141 | ax3.relim() 142 | ax3.autoscale_view(tight=True, scalex=True, scaley=True) 143 | 144 | self.plt.draw() 145 | self.plt.pause(0.01) 146 | self.save() 147 | import gc 148 | gc.collect() 149 | 150 | def save(self): 151 | '''save graph to filename''' 152 | self.figure.savefig(self.graph_filename) 153 | 154 | def clear(self): 155 | if environ.get('CI'): 156 | return 157 | self.plt.close() 158 | del_self_attr(self) 159 | 160 | 161 | def calc_stability(sys_vars): 162 | ''' 163 | calculate the stability of a session using its sys_vars 164 | when problem is unsolved (unbounded), use 1 sigma 95% of max 165 | stability 1 = perfectly stable 166 | 0.5 = half-ish unstable 167 | 0 = totally unstable, cannot yield solution 168 | ''' 169 | total_r_history = sys_vars['total_rewards_history'] 170 | if sys_vars['SOLVED_MEAN_REWARD'] is None: 171 | min_rewards = min(total_r_history) 172 | max_rewards = max(total_r_history) 173 | rewards_gap = max_rewards - min_rewards 174 | r_threshold = max_rewards - (0.10 * rewards_gap) 175 | else: 176 | r_threshold = sys_vars['SOLVED_MEAN_REWARD'] 177 | # find index i.e. epi of first solved 178 | first_solved_epi = next( 179 | (idx for idx, total_r in enumerate(total_r_history) 180 | if total_r > r_threshold), None) 181 | last_epi = sys_vars['epi'] 182 | stable_epi_count = len([ 183 | total_r for total_r in total_r_history if total_r > r_threshold]) 184 | 185 | if (first_solved_epi is None) or (last_epi == first_solved_epi): 186 | mastery_gap = np.inf 187 | else: # get max if mastery_gap is smaller (faster) than needed - perfect 188 | mastery_gap = last_epi - first_solved_epi 189 | stability = stable_epi_count / mastery_gap 190 | return stability 191 | 192 | 193 | def fitness_score(stats): 194 | ''' 195 | calculate the fitness score (see doc Metrics for more) 196 | 1. solution rewards 197 | 2. solving speed: /epi 198 | 3. stability 199 | 4. consistency 200 | 5. granularity 201 | 6. amplification of good results 202 | 7. 
distinguishability 203 | ''' 204 | mean_rewards_per_epi = stats['mean_rewards_per_epi_stats']['mean'] 205 | stability = stats['stability_stats']['mean'] 206 | consistency = stats['solved_ratio_of_sessions'] 207 | amplifier = (1+stability)*((1+consistency)**2) 208 | distinguisher = amplifier ** np.sign(mean_rewards_per_epi) 209 | fitness = mean_rewards_per_epi * distinguisher 210 | return fitness 211 | 212 | 213 | def ideal_fitness_score(problem): 214 | ''' 215 | calculate the ideal fitness_score with perfect solved ratio 216 | for hyperparameter optimization to select 217 | ''' 218 | if problem['SOLVED_MEAN_REWARD'] is None: 219 | return np.inf # for unsolved environments 220 | solved_mean_reward = problem['SOLVED_MEAN_REWARD'] 221 | max_episodes = problem['MAX_EPISODES'] 222 | solved_epi_speedup = 3 223 | ideal_epi = max_episodes / solved_epi_speedup 224 | ideal_mean_rewards_per_epi = solved_mean_reward / ideal_epi 225 | ideal_stability = 1 226 | ideal_consistency = 1 227 | amplifier = (1+ideal_stability)*((1+ideal_consistency)**2) 228 | distinguisher = amplifier ** np.sign(ideal_mean_rewards_per_epi) 229 | ideal_fitness = ideal_mean_rewards_per_epi * distinguisher 230 | return ideal_fitness 231 | 232 | 233 | def basic_stats(array): 234 | '''generate the basic stats for a numerical array''' 235 | if not len(array): 236 | return None 237 | return { 238 | 'min': np.min(array).astype(float), 239 | 'max': np.max(array).astype(float), 240 | 'mean': np.mean(array).astype(float), 241 | 'std': np.std(array).astype(float), 242 | } 243 | 244 | 245 | def compose_data(trial): 246 | ''' 247 | compose raw data from an trial object 248 | into useful summary and full metrics for analysis 249 | ''' 250 | sys_vars_array = trial.data['sys_vars_array'] 251 | 252 | # collect all data from sys_vars_array 253 | solved_sys_vars_array = list(filter( 254 | lambda sv: sv['solved'], sys_vars_array)) 255 | errored_array = list(map( 256 | lambda sv: sv['errored'], sys_vars_array)) 257 | mean_rewards_array = np.array(list(map( 258 | lambda sv: sv['mean_rewards'], sys_vars_array))) 259 | max_total_rewards_array = np.array(list(map( 260 | lambda sv: np.max(sv['total_rewards_history']), sys_vars_array))) 261 | epi_array = np.array(list(map(lambda sv: sv['epi'], sys_vars_array))) 262 | mean_rewards_per_epi_array = np.divide(mean_rewards_array, epi_array + 1) 263 | stability_array = list(map(calc_stability, sys_vars_array)) 264 | t_array = np.array(list(map(lambda sv: sv['t'], sys_vars_array))) 265 | time_taken_array = np.array(list(map( 266 | lambda sv: timestamp_elapse_to_seconds(sv['time_taken']), 267 | sys_vars_array))) 268 | solved_epi_array = np.array(list(map( 269 | lambda sv: sv['epi'], solved_sys_vars_array))) 270 | solved_t_array = np.array(list(map( 271 | lambda sv: sv['t'], solved_sys_vars_array))) 272 | solved_time_taken_array = np.array(list(map( 273 | lambda sv: timestamp_elapse_to_seconds(sv['time_taken']), 274 | solved_sys_vars_array))) 275 | best_idx = list(mean_rewards_per_epi_array).index( 276 | max(mean_rewards_per_epi_array)) 277 | best_session_id = '{}_s{}'.format(trial.data['trial_id'], best_idx) 278 | 279 | # compose sys_vars stats 280 | stats = { 281 | 'best_session_epi': epi_array.tolist()[best_idx], 282 | 'best_session_id': best_session_id, 283 | 'best_session_mean_rewards': mean_rewards_array[best_idx], 284 | 'best_session_stability': stability_array[best_idx], 285 | 'errored': any(errored_array), 286 | 'epi_stats': basic_stats(epi_array), 287 | 'max_total_rewards_stats': 
basic_stats(max_total_rewards_array), 288 | 'mean_rewards_stats': basic_stats(mean_rewards_array), 289 | 'mean_rewards_per_epi_stats': basic_stats( 290 | mean_rewards_per_epi_array), 291 | 'num_of_sessions': len(sys_vars_array), 292 | 'solved_epi_stats': basic_stats(solved_epi_array), 293 | 'solved_num_of_sessions': len(solved_sys_vars_array), 294 | 'solved_ratio_of_sessions': float(len( 295 | solved_sys_vars_array)) / trial.times, 296 | 'solved_t_stats': basic_stats(solved_t_array), 297 | 'solved_time_taken_stats': basic_stats(solved_time_taken_array), 298 | 'stability_stats': basic_stats(stability_array), 299 | 't_stats': basic_stats(t_array), 300 | 'time_taken_stats': basic_stats(time_taken_array), 301 | } 302 | stats.update({ 303 | 'fitness_score': fitness_score(stats) 304 | }) 305 | 306 | # summary metrics picked from stats 307 | metrics = { 308 | 'best_session_epi': stats['best_session_epi'], 309 | 'best_session_id': stats['best_session_id'], 310 | 'best_session_mean_rewards': stats['best_session_mean_rewards'], 311 | 'best_session_stability': stats['best_session_stability'], 312 | 'fitness_score': stats['fitness_score'], 313 | 'mean_rewards_per_epi_stats_mean': stats[ 314 | 'mean_rewards_per_epi_stats']['mean'], 315 | 'solved_ratio_of_sessions': stats['solved_ratio_of_sessions'], 316 | 't_stats_mean': stats['t_stats']['mean'], 317 | } 318 | 319 | # param variables for independent vars of trials 320 | default_param = trial.experiment_spec['param'] 321 | param_variables = { 322 | pv: default_param[pv] for 323 | pv in trial.param_variables if pv in default_param} 324 | 325 | trial.data['metrics'].update(metrics) 326 | trial.data['param_variables'] = param_variables 327 | trial.data['stats'] = stats 328 | return trial.data 329 | 330 | 331 | # order a unique df categorical data for plotting 332 | def order_category(uniq_df): 333 | uniq_list = list(uniq_df) 334 | try: 335 | uniq_dict = {k: json.loads(k) for k in uniq_list} 336 | sorted_pair = sorted(uniq_dict.items(), key=lambda x: x[1]) 337 | return [pair[0] for pair in sorted_pair] 338 | except (json.JSONDecodeError, TypeError): 339 | return list(sorted(uniq_list)) 340 | 341 | 342 | # plot the experiment data from data_df 343 | # X are columns with name starting with 'variable_' 344 | # Y cols are defined below 345 | def plot_experiment(data_df, trial_id): 346 | if len(data_df) < 2: # no multi selection 347 | return 348 | (_mpl, _plt, sns) = scoped_mpl_import() 349 | experiment_id = parse_experiment_id(trial_id) 350 | hue = 'solved_ratio_of_sessions' 351 | data_df = data_df.sort_values(hue) 352 | fitness_hue = 'fitness_score_bin' 353 | data_df[fitness_hue] = pd.cut(data_df['fitness_score'], bins=5) 354 | X_cols = list(filter(lambda c: c.startswith('variable_'), data_df.columns)) 355 | col_size = len(X_cols) 356 | row_size = len(EXPERIMENT_DATA_Y_COLS) 357 | groups = data_df.groupby(hue) 358 | 359 | # for main grid plot 360 | sns_only = True 361 | big_fig, axes = sns.plt.subplots( 362 | row_size, col_size, figsize=(col_size*4, row_size*3), 363 | sharex='col', sharey='row') 364 | for ix, x in enumerate(X_cols): 365 | for iy, y in enumerate(EXPERIMENT_DATA_Y_COLS): 366 | big_ax = axes[iy] if col_size == 1 else axes[iy][ix] 367 | uniq_df = data_df[x].unique() 368 | if (data_df[x].dtype.name == 'category' or 369 | len(uniq_df) <= 5): 370 | order = order_category(uniq_df) 371 | sns.swarmplot( 372 | data=data_df, x=x, y=y, hue=hue, size=3, 373 | order=order, ax=big_ax) 374 | else: 375 | sns_only = False 376 | big_ax.margins(0.05) 377 | 
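                # [Editor's note] this else branch is the fallback for continuous
                # or high-cardinality X columns, where the swarmplot above is not
                # meaningful: it draws one marker-only scatter series per
                # solved_ratio_of_sessions group (from the groupby above), and the
                # per-axis legends are cleared in favour of the shared legend
                # added after the grid loop.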
big_ax.xaxis.grid(False) 378 | for _, group in groups: 379 | big_ax.plot(group[x], group[y], label=hue, 380 | marker='o', ms=3, linestyle='') 381 | big_ax.set_xlabel(x) 382 | big_ax.set_ylabel(y) 383 | 384 | big_ax.legend_ = None # set common legend below 385 | # label only left and bottom axes 386 | if iy != row_size - 1: 387 | big_ax.set_xlabel('') 388 | if ix != 0: 389 | big_ax.set_ylabel('') 390 | 391 | big_fig.tight_layout() 392 | big_fig.suptitle(wrap_text(experiment_id)) 393 | legend_labels = None if sns_only else sorted(data_df[hue].unique()) 394 | legend_ms = 0.5 if sns_only else 1 395 | legend = sns.plt.legend(title='solved_ratio_of_sessions', 396 | labels=legend_labels, markerscale=legend_ms, 397 | fontsize=10, loc='center right', 398 | bbox_to_anchor=(1.1+col_size*0.1, row_size+0.1)) 399 | legend.get_title().set_fontsize('10') 400 | big_fig.subplots_adjust(top=0.96, right=0.9) 401 | 402 | filename = './data/{0}/{0}_analysis.png'.format( 403 | experiment_id) 404 | big_fig.savefig(filename) 405 | big_fig.clear() 406 | 407 | # use numerical, since contour only makes sense for ordered azes 408 | numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] 409 | numeric_X_cols = list( 410 | filter(lambda x: data_df[x].dtype in numerics, X_cols)) 411 | with sns.axes_style('white', {'axes.linewidth': 0.2}): 412 | g = sns.pairplot( 413 | data_df, vars=numeric_X_cols, hue=fitness_hue, 414 | size=3, aspect=1, plot_kws={'s': 50, 'alpha': 0.5}) 415 | g.fig.suptitle(wrap_text(experiment_id)) 416 | g = g.add_legend() 417 | filename = './data/{0}/{0}_analysis_correlation.png'.format( 418 | experiment_id) 419 | g.savefig(filename) 420 | g.fig.clear() 421 | 422 | sns.plt.close() 423 | 424 | 425 | def analyze_data(experiment_data_or_experiment_id): 426 | ''' 427 | get all the data from all trials.run() 428 | or read from all data files matching the prefix of trial_id 429 | e.g. 
usage without running: 430 | experiment_id = 'DevCartPole-v0_DQN_LinearMemoryWithForgetting_BoltzmannPolicy_2017-01-15_142810' 431 | analyze_data(experiment_id) 432 | ''' 433 | if isinstance(experiment_data_or_experiment_id, str): 434 | experiment_data = load_data_array_from_experiment_id( 435 | experiment_data_or_experiment_id) 436 | else: 437 | experiment_data = experiment_data_or_experiment_id 438 | 439 | stats_array, param_variables_array = [], [] 440 | for data in experiment_data: 441 | stats = flatten_dict(data['stats']) 442 | stats.update({'trial_id': data['trial_id']}) 443 | param_variables = flat_cast_dict(data['param_variables']) 444 | if stats['errored']: # remove errored trials 445 | continue 446 | stats_array.append(stats) 447 | param_variables_array.append(param_variables) 448 | 449 | raw_stats_df = pd.DataFrame.from_dict(stats_array) 450 | stats_df = raw_stats_df[STATS_COLS] 451 | 452 | param_variables_df = pd.DataFrame.from_dict(param_variables_array) 453 | param_variables_df.columns = [ 454 | 'variable_'+c for c in param_variables_df.columns] 455 | 456 | data_df = pd.concat([stats_df, param_variables_df], axis=1) 457 | for c in data_df.columns: 458 | if data_df[c].dtype == object: # guard 459 | data_df[c] = data_df[c].astype('category') 460 | 461 | data_df.sort_values( 462 | ['fitness_score'], ascending=False, inplace=True) 463 | data_df.reset_index(drop=True, inplace=True) 464 | 465 | trial_id = experiment_data[0]['trial_id'] 466 | save_experiment_data(data_df, trial_id) 467 | plot_experiment(data_df, trial_id) 468 | return data_df 469 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/base_hyperoptimizer.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import multiprocessing as mp 3 | import os 4 | import time 5 | from collections import OrderedDict 6 | from rl.util import logger, timestamp, PARALLEL_PROCESS_NUM, debug_mem_usage 7 | 8 | 9 | class HyperOptimizer(object): 10 | 11 | ''' 12 | The base class of hyperparam optimizer, with core methods 13 | read about it on the documentation 14 | input: Trial (and some specs), param space P (as standardized specs) 15 | Algo: 16 | 1. search the next p in P using its internal search algo, 17 | add to its internal `param_search_list` 18 | 2. run a (slow) function Trial(p) = score (inside trial data) 19 | 3. update search using feedback score 20 | 4. 
repeat till max steps or fitness condition met 21 | 22 | it will be ran by the experiment as: 23 | hyperopt = HyperOptimizer(Trial, **experiment_kwargs) 24 | experiment_data = hyperopt.run() 25 | ''' 26 | 27 | def __init__(self, Trial, **kwargs): 28 | self.Trial = Trial 29 | self.REQUIRED_ARGS = [ 30 | 'experiment_spec', 31 | 'experiment_id_override', 32 | 'times' 33 | ] 34 | self.PARALLEL_PROCESS_NUM = PARALLEL_PROCESS_NUM 35 | self.free_cpu = self.PARALLEL_PROCESS_NUM # for parallel run 36 | logger.info('Initialize {}'.format(self.__class__.__name__)) 37 | self.set_keys(**kwargs) 38 | self.init_search() 39 | 40 | def set_keys(self, **kwargs): 41 | assert all(k in kwargs for k in self.REQUIRED_ARGS), \ 42 | 'kwargs do not have all REQUIRED_ARGS' 43 | for k in kwargs: 44 | setattr(self, k, kwargs[k]) 45 | 46 | self.experiment_name = self.experiment_spec.get('experiment_name') 47 | self.run_timestamp = timestamp() 48 | self.experiment_id = self.experiment_id_override or '{}-{}'.format( 49 | self.experiment_name, self.run_timestamp) 50 | self.experiment_data = [] 51 | self.param_search_list = [] 52 | # the index of next param to try in param_search_list 53 | self.next_trial_num = len(self.param_search_list) 54 | 55 | self.default_param = self.experiment_spec['param'] 56 | unordered_param_range = self.experiment_spec['param_range'] 57 | # import ordering for param_range for search serialization 58 | self.param_range = OrderedDict(sorted(unordered_param_range.items())) 59 | self.param_range_keys = sorted(self.param_range.keys()) 60 | 61 | def compose_experiment_spec(self, param): 62 | new_experiment_spec = copy.deepcopy(self.experiment_spec) 63 | new_experiment_spec.pop('param_range', None) 64 | new_experiment_spec.update({ 65 | 'param': param, 66 | }) 67 | return new_experiment_spec 68 | 69 | def init_search(self): 70 | '''initialize the search algo and the search space''' 71 | raise NotImplementedError() 72 | 73 | def search(self): 74 | ''' 75 | algo step 1, search and return the next p for Trial(p), 76 | Its only job is to append to (or modify) 77 | its internal self.param_search_list using its search logic 78 | It may refer to self.experiment_data as search memory 79 | and whatever new pointer or special memory implemented by a HyperOptimizer class 80 | ''' 81 | raise NotImplementedError() 82 | 83 | def next_param(self): 84 | '''retrieve trial_num and param, advance the class next_trial_num''' 85 | assert self.next_trial_num < len(self.param_search_list), \ 86 | 'param_search_list expansion cannot keep up with next_trial_num' 87 | trial_num = self.next_trial_num 88 | param = self.param_search_list[self.next_trial_num] 89 | self.next_trial_num = self.next_trial_num + 1 90 | return (trial_num, param) 91 | 92 | def run_trial(self, trial_num, param): 93 | ''' 94 | algo step 2, construct and run Trial with the next param 95 | args trial_num, param must be provided externally, 96 | otherwise they will not progress within mp.process 97 | ''' 98 | experiment_spec = self.compose_experiment_spec(param) 99 | trial = self.Trial( 100 | experiment_spec, trial_num=trial_num, 101 | times=self.times, 102 | num_of_trials=self.num_of_trials, 103 | run_timestamp=self.run_timestamp, 104 | experiment_id_override=self.experiment_id_override) 105 | trial_data = trial.run() 106 | del trial 107 | import gc 108 | gc.collect() 109 | debug_mem_usage() 110 | return trial_data 111 | 112 | # retrieve the trial_num, param, fitness_score from trial_data 113 | @classmethod 114 | def get_fitness(cls, trial_data): 115 | 
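        # [Editor's note -- illustrative sketch, not part of the original file]
        # Stripped of the multiprocessing pool, run() below is essentially this
        # serial loop (using only this class's own methods):
        #
        #   while not self.to_terminate():
        #       self.search()                                   # step 1: extend param_search_list
        #       trial_num, param = self.next_param()
        #       trial_data = self.run_trial(trial_num, param)   # step 2: slow Trial(p)
        #       self.post_search(trial_data)                    # step 3: record + update_search()
        #
        # get_fitness() itself just unpacks a finished trial: the trailing token
        # of trial_id is taken as the trial number, and fitness_score comes from
        # the trial's computed metrics.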
trial_id = trial_data['trial_id'] 116 | trial_num = trial_id.split('_').pop() 117 | param = trial_data['experiment_spec']['param'] 118 | metrics = trial_data['metrics'] 119 | fitness_score = metrics['fitness_score'] 120 | return trial_num, param, fitness_score 121 | 122 | def update_search(self): 123 | '''algo step 3, update search algo using self.experiment_data''' 124 | raise NotImplementedError() 125 | 126 | def to_terminate(self): 127 | '''algo step 4, terminate when at max steps or fitness condition met''' 128 | raise NotImplementedError() 129 | 130 | # handler task after a search is complete from multiprocessing pool 131 | def post_search(self, trial_data): 132 | self.experiment_data.append(trial_data) 133 | self.update_search() 134 | self.free_cpu += 1 135 | 136 | @classmethod 137 | def pool_init(self): 138 | # you can never be too safe in multiprocessing gc 139 | import gc 140 | gc.collect() 141 | 142 | @classmethod 143 | def raise_error(cls, e): 144 | logger.error('Pool worker throws Exception') 145 | print(e.__cause__) 146 | time.sleep(1) 147 | os._exit(1) 148 | 149 | def run(self): 150 | ''' 151 | top level method to run the entire hyperoptimizer 152 | will gather and compose experiment_data, then return it 153 | ''' 154 | logger.info('Run {}'.format(self.__class__.__name__)) 155 | # crucial maxtasksperchild to free up memory by respawning worker 156 | pool = mp.Pool(self.PARALLEL_PROCESS_NUM, 157 | initializer=self.pool_init, maxtasksperchild=1) 158 | while (not self.to_terminate()): 159 | if self.free_cpu > 0: 160 | self.free_cpu -= 1 # update 161 | self.search() # add to self.param_search_list 162 | trial_num, param = self.next_param() 163 | pool.apply_async( 164 | self.run_trial, (trial_num, param), 165 | callback=self.post_search, error_callback=self.raise_error) 166 | else: 167 | pass # keep looping till free_cpu available 168 | time.sleep(0.02) # prevent cpu overwork from while loop 169 | pool.close() 170 | pool.join() 171 | return self.experiment_data 172 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/grid_search.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import itertools 3 | from rl.hyperoptimizer.line_search import LineSearch 4 | 5 | 6 | class GridSearch(LineSearch): 7 | 8 | def init_search(self): 9 | ''' 10 | convert a dict of param ranges into 11 | a list of cartesian products of param_range 12 | e.g. 
{'a': [1,2], 'b': [3]} into 13 | [{'a': 1, 'b': 3}, {'a': 2, 'b': 3}] 14 | note that this is order-preserving, as required by design 15 | ''' 16 | range_vals = self.param_range.values() 17 | for vals in itertools.product(*range_vals): 18 | param = copy.deepcopy(self.default_param) 19 | param.update(dict(zip(self.param_range_keys, vals))) 20 | self.param_search_list.append(param) 21 | self.num_of_trials = len(self.param_search_list) 22 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/line_search.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from rl.hyperoptimizer.base_hyperoptimizer import HyperOptimizer 3 | 4 | 5 | class LineSearch(HyperOptimizer): 6 | 7 | def init_search(self): 8 | ''' 9 | convert a dict of param ranges into 10 | a list parameter settings corresponding 11 | to a line search of the param range 12 | for each param 13 | All other parameters set to default vals 14 | note that this is order-preserving, as required by design 15 | ''' 16 | for key in self.param_range_keys: 17 | vals = self.param_range[key] 18 | for val in vals: 19 | param = copy.deepcopy(self.default_param) 20 | param[key] = val 21 | self.param_search_list.append(param) 22 | self.num_of_trials = len(self.param_search_list) 23 | 24 | def search(self): 25 | '''no action needed here for exhaustive trials''' 26 | return 27 | 28 | def update_search(self): 29 | '''no action needed here for exhaustive trials''' 30 | return 31 | 32 | def to_terminate(self): 33 | return not (self.next_trial_num < len(self.param_search_list)) 34 | -------------------------------------------------------------------------------- /rl/hyperoptimizer/random_search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | from rl.analytics import ideal_fitness_score 4 | from rl.hyperoptimizer.base_hyperoptimizer import HyperOptimizer 5 | from rl.util import PROBLEMS, to_json, logger 6 | 7 | 8 | class RandomSearch(HyperOptimizer): 9 | 10 | ''' 11 | Random Search by sampling on hysphere around a search path 12 | algo: 13 | 1. init x a random position in space 14 | 2. until termination (max_eval or fitness, e.g. 
solved all), do: 15 | 2.1 sample new pos some radius away: next_x = x + r 16 | 2.2 if f(next_x) > f(x) then set x = next_x 17 | 18 | Extra search memory units: 19 | - search_path 20 | - best_point 21 | 22 | save for experiment resume, search_history: 23 | - search_path 24 | - best_point 25 | - param_search_list 26 | ''' 27 | 28 | # # calculate the constant radius needed to traverse unit cube 29 | # def cube_traversal_radius(self): 30 | # traversal_diameter = 1/np.power(self.max_evals, 31 | # 1/self.search_dim) 32 | # traversal_radius = traversal_diameter/2 33 | # return traversal_radius 34 | 35 | def decay_search_radius(self): 36 | ''' 37 | start of half cube for diameter (0.25 radius) then decay 38 | at 100 searches, will shrink to 1/10 of initial radius 0.025 39 | clip to prevent going too small (0.01) 40 | ''' 41 | min_radius = 0.01 42 | linear_decay_rate = self.next_trial_num/10./self.PARALLEL_PROCESS_NUM 43 | self.search_radius = np.clip( 44 | self.init_search_radius / linear_decay_rate, 45 | min_radius, self.init_search_radius) 46 | 47 | @classmethod 48 | def sample_hypersphere(cls, dim, r=1): 49 | '''Marsaglia algo for sampling uniformly on a hypersphere''' 50 | v = np.random.randn(dim) 51 | v = v * r / np.linalg.norm(v) 52 | return v 53 | 54 | def sample_cube(self): 55 | return np.random.rand(self.search_dim) 56 | 57 | def sample_r(self): 58 | return self.sample_hypersphere( 59 | self.search_dim, self.search_radius) 60 | 61 | # biject [0, 1] to [x_min, x_max] 62 | def biject_continuous(self, norm_val, x_min, x_max): 63 | return np.around(norm_val*(x_max - x_min) + x_min, self.precision) 64 | 65 | # biject [0, 1] to x_list = [a, b, c, ...] by binning 66 | def biject_discrete(self, norm_val, x_list): 67 | list_len = len(x_list) 68 | inds = np.arange(list_len) 69 | cont_val = self.biject_continuous(norm_val, 0, list_len) 70 | ind = np.digitize(cont_val, inds) - 1 71 | return x_list[ind] 72 | 73 | # biject one dimension: [0, 1] to a param_range val 74 | def biject_dim(self, norm_val, dim_spec): 75 | if isinstance(dim_spec, list): # discrete 76 | return self.biject_discrete(norm_val, dim_spec) 77 | else: # cont 78 | return self.biject_continuous( 79 | norm_val, dim_spec['min'], dim_spec['max']) 80 | return 81 | 82 | # biject a vector on unit cube into a param in param_space 83 | def biject_param(self, v): 84 | param = {} 85 | for i, param_key in enumerate(self.param_range_keys): 86 | dim_spec = self.param_range[param_key] 87 | param[param_key] = self.biject_dim(v[i], dim_spec) 88 | return param 89 | 90 | def init_search(self): 91 | ''' 92 | Initialize the random search internal variables 93 | ''' 94 | self.max_evals = self.experiment_spec['param']['max_evals'] 95 | self.num_of_trials = self.max_evals 96 | self.search_dim = len(self.param_range_keys) 97 | self.precision = 4 # decimal roundoff biject_continuous 98 | self.search_radius = self.init_search_radius = 0.5 99 | self.search_count = 0 # number of times search() has ran 100 | self.search_exhausted = False 101 | self.search_path = [] 102 | self.best_point = { 103 | 'trial_num': None, 104 | 'param': None, 105 | 'x': self.sample_cube(), 106 | 'fitness_score': float('-inf'), 107 | } 108 | problem = PROBLEMS.get(self.experiment_spec['problem']) 109 | self.ideal_fitness_score = ideal_fitness_score(problem) 110 | logger.info( 111 | 'ideal_fitness_scrore: {}'.format(self.ideal_fitness_score)) 112 | 113 | self.filename = './data/{}/random_search_history.json'.format( 114 | self.experiment_id) 115 | if self.experiment_id_override is not 
None: 116 | self.load() # resume 117 | 118 | def search(self): 119 | ''' 120 | algo step 2.1 sample new pos some radius away: next_x = x + r 121 | update search_path and param_search_list 122 | ''' 123 | self.search_count += 1 124 | if self.next_trial_num < len(self.search_path): # resuming 125 | next_x = self.search_path[self.next_trial_num] 126 | next_param = self.param_search_list[self.next_trial_num] 127 | else: 128 | next_x = np.clip(self.best_point['x'] + self.sample_r(), 0., 1.) 129 | # check if too close to previously searched x 130 | distances = [np.linalg.norm(next_x - old_x) 131 | for old_x in self.search_path] 132 | distances = np.around(distances, self.precision) 133 | 134 | if self.search_count > (10 * self.max_evals): 135 | self.search_exhausted = True # exhausted search space 136 | next_param = self.biject_param(next_x) 137 | self.search_path.append(next_x) 138 | self.param_search_list.append(next_param) 139 | elif len(distances) > 0 and np.amin(distances) == 0: 140 | self.search() 141 | else: 142 | next_param = self.biject_param(next_x) 143 | self.search_path.append(next_x) 144 | self.param_search_list.append(next_param) 145 | 146 | def update_search(self): 147 | ''' 148 | algo step 2.2 if f(next_x) > f(x) then set x = next_x 149 | invoked right after the latest run_trial() 150 | update self.best_point 151 | ''' 152 | if (self.next_trial_num < self.PARALLEL_PROCESS_NUM or 153 | self.next_trial_num < len(self.search_path)): 154 | # yet to have history or still resuming from history 155 | return 156 | assert len(self.experiment_data) > 0, \ 157 | 'self.experiment_data must not be empty for update_search' 158 | 159 | self.decay_search_radius() 160 | 161 | x = self.search_path[-1] 162 | trial_data = self.experiment_data[-1] 163 | trial_num, param, fitness_score = self.get_fitness(trial_data) 164 | if fitness_score > self.best_point['fitness_score']: 165 | self.best_point = { 166 | 'trial_num': trial_num, 167 | 'param': param, 168 | 'x': x, 169 | 'fitness_score': fitness_score, 170 | } 171 | self.save() 172 | 173 | def save(self): 174 | search_history = { 175 | 'search_path': self.search_path, 176 | 'search_count': self.search_count, 177 | 'best_point': self.best_point, 178 | 'param_search_list': self.param_search_list, 179 | } 180 | with open(self.filename, 'w') as f: 181 | f.write(to_json(search_history)) 182 | logger.info( 183 | 'Save search history to {}'.format(self.filename)) 184 | return 185 | 186 | def load(self): 187 | try: 188 | search_history = json.loads(open(self.filename).read()) 189 | self.search_path = search_history['search_path'] 190 | self.best_point = search_history['best_point'] 191 | self.param_search_list = search_history['param_search_list'] 192 | logger.info('Load search history from {}'.format(self.filename)) 193 | except (FileNotFoundError, json.JSONDecodeError): 194 | logger.info( 195 | 'Fail to load search history from {}'.format(self.filename)) 196 | return None 197 | 198 | def satisfy_fitness(self): 199 | ''' 200 | break on the first strong solution 201 | ''' 202 | best_fitness_score = self.best_point['fitness_score'] 203 | if self.next_trial_num < self.PARALLEL_PROCESS_NUM: 204 | return False 205 | elif best_fitness_score > self.ideal_fitness_score: 206 | logger.info( 207 | 'fitness_score {} > ideal_fitness_score {}, ' 208 | 'could terminate early'.format( 209 | best_fitness_score, self.ideal_fitness_score)) 210 | # return True 211 | # TODO fix ideal_fitness_score 212 | return False 213 | else: 214 | return False 215 | 216 | def 
to_terminate(self): 217 | return (self.search_exhausted or 218 | self.next_trial_num >= self.max_evals or 219 | self.satisfy_fitness()) 220 | -------------------------------------------------------------------------------- /rl/memory/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/memory/base_memory.py: -------------------------------------------------------------------------------- 1 | class Memory(object): 2 | 3 | ''' 4 | The base class of Memory, with the core methods 5 | ''' 6 | 7 | def __init__(self, env_spec, **kwargs): # absorb generic param without breaking 8 | '''Construct externally, and set at Agent.compile()''' 9 | self.env_spec = env_spec 10 | self.agent = None 11 | self.state = None 12 | 13 | def reset_state(self, init_state): 14 | '''reset the state of LinearMemory per episode env.reset()''' 15 | self.state = init_state 16 | 17 | def add_exp(self, action, reward, next_state, terminal): 18 | '''add an experience''' 19 | raise NotImplementedError() 20 | 21 | def get_exp(self, inds): 22 | '''get a batch of experiences by indices''' 23 | raise NotImplementedError() 24 | 25 | def pop(self): 26 | '''get the last experience (batched like get_exp()''' 27 | raise NotImplementedError() 28 | 29 | def size(self): 30 | '''get a batch of experiences by indices''' 31 | raise NotImplementedError() 32 | 33 | def rand_minibatch(self, size): 34 | '''get a batch of experiences by indices''' 35 | raise NotImplementedError() 36 | 37 | def update(self, updates): 38 | '''update elements of the memory as requires''' 39 | raise NotImplementedError() 40 | -------------------------------------------------------------------------------- /rl/memory/linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.memory.base_memory import Memory 3 | from rl.util import log_self 4 | from scipy.stats import halfnorm 5 | 6 | 7 | class LinearMemory(Memory): 8 | 9 | ''' 10 | The replay memory used for random minibatch training 11 | ''' 12 | 13 | # absorb generic param without breaking 14 | def __init__(self, env_spec, **kwargs): 15 | super(LinearMemory, self).__init__(env_spec) 16 | self.exp_keys = [ 17 | 'states', 'actions', 'rewards', 'next_states', 'terminals'] 18 | self.exp = {k: [] for k in self.exp_keys} 19 | log_self(self) 20 | 21 | def encode_action(self, action): 22 | '''encode action based on continuous/discrete before adding''' 23 | if self.agent.env_spec['actions'] == 'continuous': 24 | return action 25 | else: # do one-hot encoding 26 | action_arr = np.zeros(self.agent.env_spec['action_dim']) 27 | action_arr[action] = 1 28 | return action_arr 29 | 30 | def add_exp(self, action, reward, next_state, terminal): 31 | ''' 32 | after the env.step(a) that returns s', r, 33 | using the previously stored state for the s, 34 | form an experience tuple 35 | ''' 36 | self.exp['states'].append(self.state) 37 | self.exp['actions'].append(self.encode_action(action)) 38 | self.exp['rewards'].append(reward) 39 | self.exp['next_states'].append(next_state) 40 | self.exp['terminals'].append(int(terminal)) 41 | self.state = next_state 42 | 43 | def _get_exp(self, exp_name, inds): 44 | return np.array([self.exp[exp_name][i] for i in inds]) 45 | 46 | def get_exp(self, inds): 47 | return {k: 
self._get_exp(k, inds) for k in self.exp_keys} 48 | 49 | def pop(self): 50 | '''convenient method to get exp at [last_ind]''' 51 | assert self.size() > 0, 'memory is empty, cannot pop' 52 | return self.get_exp([self.size() - 1]) 53 | 54 | def size(self): 55 | return len(self.exp['rewards']) 56 | 57 | def rand_minibatch(self, size): 58 | '''plain random sampling''' 59 | memory_size = self.size() 60 | rand_inds = np.random.randint(memory_size, size=size) 61 | minibatch = self.get_exp(rand_inds) 62 | return minibatch 63 | 64 | def update(self, updates): 65 | pass 66 | 67 | 68 | class LinearMemoryWithForgetting(LinearMemory): 69 | 70 | ''' 71 | Linear memory with uniform sampling, retaining last 50k experiences 72 | ''' 73 | 74 | def __init__(self, env_spec, max_mem_len=50000, 75 | **kwargs): # absorb generic param without breaking 76 | super(LinearMemoryWithForgetting, self).__init__(env_spec) 77 | self.max_mem_len = max_mem_len 78 | 79 | def trim_exp(self): 80 | '''The forgetting mechanism''' 81 | if (self.size() > self.max_mem_len): 82 | for k in self.exp_keys: 83 | del self.exp[k][0] 84 | 85 | def add_exp(self, action, reward, next_state, terminal): 86 | ''' 87 | add exp as usual, but preserve only the recent episodes 88 | ''' 89 | super(LinearMemoryWithForgetting, self).add_exp( 90 | action, reward, next_state, terminal) 91 | self.trim_exp() 92 | 93 | 94 | class LeftTailMemory(LinearMemory): 95 | 96 | ''' 97 | Memory with sampling via a left-tail distribution 98 | ''' 99 | 100 | def rand_minibatch(self, size): 101 | ''' 102 | get a minibatch of random exp for training 103 | use simple memory decay, i.e. sample with a left tail 104 | distribution to draw more from latest memory 105 | then append with the most recent, untrained experience 106 | ''' 107 | memory_size = self.size() 108 | new_exp_size = self.agent.train_per_n_new_exp 109 | if memory_size <= size or memory_size <= new_exp_size: 110 | inds = np.random.randint(memory_size, size=size) 111 | else: 112 | new_memory_ind = max(0, memory_size - new_exp_size) 113 | old_memory_ind = max(0, new_memory_ind - 1) 114 | latest_inds = np.arange(new_memory_ind, memory_size) 115 | random_batch_size = size - new_exp_size 116 | rand_inds = (old_memory_ind - halfnorm.rvs( 117 | size=random_batch_size, 118 | scale=float(old_memory_ind)*0.80).astype(int)) 119 | inds = np.concatenate([rand_inds, latest_inds]).clip(0) 120 | minibatch = self.get_exp(inds) 121 | return minibatch 122 | -------------------------------------------------------------------------------- /rl/memory/prioritized_exp_replay.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.memory.linear import LinearMemoryWithForgetting 3 | 4 | 5 | class PrioritizedExperienceReplay(LinearMemoryWithForgetting): 6 | 7 | ''' 8 | Replay memory with random sampling weighted by the absolute 9 | size of the value function error 10 | 11 | Adapted from https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py 12 | memory unit 13 | ''' 14 | 15 | def __init__(self, env_spec, max_mem_len=None, e=0.01, alpha=0.6, 16 | **kwargs): 17 | if max_mem_len is None: # auto calculate mem len 18 | max_timestep = env_spec['timestep_limit'] 19 | max_epis = env_spec['problem']['MAX_EPISODES'] 20 | memory_epi = np.ceil(max_epis / 3.).astype(int) 21 | max_mem_len = max(10**6, max_timestep * memory_epi) 22 | super(PrioritizedExperienceReplay, self).__init__( 23 | env_spec, max_mem_len) 24 | self.exp_keys.append('error') 25 | self.exp = {k: [] for k 
in self.exp_keys} # reinit with added mem key 26 | # Prevents experiences with error of 0 from being replayed 27 | self.e = e 28 | # Controls how spiked the distribution is. alpha = 0 means uniform 29 | self.alpha = alpha 30 | self.curr_data_inds = None 31 | self.curr_tree_inds = None 32 | self.prio_tree = SumTree(self.max_mem_len) 33 | self.head = 0 34 | 35 | def get_priority(self, error): 36 | # add min_priority to prevent root of negative = complex 37 | p = (error + self.e) ** self.alpha 38 | assert np.isfinite(p) 39 | return p 40 | 41 | def add_exp(self, action, reward, next_state, terminal): 42 | '''Round robin memory updating''' 43 | # init error to reward first, update later 44 | error = abs(reward) 45 | p = self.get_priority(error) 46 | 47 | if self.size() < self.max_mem_len: # add as usual 48 | super(PrioritizedExperienceReplay, self).add_exp( 49 | action, reward, next_state, terminal) 50 | self.exp['error'].append(error) 51 | else: # replace round robin 52 | self.exp['states'][self.head] = self.state 53 | self.exp['actions'][self.head] = self.encode_action(action) 54 | self.exp['rewards'][self.head] = reward 55 | self.exp['next_states'][self.head] = next_state 56 | self.exp['terminals'][self.head] = int(terminal) 57 | self.exp['error'][self.head] = error 58 | self.state = next_state 59 | 60 | self.head += 1 61 | if self.head >= self.max_mem_len: 62 | self.head = 0 # reset for round robin 63 | 64 | self.prio_tree.add(p) 65 | 66 | assert self.head == self.prio_tree.head, 'prio_tree head is wrong' 67 | 68 | def rand_minibatch(self, size): 69 | '''random sampling weighted by priority''' 70 | self.curr_tree_inds, self.curr_data_inds = self.select_prio_inds(size) 71 | minibatch = self.get_exp(self.curr_data_inds) 72 | return minibatch 73 | 74 | def select_prio_inds(self, size): 75 | tree_inds = [] 76 | data_inds = [] 77 | segment = self.prio_tree.total() / size 78 | 79 | for i in range(size): 80 | a = segment * i 81 | b = segment * (i + 1) 82 | 83 | s = np.random.uniform(a, b) 84 | t_idx, d_idx = self.prio_tree.get(s) 85 | tree_inds.append(t_idx) 86 | data_inds.append(d_idx) 87 | 88 | return tree_inds, data_inds 89 | 90 | def update(self, updates): 91 | for i, u in enumerate(updates): 92 | t_idx = self.curr_tree_inds[i] 93 | d_idx = self.curr_data_inds[i] 94 | p = self.get_priority(u) 95 | self.prio_tree.update(t_idx, p) 96 | self.exp['error'][d_idx] = u 97 | 98 | 99 | class SumTree(object): 100 | 101 | ''' 102 | Adapted from https://github.com/jaara/AI-blog/blob/master/SumTree.py 103 | See https://jaromiru.com/2016/11/07/lets-make-a-dqn-double-learning-and-prioritized-experience-replay/ 104 | for a good introduction to PER 105 | ''' 106 | 107 | def __init__(self, capacity): 108 | self.capacity = capacity 109 | self.tree = np.zeros(2*capacity - 1) 110 | self.head = 0 111 | 112 | def _propagate(self, idx, change): 113 | parent = (idx - 1) // 2 114 | self.tree[parent] += change 115 | if parent != 0: 116 | self._propagate(parent, change) 117 | 118 | def _retrieve(self, idx, s): 119 | left = 2 * idx + 1 120 | right = left + 1 121 | 122 | if left >= len(self.tree): 123 | return idx 124 | 125 | if s <= self.tree[left]: 126 | return self._retrieve(left, s) 127 | else: 128 | return self._retrieve(right, s-self.tree[left]) 129 | 130 | def total(self): 131 | return self.tree[0] 132 | 133 | def add(self, p): 134 | idx = self.head + self.capacity - 1 135 | self.update(idx, p) 136 | self.head += 1 137 | if self.head >= self.capacity: 138 | self.head = 0 139 | 140 | def update(self, idx, p): 141 | 
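        # [Editor's note -- illustrative sketch, not part of the original file]
        # update() and get() give O(log capacity) proportional sampling: leaves
        # hold priorities, internal nodes hold subtree sums, and _retrieve()
        # descends by comparing s against the left-subtree sum, so a uniform
        # s in [0, total()) selects leaf i with probability p_i / total().
        # A tiny run with toy priorities (assumed values, not Lab defaults):
        #
        #   tree = SumTree(4)
        #   for p in (1., 2., 3., 4.):    # priorities of 4 stored experiences
        #       tree.add(p)
        #   tree.total()                  # -> 10.0
        #   tree.get(0.5)                 # -> (tree index 3, data index 0), the p=1. leaf
        #   tree.get(9.5)                 # -> (tree index 6, data index 3), the p=4. leaf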
change = p - self.tree[idx] 142 | self.tree[idx] = p 143 | self._propagate(idx, change) 144 | 145 | def get(self, s): 146 | idx = self._retrieve(0, s) 147 | data_idx = idx - self.capacity + 1 148 | return idx, data_idx 149 | -------------------------------------------------------------------------------- /rl/memory/ranked.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.memory.linear import LinearMemory 3 | from rl.util import log_self 4 | import math 5 | 6 | 7 | class HighLowMemory(LinearMemory): 8 | 9 | ''' 10 | Memory divided into two: good and bad experiences 11 | As with RankedMemory experiences are grouped episodically 12 | Episodes with a total reward > threshold are assigned to good memory 13 | The threshold is recomputed every n episodes and 14 | episodes are reassigned accordingly. 15 | Memories are sampled from good experiences with a self.prob_high 16 | Memories are sampled from bad experiences with a 1 - self.prob_high 17 | Experiences are sampled from a maximum of 3 randomly selected episodes, 18 | per minibatch for each of the high and low memories 19 | TODO improvement: do a more natural continuous range to sort high low 20 | by self.epi_memory.sort(key=lambda epi_exp: epi_exp['total_rewards']) 21 | ''' 22 | 23 | # absorb generic param without breaking 24 | def __init__(self, env_spec, **kwargs): 25 | super(HighLowMemory, self).__init__(env_spec) 26 | # use the old self.exp as buffer, remember to clear 27 | self.last_exp = self.exp 28 | self.epi_memory_high = [] 29 | self.epi_memory_low = [] 30 | self.max_reward = -math.inf 31 | self.min_reward = math.inf 32 | # 1st 5 epis goes into bad half, recompute every 5 epis 33 | self.threshold = math.inf 34 | self.threshold_history = [] 35 | self.epi_num = 0 36 | self.prob_high = 0.66 37 | self.num_epis_to_sample = 3 38 | self.max_epis_in_mem = 15 39 | self.recompute_freq = 10 40 | log_self(self) 41 | 42 | def reassign_episodes(self): 43 | new_high, new_low = [] 44 | 45 | for mem in (self.epi_memory_high, self.epi_memory_low): 46 | for epi_exp in mem: 47 | if (epi_exp['total_rewards'] > self.threshold): 48 | new_high.append(epi_exp) 49 | else: 50 | new_low.append(epi_exp) 51 | 52 | self.epi_memory_high = new_high 53 | self.epi_memory_low = new_low 54 | 55 | def compute_threshold(self): 56 | self.threshold_history.append([self.threshold, 57 | self.max_reward, 58 | self.min_reward]) 59 | if (len(self.threshold_history) > 1): 60 | # Scaled because this threshold seems too severe based on trial 61 | # runs 62 | self.threshold = \ 63 | max(self.threshold, 64 | (self.max_reward + self.min_reward) / 2.0 * 0.75) 65 | else: 66 | self.threshold = (self.max_reward + self.min_reward) / 2.0 * 0.75 67 | self.reassign_episodes() 68 | self.max_reward = -math.inf 69 | self.min_reward = math.inf 70 | 71 | def add_exp(self, action, reward, next_state, terminal): 72 | super(HighLowMemory, self).add_exp( 73 | action, reward, next_state, terminal) 74 | if terminal: 75 | epi_exp = { 76 | 'exp': self.exp, 77 | 'total_rewards': np.sum(self.exp['rewards']), 78 | 'epi_num': self.epi_num 79 | } 80 | if (epi_exp['total_rewards'] <= self.threshold): 81 | self.epi_memory_low.append(epi_exp) 82 | else: 83 | self.epi_memory_high.append(epi_exp) 84 | if (self.epi_num > 0 and self.epi_num % self.recompute_freq == 0): 85 | self.compute_threshold() 86 | if (epi_exp['total_rewards'] > self.max_reward): 87 | self.max_reward = epi_exp['total_rewards'] 88 | if (epi_exp['total_rewards'] < 
self.min_reward): 89 | self.min_reward = epi_exp['total_rewards'] 90 | self.last_exp = self.exp 91 | self.exp = {k: [] for k in self.exp_keys} 92 | self.epi_num += 1 93 | # print("THRESHOLD HISTORY") 94 | # print(self.threshold_history) 95 | # print("HIGH MEM") 96 | # for epi in self.epi_memory_high: 97 | # print(str(epi['total_rewards'])+ " ,", end=" ") 98 | # print() 99 | # print("LOW MEM") 100 | # for epi in self.epi_memory_low: 101 | # print(str(epi['total_rewards'] )+ " ,", end=" ") 102 | # print() 103 | 104 | def pop(self): 105 | '''convenient method to get exp at [last_ind]''' 106 | buffer_exp = self.exp # store for restore later 107 | self.exp = self.last_exp 108 | res = super(HighLowMemory, self).pop() 109 | self.exp = buffer_exp 110 | return res 111 | 112 | def rand_minibatch(self, size): 113 | # base case, early exit 114 | high_samples = np.int(np.ceil(size * self.prob_high)) 115 | low_samples = size - high_samples 116 | 117 | if (len(self.epi_memory_high) == 0 and 118 | len(self.epi_memory_low) == 0): 119 | return super(HighLowMemory, self).rand_minibatch(size) 120 | 121 | if (len(self.epi_memory_high) == 0): 122 | high_samples = 0 123 | low_samples = size 124 | 125 | high_samples_per_epi = np.int( 126 | np.ceil(high_samples / self.num_epis_to_sample)) 127 | low_samples_per_epi = np.int( 128 | np.ceil(low_samples / self.num_epis_to_sample)) 129 | 130 | buffer_exp = self.exp 131 | minibatch_as_list = [] 132 | if high_samples > 0: 133 | for _i in range(4): 134 | idx = np.random.randint(0, len(self.epi_memory_high)) 135 | epi_exp = self.epi_memory_high[idx]['exp'] 136 | self.exp = epi_exp 137 | epi_minibatch = super(HighLowMemory, self).rand_minibatch( 138 | high_samples_per_epi) 139 | minibatch_as_list.append(epi_minibatch) 140 | 141 | if low_samples > 0: 142 | for _i in range(4): 143 | idx = np.random.randint(0, len(self.epi_memory_low)) 144 | epi_exp = self.epi_memory_low[idx]['exp'] 145 | self.exp = epi_exp 146 | epi_minibatch = super(HighLowMemory, self).rand_minibatch( 147 | low_samples_per_epi) 148 | minibatch_as_list.append(epi_minibatch) 149 | 150 | # set buffer back to original 151 | self.exp = buffer_exp 152 | 153 | # merge all minibatches from best_epi_memory into a minibatch 154 | minibatch = {} 155 | for k in self.exp_keys: 156 | k_exp = np.concatenate( 157 | [epi_exp[k] for epi_exp in minibatch_as_list] 158 | )[-size:] 159 | minibatch[k] = k_exp 160 | assert len( 161 | minibatch['rewards']) == size, 'minibatch has the wrong size' 162 | 163 | return minibatch 164 | 165 | def update(self, updates): 166 | pass 167 | 168 | 169 | class HighLowMemoryWithForgetting(HighLowMemory): 170 | 171 | ''' 172 | Like HighLowMemory but also has forgetting capability 173 | Controlled by max_epis_in_mem param 174 | ''' 175 | 176 | # absorb generic param without breaking 177 | def __init__(self, env_spec, **kwargs): 178 | super(HighLowMemoryWithForgetting, self).__init__(env_spec) 179 | self.max_epis_in_mem = 250 180 | log_self(self) 181 | 182 | def reassign_episodes(self): 183 | new_high, new_low = [] 184 | 185 | for mem in (self.epi_memory_high, self.epi_memory_low): 186 | for epi_exp in mem: 187 | if (self.epi_num - epi_exp['epi_num'] <= self.max_epis_in_mem): 188 | if (epi_exp['total_rewards'] > self.threshold): 189 | new_high.append(epi_exp) 190 | else: 191 | new_low.append(epi_exp) 192 | 193 | self.epi_memory_high = new_high 194 | self.epi_memory_low = new_low 195 | -------------------------------------------------------------------------------- /rl/model/.gitkeep: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/kengz/openai_lab/d0669d89268f2dc01c1cf878e4879775c7b6eb3c/rl/model/.gitkeep -------------------------------------------------------------------------------- /rl/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/optimizer/adam.py: -------------------------------------------------------------------------------- 1 | from rl.optimizer.base_optimizer import Optimizer 2 | 3 | 4 | class AdamOptimizer(Optimizer): 5 | 6 | ''' 7 | Adam optimizer 8 | Potential param: 9 | lr (learning rate) 10 | beta_1 11 | beta_2 12 | epsilon 13 | decay 14 | Suggested to leave at default param with the expected of lr 15 | ''' 16 | 17 | def __init__(self, **kwargs): 18 | from keras.optimizers import Adam 19 | self.Adam = Adam 20 | 21 | self.optim_param_keys = ['lr', 'beta_1', 'beta_2', 'epsilon', 'decay'] 22 | super(AdamOptimizer, self).__init__(**kwargs) 23 | 24 | def init_optimizer(self): 25 | self.keras_optimizer = self.Adam(**self.optim_param) 26 | -------------------------------------------------------------------------------- /rl/optimizer/base_optimizer.py: -------------------------------------------------------------------------------- 1 | from rl.util import log_self, logger 2 | 3 | 4 | class Optimizer(object): 5 | 6 | ''' 7 | The base class of Optimizer, with the core methods 8 | ''' 9 | 10 | def __init__(self, **kwargs): 11 | '''Construct externally, and set at Agent.compile()''' 12 | self.agent = None 13 | self.keras_optimizer = None 14 | self.optim_param = {} 15 | self.update_optim_param(**kwargs) 16 | self.init_optimizer() 17 | log_self(self) 18 | 19 | def update_optim_param(self, **kwargs): 20 | o_param = { 21 | k: kwargs.get(k) for k in self.optim_param_keys 22 | if kwargs.get(k) is not None} 23 | self.optim_param.update(o_param) 24 | 25 | def init_optimizer(self): 26 | raise NotImplementedError() 27 | 28 | def change_optim_param(self, **new_param): 29 | self.update_optim_param(**new_param) 30 | self.init_optimizer() 31 | logger.info("Optimizer param changed") 32 | log_self(self) 33 | -------------------------------------------------------------------------------- /rl/optimizer/rmsprop.py: -------------------------------------------------------------------------------- 1 | from rl.optimizer.base_optimizer import Optimizer 2 | 3 | 4 | class RMSpropOptimizer(Optimizer): 5 | 6 | ''' 7 | RMS prop 8 | Potential param: 9 | lr (learning rate) 10 | rho 11 | decay 12 | epsilon 13 | ''' 14 | 15 | def __init__(self, **kwargs): 16 | from keras.optimizers import RMSprop 17 | self.RMSprop = RMSprop 18 | 19 | self.optim_param_keys = ['lr', 'rho', 'decay', 'epsilon'] 20 | super(RMSpropOptimizer, self).__init__(**kwargs) 21 | 22 | def init_optimizer(self): 23 | self.keras_optimizer = self.RMSprop(**self.optim_param) 24 | -------------------------------------------------------------------------------- /rl/optimizer/sgd.py: -------------------------------------------------------------------------------- 1 | from rl.optimizer.base_optimizer import Optimizer 2 | 3 | 4 | class SGDOptimizer(Optimizer): 5 | 6 | ''' 7 | Stochastic gradient descent 8 | Potential param: 9 | lr (learning rate) 10 | momentum 11 | decay 12 | nesterov 13 | ''' 14 | 15 | 
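# For illustration only: update_optim_param() in base_optimizer.py keeps just
# the keys listed in optim_param_keys and drops None values, so a hypothetical
# SGDOptimizer(lr=0.01, momentum=0.9, gamma=0.99) would end up calling
# keras.optimizers.SGD(lr=0.01, momentum=0.9) and silently ignore gamma.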
def __init__(self, **kwargs): 16 | from keras.optimizers import SGD 17 | self.SGD = SGD 18 | 19 | self.optim_param_keys = ['lr', 'momentum', 'decay', 'nesterov'] 20 | super(SGDOptimizer, self).__init__(**kwargs) 21 | 22 | def init_optimizer(self): 23 | self.keras_optimizer = self.SGD(**self.optim_param) 24 | -------------------------------------------------------------------------------- /rl/policy/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | -------------------------------------------------------------------------------- /rl/policy/actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.policy.base_policy import Policy 3 | from rl.util import log_self 4 | 5 | 6 | class ArgmaxPolicy(Policy): 7 | 8 | ''' 9 | The argmax policy for actor critic agents 10 | Agent takes the action with the highest 11 | action score 12 | ''' 13 | 14 | def __init__(self, env_spec, 15 | **kwargs): # absorb generic param without breaking 16 | super(ArgmaxPolicy, self).__init__(env_spec) 17 | log_self(self) 18 | 19 | def select_action(self, state): 20 | agent = self.agent 21 | state = np.expand_dims(state, axis=0) 22 | A_score = agent.actor.predict(state)[0] # extract from batch predict 23 | assert A_score.ndim == 1 24 | action = np.argmax(A_score) 25 | return action 26 | 27 | def update(self, sys_vars): 28 | pass 29 | 30 | 31 | class SoftmaxPolicy(Policy): 32 | 33 | ''' 34 | The softmax policy for actor critic agents 35 | Action is drawn from the prob dist generated 36 | by softmax(acion_scores) 37 | ''' 38 | 39 | def __init__(self, env_spec, 40 | **kwargs): # absorb generic param without breaking 41 | super(SoftmaxPolicy, self).__init__(env_spec) 42 | self.clip_val = 500. 43 | log_self(self) 44 | 45 | def select_action(self, state): 46 | agent = self.agent 47 | state = np.expand_dims(state, axis=0) 48 | A_score = agent.actor.predict(state)[0] # extract from batch predict 49 | assert A_score.ndim == 1 50 | A_score = A_score.astype('float64') # fix precision overflow 51 | exp_values = np.exp( 52 | np.clip(A_score, -self.clip_val, self.clip_val)) 53 | assert np.isfinite(exp_values).all() 54 | probs = np.array(exp_values / np.sum(exp_values)) 55 | probs /= probs.sum() # renormalize to prevent floating pt error 56 | action = np.random.choice(agent.env_spec['actions'], p=probs) 57 | return action 58 | 59 | def update(self, sys_vars): 60 | pass 61 | 62 | 63 | class GaussianPolicy(Policy): 64 | 65 | ''' 66 | Continuous policy for actor critic models 67 | Output of the actor network is the mean action 68 | along each dimension. 
Action chosen is the mean 69 | plus Gaussian noise whose scale (std dev) is self.variance 70 | ''' 71 | 72 | def __init__(self, env_spec, 73 | variance=1.0, 74 | **kwargs): # absorb generic param without breaking 75 | super(GaussianPolicy, self).__init__(env_spec) 76 | self.variance = variance 77 | log_self(self) 78 | 79 | def select_action(self, state): 80 | agent = self.agent 81 | state = np.expand_dims(state, axis=0) 82 | a_mean = agent.actor.predict(state)[0] # extract from batch predict 83 | action = a_mean + np.random.normal( 84 | loc=0.0, scale=self.variance, size=a_mean.shape) 85 | action = np.clip(action, 86 | self.env_spec['action_bound_low'], 87 | self.env_spec['action_bound_high']) 88 | return action 89 | 90 | def update(self, sys_vars): 91 | pass 92 | 93 | 94 | class BoundedPolicy(Policy): 95 | 96 | ''' 97 | The bounded policy for actor critic agents 98 | and continuous, bounded action spaces 99 | Action bounded above and below by 100 | - action_bound, + action_bound 101 | ''' 102 | 103 | def __init__(self, env_spec, 104 | **kwargs): # absorb generic param without breaking 105 | super(BoundedPolicy, self).__init__(env_spec) 106 | self.action_bound = env_spec['action_bound_high'] 107 | assert env_spec['action_bound_high'] == -env_spec['action_bound_low'] 108 | log_self(self) 109 | 110 | def select_action(self, state): 111 | agent = self.agent 112 | state = np.expand_dims(state, axis=0) 113 | A_score = agent.actor.predict(state)[0] # extract from batch predict 114 | action = np.tanh(A_score) * self.action_bound 115 | return action 116 | 117 | def update(self, sys_vars): 118 | pass 119 | -------------------------------------------------------------------------------- /rl/policy/base_policy.py: -------------------------------------------------------------------------------- 1 | class Policy(object): 2 | 3 | ''' 4 | The base class of Policy, with the core methods 5 | ''' 6 | 7 | def __init__(self, env_spec, 8 | **kwargs): # absorb generic param without breaking 9 | '''Construct externally, and set at Agent.compile()''' 10 | self.env_spec = env_spec 11 | self.agent = None 12 | 13 | def select_action(self, state): 14 | raise NotImplementedError() 15 | 16 | def update(self, sys_vars): 17 | raise NotImplementedError() 18 | -------------------------------------------------------------------------------- /rl/policy/boltzmann.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.policy.base_policy import Policy 3 | from rl.util import log_self 4 | 5 | 6 | class BoltzmannPolicy(Policy): 7 | 8 | ''' 9 | The Boltzmann policy, where the prob dist for selection is 10 | p(a) = exp(Q[a]/tau) / sum_a'(exp(Q[a']/tau)) 11 | ''' 12 | 13 | def __init__(self, env_spec, 14 | init_tau=5., final_tau=0.5, exploration_anneal_episodes=20, 15 | **kwargs): # absorb generic param without breaking 16 | super(BoltzmannPolicy, self).__init__(env_spec) 17 | self.init_tau = init_tau 18 | self.final_tau = final_tau 19 | self.tau = self.init_tau 20 | self.exploration_anneal_episodes = exploration_anneal_episodes 21 | self.clip_val = 500.
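# descriptive note: np.exp overflows float64 once its argument exceeds
# np.log(np.finfo(np.float64).max), roughly 709.78, so select_action clips
# Q/tau to [-clip_val, clip_val] before exponentiating, keeping the softmax
# weights finite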
22 | log_self(self) 23 | 24 | def select_action(self, state): 25 | agent = self.agent 26 | state = np.expand_dims(state, axis=0) 27 | Q_state = agent.model.predict(state)[0] # extract from batch predict 28 | assert Q_state.ndim == 1 29 | Q_state = Q_state.astype('float64') # fix precision overflow 30 | exp_values = np.exp( 31 | np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) 32 | assert np.isfinite(exp_values).all() 33 | probs = np.array(exp_values / np.sum(exp_values)) 34 | probs /= probs.sum() # renormalize to prevent floating pt error 35 | action = np.random.choice(agent.env_spec['actions'], p=probs) 36 | return action 37 | 38 | def update(self, sys_vars): 39 | '''strategy to update tau in agent''' 40 | epi = sys_vars['epi'] 41 | rise = self.final_tau - self.init_tau 42 | slope = rise / float(self.exploration_anneal_episodes) 43 | self.tau = max(slope * epi + self.init_tau, self.final_tau) 44 | return self.tau 45 | 46 | 47 | class DoubleDQNBoltzmannPolicy(BoltzmannPolicy): 48 | 49 | ''' 50 | Same as the Boltzmann policy but for a Double DQN agent 51 | ''' 52 | 53 | def __init__(self, env_spec, 54 | init_tau=5., final_tau=0.5, exploration_anneal_episodes=20, 55 | **kwargs): # absorb generic param without breaking 56 | super(DoubleDQNBoltzmannPolicy, self).__init__( 57 | env_spec, init_tau, final_tau, 58 | exploration_anneal_episodes) 59 | 60 | def select_action(self, state): 61 | agent = self.agent 62 | state = np.expand_dims(state, axis=0) 63 | # extract from batch predict 64 | Q_state1 = agent.model.predict(state)[0] 65 | Q_state2 = agent.model_2.predict(state)[0] 66 | Q_state = Q_state1 + Q_state2 67 | assert Q_state.ndim == 1 68 | Q_state = Q_state.astype('float64') # fix precision overflow 69 | exp_values = np.exp( 70 | np.clip(Q_state / self.tau, -self.clip_val, self.clip_val)) 71 | assert np.isfinite(exp_values).all() 72 | probs = np.array(exp_values / np.sum(exp_values)) 73 | probs /= probs.sum() # renormalize to prevent floating pt error 74 | action = np.random.choice(agent.env_spec['actions'], p=probs) 75 | return action 76 | -------------------------------------------------------------------------------- /rl/policy/epsilon_greedy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.policy.base_policy import Policy 3 | from rl.util import log_self 4 | 5 | 6 | class EpsilonGreedyPolicy(Policy): 7 | 8 | ''' 9 | The Epsilon-greedy policy 10 | ''' 11 | 12 | def __init__(self, env_spec, 13 | init_e=1.0, final_e=0.1, exploration_anneal_episodes=30, 14 | **kwargs): # absorb generic param without breaking 15 | super(EpsilonGreedyPolicy, self).__init__(env_spec) 16 | self.init_e = init_e 17 | self.final_e = final_e 18 | self.e = self.init_e 19 | self.exploration_anneal_episodes = exploration_anneal_episodes 20 | log_self(self) 21 | 22 | def select_action(self, state): 23 | '''epsilon-greedy method''' 24 | agent = self.agent 25 | if self.e > np.random.rand(): 26 | action = np.random.choice(agent.env_spec['actions']) 27 | else: 28 | state = np.expand_dims(state, axis=0) 29 | # extract from batch predict 30 | Q_state = agent.model.predict(state)[0] 31 | assert Q_state.ndim == 1 32 | action = np.argmax(Q_state) 33 | return action 34 | 35 | def update(self, sys_vars): 36 | '''strategy to update epsilon in agent''' 37 | epi = sys_vars['epi'] 38 | rise = self.final_e - self.init_e 39 | slope = rise / float(self.exploration_anneal_episodes) 40 | self.e = max(slope * epi + self.init_e, self.final_e) 41 | return self.e 
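# Worked example of the linear anneal above, using the default values
# init_e=1.0, final_e=0.1, exploration_anneal_episodes=30: the slope is
# (0.1 - 1.0) / 30 = -0.03, so e = 1.0 at epi 0, 0.7 at epi 10, 0.1 at epi 30,
# and max(slope * epi + init_e, final_e) keeps it floored at 0.1 afterwards.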
42 | 43 | 44 | class DoubleDQNEpsilonGreedyPolicy(EpsilonGreedyPolicy): 45 | 46 | ''' 47 | Policy to accompany double dqn agents 48 | When actions are not random this policy 49 | selects actions by symming the outputs from 50 | each of the two Q-state approximators 51 | before taking the max of the result 52 | ''' 53 | 54 | def __init__(self, env_spec, 55 | init_e=1.0, final_e=0.1, exploration_anneal_episodes=30, 56 | **kwargs): # absorb generic param without breaking 57 | super(DoubleDQNEpsilonGreedyPolicy, self).__init__( 58 | env_spec, init_e, final_e, 59 | exploration_anneal_episodes) 60 | 61 | def select_action(self, state): 62 | '''epsilon-greedy method''' 63 | agent = self.agent 64 | if self.e > np.random.rand(): 65 | action = np.random.choice(agent.env_spec['actions']) 66 | else: 67 | state = np.expand_dims(state, axis=0) 68 | # extract from batch predict 69 | Q_state1 = agent.model.predict(state)[0] 70 | Q_state2 = agent.model_2.predict(state)[0] 71 | Q_state = Q_state1 + Q_state2 72 | assert Q_state.ndim == 1 73 | action = np.argmax(Q_state) 74 | return action 75 | 76 | 77 | class DecayingEpsilonGreedyPolicy(EpsilonGreedyPolicy): 78 | 79 | ''' 80 | Inspired by alvacarce's solution to mountain car 81 | https://gym.openai.com/evaluations/eval_t3GN2Xb0R5KpyjkJUGsLw 82 | ''' 83 | 84 | def __init__(self, env_spec, 85 | init_e=1.0, final_e=0.1, exploration_anneal_episodes=30, 86 | **kwargs): # absorb generic param without breaking 87 | super(DecayingEpsilonGreedyPolicy, self).__init__( 88 | env_spec, init_e, final_e, exploration_anneal_episodes) 89 | self.e_decay = 0.9997 90 | 91 | def update(self, sys_vars): 92 | _epi = sys_vars['epi'] 93 | if self.e > self.final_e: 94 | self.e = self.e * self.e_decay 95 | return self.e 96 | 97 | 98 | class OscillatingEpsilonGreedyPolicy(EpsilonGreedyPolicy): 99 | 100 | ''' 101 | The epsilon-greedy policy with oscillating epsilon 102 | periodically agent.e will drop to a fraction of 103 | the current exploration rate 104 | ''' 105 | 106 | def update(self, sys_vars): 107 | '''strategy to update epsilon in agent''' 108 | super(OscillatingEpsilonGreedyPolicy, self).update( 109 | sys_vars) 110 | epi = sys_vars['epi'] 111 | if not (epi % 3) and epi > 15: 112 | # drop to 1/3 of the current exploration rate 113 | self.e = max(self.e/3., self.final_e) 114 | return self.e 115 | 116 | 117 | class TargetedEpsilonGreedyPolicy(EpsilonGreedyPolicy): 118 | 119 | ''' 120 | switch between active and inactive exploration cycles by 121 | partial mean rewards and its distance to the target mean rewards 122 | ''' 123 | 124 | def update(self, sys_vars): 125 | '''strategy to update epsilon in agent''' 126 | epi = sys_vars['epi'] 127 | assert sys_vars['SOLVED_MEAN_REWARD'] is not None, \ 128 | 'this policy needs an explicit target SOLVED_MEAN_REWARD' 129 | SOLVED_MEAN_REWARD = sys_vars['SOLVED_MEAN_REWARD'] 130 | REWARD_MEAN_LEN = sys_vars['REWARD_MEAN_LEN'] 131 | PARTIAL_MEAN_LEN = int(REWARD_MEAN_LEN * 0.20) 132 | if epi < 1: # corner case when no total_rewards_history to avg 133 | return 134 | # the partial mean for projection the entire mean 135 | partial_mean_reward = np.mean( 136 | sys_vars['total_rewards_history'][-PARTIAL_MEAN_LEN:]) 137 | # difference to target, and its ratio (1 if denominator is 0) 138 | min_reward = np.amin(sys_vars['total_rewards_history']) 139 | projection_gap = SOLVED_MEAN_REWARD - partial_mean_reward 140 | worst_gap = SOLVED_MEAN_REWARD - min_reward 141 | gap_ratio = projection_gap / worst_gap 142 | envelope = self.init_e + (self.final_e - 
self.init_e) / 2. * \ 143 | (float(epi)/float(self.exploration_anneal_episodes)) 144 | pessimistic_gap_ratio = envelope * min(2 * gap_ratio, 1) 145 | # if is in odd cycle, and diff is still big, actively explore 146 | active_exploration_cycle = not bool( 147 | int(epi/PARTIAL_MEAN_LEN) % 2) and ( 148 | projection_gap > abs(SOLVED_MEAN_REWARD * 0.05)) 149 | self.e = max(pessimistic_gap_ratio * self.init_e, self.final_e) 150 | 151 | if not active_exploration_cycle: 152 | self.e = max(self.e/2., self.final_e) 153 | return self.e 154 | -------------------------------------------------------------------------------- /rl/policy/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.util import log_self 3 | from rl.policy.base_policy import Policy 4 | from rl.policy.epsilon_greedy import EpsilonGreedyPolicy 5 | 6 | 7 | class NoNoisePolicy(Policy): 8 | 9 | ''' 10 | The base class for noise policy for DDPG 11 | default is no noise 12 | ''' 13 | 14 | def __init__(self, env_spec, 15 | **kwargs): # absorb generic param without breaking 16 | super(NoNoisePolicy, self).__init__(env_spec) 17 | log_self(self) 18 | 19 | def sample(self): 20 | '''implement noise here, default is none''' 21 | assert 'actions' in self.env_spec 22 | return 0 23 | 24 | def select_action(self, state): 25 | agent = self.agent 26 | state = np.expand_dims(state, axis=0) 27 | if self.env_spec['actions'] == 'continuous': 28 | action = agent.actor.predict(state)[0] + self.sample() 29 | action = np.clip(action, 30 | self.env_spec['action_bound_low'], 31 | self.env_spec['action_bound_high']) 32 | else: 33 | Q_state = agent.actor.predict(state)[0] 34 | assert Q_state.ndim == 1 35 | action = np.argmax(Q_state) 36 | return action 37 | 38 | def update(self, sys_vars): 39 | pass 40 | 41 | 42 | class LinearNoisePolicy(NoNoisePolicy): 43 | 44 | ''' 45 | policy with linearly decaying noise (1. / (1. + self.epi)) 46 | ''' 47 | 48 | def __init__(self, env_spec, exploration_anneal_episodes=20, 49 | **kwargs): # absorb generic param without breaking 50 | super(LinearNoisePolicy, self).__init__(env_spec) 51 | self.exploration_anneal_episodes = exploration_anneal_episodes 52 | self.n_step = 0 # init 53 | log_self(self) 54 | 55 | def sample(self): 56 | noise = (1. / (1. 
+ self.n_step)) 57 | return noise 58 | 59 | def update(self, sys_vars): 60 | epi = sys_vars['epi'] 61 | if epi >= self.exploration_anneal_episodes: 62 | self.n_step = np.inf # noise divide to zero 63 | else: 64 | self.n_step = sys_vars['epi'] 65 | 66 | 67 | class EpsilonGreedyNoisePolicy(EpsilonGreedyPolicy, NoNoisePolicy): 68 | 69 | ''' 70 | akin to epsilon greedy decay, 71 | but return random sample instead 72 | ''' 73 | 74 | def sample(self): 75 | if self.e > np.random.rand(): 76 | noise = np.random.uniform( 77 | 0.5 * self.env_spec['action_bound_low'], 78 | 0.5 * self.env_spec['action_bound_high']) 79 | else: 80 | noise = 0 81 | return noise 82 | 83 | def select_action(self, state): 84 | return NoNoisePolicy.select_action(self, state) 85 | 86 | 87 | class AnnealedGaussianPolicy(LinearNoisePolicy): 88 | 89 | ''' 90 | Base class of random noise policy for DDPG 91 | Adopted from 92 | https://github.com/matthiasplappert/keras-rl/blob/master/rl/random.py 93 | ''' 94 | 95 | def __init__(self, env_spec, exploration_anneal_episodes, 96 | mu, sigma, sigma_min, 97 | **kwargs): # absorb generic param without breaking 98 | super(AnnealedGaussianPolicy, self).__init__( 99 | env_spec, exploration_anneal_episodes) 100 | self.size = env_spec['action_dim'] 101 | self.mu = mu 102 | self.sigma = sigma 103 | 104 | if sigma_min is not None: 105 | self.m = -(sigma - sigma_min) / self.exploration_anneal_episodes 106 | self.c = sigma 107 | self.sigma_min = sigma_min 108 | else: 109 | self.m = 0. 110 | self.c = sigma 111 | self.sigma_min = sigma 112 | 113 | @property 114 | def current_sigma(self): 115 | sigma = max(self.sigma_min, self.m * self.n_step + self.c) 116 | return sigma 117 | 118 | 119 | class GaussianWhiteNoisePolicy(AnnealedGaussianPolicy): 120 | 121 | def __init__(self, env_spec, exploration_anneal_episodes=20, 122 | mu=0., sigma=.3, sigma_min=None, 123 | **kwargs): # absorb generic param without breaking 124 | super(GaussianWhiteNoisePolicy, self).__init__( 125 | env_spec, exploration_anneal_episodes, 126 | mu, sigma, sigma_min) 127 | 128 | def sample(self): 129 | sample = np.random.normal(self.mu, self.current_sigma, self.size) 130 | return sample 131 | 132 | 133 | class OUNoisePolicy(AnnealedGaussianPolicy): 134 | 135 | ''' 136 | Based on 137 | http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 138 | ''' 139 | 140 | def __init__(self, env_spec, exploration_anneal_episodes=20, 141 | theta=.15, mu=0., sigma=.3, dt=1e-2, x0=None, sigma_min=None, 142 | **kwargs): # absorb generic param without breaking 143 | super(OUNoisePolicy, self).__init__( 144 | env_spec, exploration_anneal_episodes, 145 | mu, sigma, sigma_min, 146 | **kwargs) 147 | self.theta = theta 148 | self.mu = mu 149 | self.dt = dt 150 | self.x0 = x0 151 | self.reset_states() 152 | 153 | def reset_states(self): 154 | self.x_prev = self.x0 if self.x0 is not None else np.zeros(self.size) 155 | 156 | def sample(self): 157 | x = self.x_prev + self.theta * \ 158 | (self.mu - self.x_prev) * self.dt + self.current_sigma * \ 159 | np.sqrt(self.dt) * np.random.normal(size=self.size) 160 | self.x_prev = x 161 | return x 162 | -------------------------------------------------------------------------------- /rl/preprocessor/__init__.py: -------------------------------------------------------------------------------- 1 | from rl.util import import_package_files 2 | 3 | __all__ = ['__all__'] + import_package_files(globals(), locals(), __file__) 4 | 
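For reference, OUNoisePolicy.sample in rl/policy/noise.py above implements the Euler-Maruyama step of an Ornstein-Uhlenbeck process, x_next = x + theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I). A minimal standalone sketch of that update, using the class defaults and an assumed 2-dimensional action space:

import numpy as np

theta, mu, sigma, dt = 0.15, 0.0, 0.3, 1e-2  # defaults from OUNoisePolicy
x = np.zeros(2)  # x0=None resets to zeros; an action_dim of 2 is assumed here
for _ in range(5):
    x = x + theta * (mu - x) * dt + sigma * np.sqrt(dt) * np.random.normal(size=x.shape)
    # the theta * (mu - x) * dt term pulls x back toward mu, so successive noise
    # samples are temporally correlated rather than independent white noise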
-------------------------------------------------------------------------------- /rl/preprocessor/atari.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy as sp 3 | from rl.preprocessor.base_preprocessor import PreProcessor 4 | 5 | 6 | # Util functions for state preprocessing 7 | 8 | def resize_image(im): 9 | return sp.misc.imresize(im, (110, 84)) 10 | 11 | 12 | def crop_image(im): 13 | return im[-84:, :] 14 | 15 | 16 | def process_image_atari(im): 17 | ''' 18 | Image preprocessing from the paper 19 | Playing Atari with Deep Reinforcement Learning, 2013 20 | Takes an RGB image and converts it to grayscale, 21 | downsizes to 110 x 84 22 | and crops to square 84 x 84, taking bottomost rows of image 23 | ''' 24 | im_gray = np.dot(im[..., :3], [0.299, 0.587, 0.114]) 25 | im_resized = resize_image(im_gray) 26 | im_cropped = crop_image(im_resized) 27 | return im_cropped 28 | 29 | 30 | class Atari(PreProcessor): 31 | 32 | ''' 33 | Convert images to greyscale, downsize, crop, then stack 4 states 34 | NOTE: Image order is cols * rows * channels to match openai gym format 35 | Input to model is rows * cols * channels (== states) 36 | ''' 37 | 38 | def __init__(self, **kwargs): # absorb generic param without breaking): 39 | super(Atari, self).__init__() 40 | 41 | def preprocess_state(self): 42 | processed_state_queue = ( 43 | process_image_atari(self.state), 44 | process_image_atari(self.previous_state), 45 | process_image_atari(self.pre_previous_state), 46 | process_image_atari(self.pre_pre_previous_state)) 47 | processed_state = np.stack(processed_state_queue, axis=-1) 48 | return processed_state 49 | 50 | def preprocess_memory(self, action, reward, next_state, done): 51 | self.add_raw_exp(action, reward, next_state, done) 52 | if (self.exp_queue_size() < self.MAX_QUEUE_SIZE): # insufficient queue 53 | return 54 | (_state, action, reward, next_state, done) = self.exp_queue[-1] 55 | processed_next_state_queue = ( 56 | process_image_atari(self.exp_queue[-1][3]), 57 | process_image_atari(self.exp_queue[-2][3]), 58 | process_image_atari(self.exp_queue[-3][3]), 59 | process_image_atari(self.exp_queue[-4][3])) 60 | processed_state = self.preprocess_state() 61 | processed_next_state = np.stack(processed_next_state_queue, axis=-1) 62 | self.debug_state(processed_state, processed_next_state) 63 | processed_exp = (action, reward, processed_next_state, done) 64 | return processed_exp 65 | -------------------------------------------------------------------------------- /rl/preprocessor/base_preprocessor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.util import logger, log_self 3 | 4 | 5 | def create_dummy_states(state): 6 | state_shape = state.shape 7 | previous_state = np.zeros(state_shape) 8 | pre_previous_state = np.zeros(state_shape) 9 | pre_pre_previous_state = np.zeros(state_shape) 10 | if (previous_state.ndim == 1): 11 | previous_state = np.zeros([state_shape[0]]) 12 | pre_previous_state = np.zeros([state_shape[0]]) 13 | pre_pre_previous_state = np.zeros([state_shape[0]]) 14 | return (previous_state, pre_previous_state, pre_pre_previous_state) 15 | 16 | 17 | class PreProcessor(object): 18 | 19 | ''' 20 | The Base class for state preprocessing 21 | ''' 22 | 23 | def __init__(self, max_queue_size=4, **kwargs): 24 | '''Construct externally, and set at Agent.compile()''' 25 | self.agent = None 26 | self.state = None 27 | self.exp_queue = [] 28 | 
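# descriptive note: exp_queue buffers the most recent raw
# [state, action, reward, next_state, done] entries; add_raw_exp below caps it
# at MAX_QUEUE_SIZE so subclasses can stack 4 frames (Atari) or pair 2
# consecutive states (StackStates, DiffStates)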
self.MAX_QUEUE_SIZE = max_queue_size 29 | self.never_debugged = True 30 | log_self(self) 31 | 32 | def reset_state(self, init_state): 33 | '''reset the preprocessor state per episode on env.reset()''' 34 | self.state = np.array(init_state) # cast into np for safety 35 | (previous_state, pre_previous_state, 36 | pre_pre_previous_state) = create_dummy_states(self.state) 37 | self.previous_state = previous_state 38 | self.pre_previous_state = pre_previous_state 39 | self.pre_pre_previous_state = pre_pre_previous_state 40 | return self.preprocess_state() 41 | 42 | def exp_queue_size(self): 43 | return len(self.exp_queue) 44 | 45 | def debug_state(self, processed_state, processed_next_state): 46 | if self.never_debugged: 47 | logger.debug("State shape: {}".format(processed_state.shape)) 48 | logger.debug( 49 | "Next state shape: {}".format(processed_next_state.shape)) 50 | self.never_debugged = False 51 | 52 | def preprocess_env_spec(self, env_spec): 53 | '''helper to tweak env_spec according to preprocessor''' 54 | class_name = self.__class__.__name__ 55 | if class_name == 'StackStates': 56 | env_spec['state_dim'] = env_spec['state_dim'] * 2 57 | elif class_name == 'Atari': 58 | env_spec['state_dim'] = (84, 84, 4) 59 | return env_spec 60 | 61 | def preprocess_state(self): 62 | raise NotImplementedError() 63 | 64 | def advance_state(self, next_state): 65 | self.pre_pre_previous_state = self.pre_previous_state 66 | self.pre_previous_state = self.previous_state 67 | self.previous_state = self.state 68 | self.state = next_state 69 | 70 | def add_raw_exp(self, action, reward, next_state, done): 71 | ''' 72 | Buffer holds only the last MAX_QUEUE_SIZE experiences 73 | (4 is the amount needed for Atari games preprocessing) 74 | ''' 75 | self.exp_queue.append([self.state, action, reward, next_state, done]) 76 | if (self.exp_queue_size() > self.MAX_QUEUE_SIZE): 77 | del self.exp_queue[0] 78 | self.advance_state(next_state) 79 | 80 | def preprocess_memory(self, action, reward, next_state, done): 81 | raise NotImplementedError() 82 | -------------------------------------------------------------------------------- /rl/preprocessor/linear.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rl.preprocessor.base_preprocessor import PreProcessor 3 | 4 | 5 | class NoPreProcessor(PreProcessor): 6 | 7 | ''' 8 | Default class, no preprocessing 9 | ''' 10 | 11 | def __init__(self, **kwargs): # absorb generic param without breaking): 12 | super(NoPreProcessor, self).__init__() 13 | 14 | def preprocess_state(self): 15 | return self.state 16 | 17 | def preprocess_memory(self, action, reward, next_state, done): 18 | '''No state processing''' 19 | self.add_raw_exp(action, reward, next_state, done) 20 | (_state, action, reward, next_state, done) = self.exp_queue[-1] 21 | processed_exp = (action, reward, next_state, done) 22 | return processed_exp 23 | 24 | 25 | class StackStates(PreProcessor): 26 | 27 | ''' 28 | Current and last state are concatenated to form input to model 29 | ''' 30 | 31 | def __init__(self, **kwargs): # absorb generic param without breaking): 32 | super(StackStates, self).__init__(max_queue_size=2) 33 | 34 | def preprocess_state(self): 35 | processed_state = np.concatenate([self.previous_state, self.state]) 36 | return processed_state 37 | 38 | def preprocess_memory(self, action, reward, next_state, done): 39 | '''Concatenate: previous + current states''' 40 | self.add_raw_exp(action, reward, next_state, done) 41 | if (self.exp_queue_size() <
self.MAX_QUEUE_SIZE): # insufficient queue 42 | return 43 | (state, action, reward, next_state, done) = self.exp_queue[-1] 44 | processed_state = self.preprocess_state() 45 | processed_next_state = np.concatenate([state, next_state]) 46 | self.debug_state(processed_state, processed_next_state) 47 | processed_exp = (action, reward, processed_next_state, done) 48 | return processed_exp 49 | 50 | 51 | class DiffStates(PreProcessor): 52 | 53 | ''' 54 | Different between current and last state is input to model 55 | ''' 56 | 57 | def __init__(self, **kwargs): # absorb generic param without breaking): 58 | super(DiffStates, self).__init__(max_queue_size=2) 59 | 60 | def preprocess_state(self): 61 | processed_state = self.state - self.previous_state 62 | return processed_state 63 | 64 | def preprocess_memory(self, action, reward, next_state, done): 65 | '''Change in state, curr_state - last_state''' 66 | self.add_raw_exp(action, reward, next_state, done) 67 | if (self.exp_queue_size() < self.MAX_QUEUE_SIZE): # insufficient queue 68 | return 69 | (state, action, reward, next_state, done) = self.exp_queue[-1] 70 | processed_state = self.preprocess_state() 71 | processed_next_state = next_state - state 72 | self.debug_state(processed_state, processed_next_state) 73 | processed_exp = (action, reward, processed_next_state, done) 74 | return processed_exp 75 | -------------------------------------------------------------------------------- /rl/spec/atari_experiment_specs.json: -------------------------------------------------------------------------------- 1 | { 2 | "dev_conv_dqn": { 3 | "problem": "DevBreakout-v0", 4 | "Agent": "ConvDQN", 5 | "HyperOptimizer": "GridSearch", 6 | "Memory": "LinearMemoryWithForgetting", 7 | "Optimizer": "AdamOptimizer", 8 | "Policy": "EpsilonGreedyPolicy", 9 | "PreProcessor": "Atari", 10 | "param": { 11 | "train_per_n_new_exp": 4, 12 | "lr": 0.001, 13 | "batch_size": 32, 14 | "gamma": 0.99, 15 | "hidden_layers": [ 16 | [16, 8, 8, [4, 4]], 17 | [32, 4, 4, [2, 2]] 18 | ], 19 | "hidden_layers_activation": "relu", 20 | "exploration_anneal_episodes": 3000, 21 | "epi_change_lr": 3000, 22 | "auto_architecture": true, 23 | "num_hidden_layers": 3, 24 | "num_initial_channels": 8, 25 | "max_mem_len": 500000 26 | 27 | }, 28 | "param_range": { 29 | "lr": [0.001, 0.0001], 30 | "hidden_layers": [ 31 | [ 32 | [16, 8, 8, [4, 4]], 33 | [32, 4, 4, [2, 2]] 34 | ] 35 | ] 36 | } 37 | }, 38 | "breakout_dqn": { 39 | "problem": "Breakout-v0", 40 | "Agent": "ConvDQN", 41 | "HyperOptimizer": "GridSearch", 42 | "Memory": "LinearMemoryWithForgetting", 43 | "Optimizer": "AdamOptimizer", 44 | "Policy": "EpsilonGreedyPolicy", 45 | "PreProcessor": "Atari", 46 | "param": { 47 | "train_per_n_new_exp": 4, 48 | "batch_size": 32, 49 | "lr": 0.001, 50 | "gamma": 0.99, 51 | "hidden_layers": [ 52 | [16, 8, 8, [4, 4]], 53 | [32, 4, 4, [2, 2]] 54 | ], 55 | "hidden_layers_activation": "relu", 56 | "exploration_anneal_episodes": 3000, 57 | "epi_change_lr": 3000, 58 | "max_mem_len": 500000 59 | }, 60 | "param_range": { 61 | "lr": [0.001, 0.01] 62 | } 63 | }, 64 | "breakout_double_dqn": { 65 | "problem": "Breakout-v0", 66 | "Agent": "DoubleConvDQN", 67 | "HyperOptimizer": "GridSearch", 68 | "Memory": "LinearMemoryWithForgetting", 69 | "Optimizer": "AdamOptimizer", 70 | "Policy": "DoubleDQNEpsilonGreedyPolicy", 71 | "PreProcessor": "Atari", 72 | "param": { 73 | "train_per_n_new_exp": 4, 74 | "batch_size": 32, 75 | "lr": 0.001, 76 | "gamma": 0.99, 77 | "hidden_layers": [ 78 | [16, 8, 8, [4, 4]], 79 | [32, 4, 4, [2, 2]] 
80 | ], 81 | "hidden_layers_activation": "relu", 82 | "exploration_anneal_episodes": 3000, 83 | "epi_change_lr": 3000, 84 | "max_mem_len": 500000 85 | }, 86 | "param_range": { 87 | "lr": [0.001, 0.0001], 88 | "gamma": [0.97, 0.99] 89 | } 90 | }, 91 | "air_raid_dqn": { 92 | "problem": "AirRaid-v0", 93 | "Agent": "ConvDQN", 94 | "HyperOptimizer": "GridSearch", 95 | "Memory": "LinearMemoryWithForgetting", 96 | "Optimizer": "AdamOptimizer", 97 | "Policy": "EpsilonGreedyPolicy", 98 | "PreProcessor": "Atari", 99 | "param": { 100 | "train_per_n_new_exp": 4, 101 | "batch_size": 32, 102 | "lr": 0.001, 103 | "gamma": 0.99, 104 | "hidden_layers": [ 105 | [16, 8, 8, [4, 4]], 106 | [32, 4, 4, [2, 2]] 107 | ], 108 | "hidden_layers_activation": "relu", 109 | "exploration_anneal_episodes": 10000, 110 | "epi_change_lr": 10000, 111 | "max_mem_len": 500000 112 | }, 113 | "param_range": { 114 | "lr": [0.001, 0.0001], 115 | "gamma": [0.97, 0.99] 116 | } 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /rl/spec/box2d_experiment_specs.json: -------------------------------------------------------------------------------- 1 | { 2 | "lunar_dqn": { 3 | "problem": "LunarLander-v2", 4 | "Agent": "DQN", 5 | "HyperOptimizer": "GridSearch", 6 | "Memory": "LinearMemoryWithForgetting", 7 | "Optimizer": "AdamOptimizer", 8 | "Policy": "EpsilonGreedyPolicy", 9 | "PreProcessor": "StackStates", 10 | "param": { 11 | "train_per_n_new_exp": 5, 12 | "batch_size": 32, 13 | "lr": 0.005, 14 | "gamma": 0.99, 15 | "hidden_layers": [400, 200], 16 | "hidden_layers_activation": "sigmoid", 17 | "output_layer_activation": "linear", 18 | "exploration_anneal_episodes": 150, 19 | "epi_change_lr": 200 20 | }, 21 | "param_range": { 22 | "lr": [0.0005, 0.001, 0.005, 0.01, 0.02], 23 | "gamma": [0.95, 0.97, 0.99, 0.999], 24 | "hidden_layers": [ 25 | [400, 200], 26 | [800, 400], 27 | [400, 200, 100], 28 | [400, 200, 100, 50] 29 | ] 30 | } 31 | }, 32 | "rand_lunar_dqn": { 33 | "problem": "LunarLander-v2", 34 | "Agent": "DQN", 35 | "HyperOptimizer": "RandomSearch", 36 | "Memory": "LinearMemoryWithForgetting", 37 | "Optimizer": "AdamOptimizer", 38 | "Policy": "EpsilonGreedyPolicy", 39 | "PreProcessor": "NoPreProcessor", 40 | "param": { 41 | "max_evals": 100, 42 | "train_per_n_new_exp": 5, 43 | "batch_size": 32, 44 | "lr": 0.001, 45 | "gamma": 0.99, 46 | "hidden_layers": [300, 150, 75], 47 | "hidden_layers_activation": "relu", 48 | "output_layer_activation": "linear", 49 | "exploration_anneal_episodes": 150, 50 | "epi_change_lr": 200 51 | }, 52 | "param_range": { 53 | "lr": { 54 | "min": 0.0005, 55 | "max": 0.05 56 | }, 57 | "gamma": { 58 | "min": 0.97, 59 | "max": 0.9999 60 | }, 61 | "hidden_layers": [ 62 | [400, 200], 63 | [800, 400], 64 | [200, 100, 50], 65 | [400, 200, 100], 66 | [400, 200, 100, 50] 67 | ] 68 | } 69 | }, 70 | "lunar_double_dqn": { 71 | "problem": "LunarLander-v2", 72 | "Agent": "DoubleDQN", 73 | "HyperOptimizer": "GridSearch", 74 | "Memory": "LinearMemoryWithForgetting", 75 | "Optimizer": "AdamOptimizer", 76 | "Policy": "DoubleDQNBoltzmannPolicy", 77 | "PreProcessor": "StackStates", 78 | "param": { 79 | "train_per_n_new_exp": 5, 80 | "batch_size": 32, 81 | "lr": 0.005, 82 | "gamma": 0.99, 83 | "hidden_layers": [800, 400], 84 | "hidden_layers_activation": "sigmoid", 85 | "output_layer_activation": "linear", 86 | "exploration_anneal_episodes": 150, 87 | "epi_change_lr": 200 88 | }, 89 | "param_range": { 90 | "lr": [0.0005, 0.001, 0.005, 0.01, 0.02], 91 | "gamma": [0.95, 0.97, 0.99, 
0.999], 92 | "hidden_layers": [ 93 | [400, 200], 94 | [800, 400], 95 | [400, 200, 100], 96 | [400, 200, 100, 50] 97 | ] 98 | } 99 | }, 100 | "lunar_double_dqn_nopreprocess": { 101 | "problem": "LunarLander-v2", 102 | "Agent": "DoubleDQN", 103 | "HyperOptimizer": "GridSearch", 104 | "Memory": "LinearMemoryWithForgetting", 105 | "Optimizer": "AdamOptimizer", 106 | "Policy": "DoubleDQNBoltzmannPolicy", 107 | "PreProcessor": "NoPreProcessor", 108 | "param": { 109 | "train_per_n_new_exp": 5, 110 | "batch_size": 32, 111 | "lr": 0.005, 112 | "gamma": 0.99, 113 | "hidden_layers": [800, 400], 114 | "hidden_layers_activation": "sigmoid", 115 | "output_layer_activation": "linear", 116 | "exploration_anneal_episodes": 150, 117 | "epi_change_lr": 200 118 | }, 119 | "param_range": { 120 | "lr": [0.005, 0.01, 0.02], 121 | "gamma": [0.97, 0.99, 0.999], 122 | "hidden_layers": [ 123 | [400, 200], 124 | [800, 400] 125 | ] 126 | } 127 | }, 128 | "lunar_freeze": { 129 | "problem": "LunarLander-v2", 130 | "Agent": "FreezeDQN", 131 | "HyperOptimizer": "GridSearch", 132 | "Memory": "LinearMemoryWithForgetting", 133 | "Optimizer": "AdamOptimizer", 134 | "Policy": "BoltzmannPolicy", 135 | "PreProcessor": "StackStates", 136 | "param": { 137 | "train_per_n_new_exp": 5, 138 | "batch_size": 32, 139 | "lr": 0.001, 140 | "gamma": 0.99, 141 | "hidden_layers": [300, 150, 75], 142 | "hidden_layers_activation": "relu", 143 | "output_layer_activation": "linear", 144 | "exploration_anneal_episodes": 150, 145 | "epi_change_lr": 200 146 | }, 147 | "param_range": { 148 | "lr": [0.0001, 0.0005, 0.001, 0.005], 149 | "gamma": [0.97, 0.99, 0.999], 150 | "hidden_layers": [ 151 | [200, 100], 152 | [400, 200], 153 | [300, 150, 75], 154 | [400, 200, 100] 155 | ] 156 | } 157 | }, 158 | "lunar_sarsa": { 159 | "problem": "LunarLander-v2", 160 | "Agent": "DeepExpectedSarsa", 161 | "HyperOptimizer": "GridSearch", 162 | "Memory": "LinearMemoryWithForgetting", 163 | "Optimizer": "AdamOptimizer", 164 | "Policy": "EpsilonGreedyPolicy", 165 | "PreProcessor": "StackStates", 166 | "param": { 167 | "train_per_n_new_exp": 1, 168 | "lr": 0.001, 169 | "gamma": 0.99, 170 | "hidden_layers": [300, 150, 75], 171 | "hidden_layers_activation": "relu", 172 | "output_layer_activation": "linear", 173 | "exploration_anneal_episodes": 150, 174 | "epi_change_lr": 200 175 | }, 176 | "param_range": { 177 | "lr": [0.0001, 0.0005, 0.001, 0.005], 178 | "gamma": [0.97, 0.99, 0.999], 179 | "hidden_layers": [ 180 | [200, 100], 181 | [400, 200], 182 | [300, 150, 75], 183 | [400, 200, 100] 184 | ] 185 | } 186 | }, 187 | "lunar_offpol_sarsa": { 188 | "problem": "LunarLander-v2", 189 | "Agent": "OffPolicySarsa", 190 | "HyperOptimizer": "GridSearch", 191 | "Memory": "LinearMemoryWithForgetting", 192 | "Optimizer": "AdamOptimizer", 193 | "Policy": "EpsilonGreedyPolicy", 194 | "PreProcessor": "StackStates", 195 | "param": { 196 | "train_per_n_new_exp": 5, 197 | "batch_size": 32, 198 | "lr": 0.001, 199 | "gamma": 0.99, 200 | "hidden_layers": [800, 400], 201 | "hidden_layers_activation": "sigmoid", 202 | "output_layer_activation": "linear", 203 | "exploration_anneal_episodes": 150, 204 | "epi_change_lr": 200 205 | }, 206 | "param_range": { 207 | "lr": [0.001, 0.005, 0.01], 208 | "gamma": [0.97, 0.99, 0.999], 209 | "hidden_layers": [ 210 | [400, 200], 211 | [800, 400], 212 | [400, 200, 100] 213 | ] 214 | } 215 | }, 216 | "lunar_ac_softmax": { 217 | "problem": "LunarLander-v2", 218 | "Agent": "ActorCritic", 219 | "HyperOptimizer": "GridSearch", 220 | "Memory": 
"LinearMemoryWithForgetting", 221 | "Optimizer": "AdamOptimizer", 222 | "Policy": "SoftmaxPolicy", 223 | "PreProcessor": "NoPreProcessor", 224 | "param": { 225 | "lr": 0.02, 226 | "gamma": 0.99, 227 | "hidden_layers": [64], 228 | "hidden_layers_activation": "sigmoid" 229 | }, 230 | "param_range": { 231 | "lr": [0.001, 0.005, 0.01], 232 | "gamma": [0.99, 0.999], 233 | "hidden_layers": [ 234 | [400, 300], 235 | [800, 400], 236 | [800, 600] 237 | ] 238 | } 239 | }, 240 | "lunar_cont_ddpg_linearnoise": { 241 | "problem": "LunarLanderContinuous-v2", 242 | "Agent": "DDPG", 243 | "HyperOptimizer": "GridSearch", 244 | "Memory": "LinearMemoryWithForgetting", 245 | "Optimizer": "AdamOptimizer", 246 | "Policy": "LinearNoisePolicy", 247 | "PreProcessor": "NoPreProcessor", 248 | "param": { 249 | "batch_size": 64, 250 | "n_epoch": 1, 251 | "tau": 0.005, 252 | "lr": 0.001, 253 | "critic_lr": 0.001, 254 | "exploration_anneal_episodes": 100, 255 | "gamma": 0.99, 256 | "hidden_layers": [600, 300], 257 | "hidden_layers_activation": "relu", 258 | "output_layer_activation": "tanh" 259 | }, 260 | "param_range": { 261 | "lr": [0.0001, 0.0005, 0.001], 262 | "critic_lr": [0.001, 0.005, 0.01], 263 | "gamma": [0.97, 0.99, 0.999], 264 | "hidden_layers": [ 265 | [400, 300], 266 | [600, 300], 267 | [800, 400, 200] 268 | ] 269 | } 270 | }, 271 | "lunar_cont_ddpg_per_linearnoise": { 272 | "problem": "LunarLanderContinuous-v2", 273 | "Agent": "DDPG", 274 | "HyperOptimizer": "GridSearch", 275 | "Memory": "PrioritizedExperienceReplay", 276 | "Optimizer": "AdamOptimizer", 277 | "Policy": "LinearNoisePolicy", 278 | "PreProcessor": "NoPreProcessor", 279 | "param": { 280 | "batch_size": 64, 281 | "n_epoch": 1, 282 | "tau": 0.005, 283 | "lr": 0.001, 284 | "critic_lr": 0.001, 285 | "exploration_anneal_episodes": 100, 286 | "gamma": 0.97, 287 | "hidden_layers": [400, 300], 288 | "hidden_layers_activation": "relu", 289 | "output_layer_activation": "tanh" 290 | }, 291 | "param_range": { 292 | "lr": [0.0001, 0.0005, 0.001], 293 | "critic_lr": [0.001, 0.005, 0.01], 294 | "gamma": [0.97, 0.99, 0.999], 295 | "hidden_layers": [ 296 | [400, 300], 297 | [600, 300], 298 | [800, 400, 200] 299 | ] 300 | } 301 | }, 302 | "walker_ddpg_linearnoise": { 303 | "problem": "BipedalWalker-v2", 304 | "Agent": "DDPG", 305 | "HyperOptimizer": "GridSearch", 306 | "Memory": "LinearMemoryWithForgetting", 307 | "Optimizer": "AdamOptimizer", 308 | "Policy": "LinearNoisePolicy", 309 | "PreProcessor": "NoPreProcessor", 310 | "param": { 311 | "batch_size": 64, 312 | "n_epoch": 1, 313 | "tau": 0.005, 314 | "lr": 0.001, 315 | "critic_lr": 0.001, 316 | "exploration_anneal_episodes": 100, 317 | "gamma": 0.97, 318 | "hidden_layers": [400, 300], 319 | "hidden_layers_activation": "relu", 320 | "output_layer_activation": "tanh" 321 | }, 322 | "param_range": { 323 | "lr": [0.0001, 0.0005, 0.001], 324 | "critic_lr": [0.001, 0.005, 0.01], 325 | "gamma": [0.97, 0.99, 0.999], 326 | "hidden_layers": [ 327 | [400, 300], 328 | [600, 300], 329 | [800, 400, 200] 330 | ] 331 | } 332 | }, 333 | "walker_ddpg_per_linearnoise": { 334 | "problem": "BipedalWalker-v2", 335 | "Agent": "DDPG", 336 | "HyperOptimizer": "GridSearch", 337 | "Memory": "PrioritizedExperienceReplay", 338 | "Optimizer": "AdamOptimizer", 339 | "Policy": "LinearNoisePolicy", 340 | "PreProcessor": "NoPreProcessor", 341 | "param": { 342 | "batch_size": 64, 343 | "n_epoch": 1, 344 | "tau": 0.005, 345 | "lr": 0.0005, 346 | "critic_lr": 0.001, 347 | "gamma": 0.97, 348 | "hidden_layers": [400, 200], 349 | 
"hidden_layers_activation": "relu", 350 | "output_layer_activation": "tanh" 351 | }, 352 | "param_range": { 353 | "lr": [0.0001, 0.0005], 354 | "critic_lr": [0.001, 0.005], 355 | "gamma": [0.95, 0.97, 0.99], 356 | "hidden_layers": [ 357 | [200, 100], 358 | [400, 300], 359 | [800, 400] 360 | ] 361 | } 362 | } 363 | } 364 | -------------------------------------------------------------------------------- /rl/spec/component_locks.json: -------------------------------------------------------------------------------- 1 | { 2 | "double_network": { 3 | "type": "mutex", 4 | "details": "double_network agents need policies that invokes both networks properly", 5 | "head": "Agent", 6 | "Agent": [ 7 | "DoubleConvDQN", 8 | "DoubleDQN" 9 | ], 10 | "Policy": [ 11 | "DoubleDQNBoltzmannPolicy", 12 | "DoubleDQNEpsilonGreedyPolicy" 13 | ] 14 | }, 15 | "ddpg": { 16 | "type": "mutex", 17 | "details": "ddpg uses white-noise policy", 18 | "head": "Agent", 19 | "Agent": [ 20 | "DDPG" 21 | ], 22 | "Policy": [ 23 | "GaussianWhiteNoisePolicy", 24 | "LinearNoisePolicy", 25 | "NoNoisePolicy", 26 | "OUNoisePolicy" 27 | ] 28 | }, 29 | "actor_critic": { 30 | "type": "mutex", 31 | "details": "actor critic uses custom Q computation in its policy", 32 | "head": "Agent", 33 | "Agent": [ 34 | "ActorCritic" 35 | ], 36 | "Policy": [ 37 | "ArgmaxPolicy", 38 | "BoundedPolicy", 39 | "GaussianPolicy", 40 | "SoftmaxPolicy" 41 | ] 42 | }, 43 | "actor_critic_discrete": { 44 | "type": "subset", 45 | "details": "actor critic discrete components cannot work in continuous action space", 46 | "head": "problem", 47 | "problem": [ 48 | "Acrobot-v1", 49 | "AirRaid-v0", 50 | "Alien-v0", 51 | "Assault-v0", 52 | "Breakout-v0", 53 | "CartPole-v0", 54 | "CartPole-v1", 55 | "DevBreakout-v0", 56 | "DevCartPole-v0", 57 | "FlappyBird-v0", 58 | "LunarLander-v2", 59 | "MountainCar-v0", 60 | "MsPacman-v0", 61 | "Pong-v0", 62 | "Qbert-v0", 63 | "Snake-v0", 64 | "SpaceInvader-v0", 65 | "TestPassCartPole-v0" 66 | ], 67 | "Policy": [ 68 | "ArgmaxPolicy", 69 | "SoftmaxPolicy" 70 | ] 71 | }, 72 | "discrete_action": { 73 | "type": "subset", 74 | "details": "discrete components cannot work in continuous action space", 75 | "head": "problem", 76 | "problem": [ 77 | "Acrobot-v1", 78 | "AirRaid-v0", 79 | "Alien-v0", 80 | "Assault-v0", 81 | "Breakout-v0", 82 | "CartPole-v0", 83 | "CartPole-v1", 84 | "DevBreakout-v0", 85 | "DevCartPole-v0", 86 | "FlappyBird-v0", 87 | "LunarLander-v2", 88 | "MountainCar-v0", 89 | "MsPacman-v0", 90 | "Pong-v0", 91 | "Qbert-v0", 92 | "Snake-v0", 93 | "SpaceInvader-v0", 94 | "TestPassCartPole-v0" 95 | ], 96 | "Agent": [ 97 | "ConvDQN", 98 | "DeepExpectedSarsa", 99 | "DeepSarsa", 100 | "DoubleConvDQN", 101 | "DoubleDQN", 102 | "DQN", 103 | "Dummy", 104 | "FreezeDQN", 105 | "OffPolicySarsa", 106 | "QTable" 107 | ], 108 | "Policy": [ 109 | "BoltzmannPolicy", 110 | "DecayingEpsilonGreedyPolicy", 111 | "DoubleDQNBoltzmannPolicy", 112 | "DoubleDQNEpsilonGreedyPolicy", 113 | "EpsilonGreedyPolicy", 114 | "OscillatingEpsilonGreedyPolicy", 115 | "TargetedEpsilonGreedyPolicy" 116 | ] 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /rl/spec/dev_experiment_specs.json: -------------------------------------------------------------------------------- 1 | { 2 | "dummy": { 3 | "problem": "CartPole-v0", 4 | "Agent": "Dummy", 5 | "HyperOptimizer": "GridSearch", 6 | "Memory": "LinearMemory", 7 | "Optimizer": "SGDOptimizer", 8 | "Policy": "EpsilonGreedyPolicy", 9 | "PreProcessor": "NoPreProcessor", 10 | 
"param": {} 11 | }, 12 | "q_table": { 13 | "problem": "CartPole-v0", 14 | "Agent": "QTable", 15 | "HyperOptimizer": "GridSearch", 16 | "Memory": "LinearMemory", 17 | "Optimizer": "SGDOptimizer", 18 | "Policy": "EpsilonGreedyPolicy", 19 | "PreProcessor": "NoPreProcessor", 20 | "param": { 21 | "lr": 0.01, 22 | "gamma": 0.99, 23 | "exploration_anneal_episodes": 100 24 | } 25 | }, 26 | "test_dqn_pass": { 27 | "problem": "TestPassCartPole-v0", 28 | "Agent": "DQN", 29 | "HyperOptimizer": "GridSearch", 30 | "Memory": "LinearMemoryWithForgetting", 31 | "Optimizer": "AdamOptimizer", 32 | "Policy": "BoltzmannPolicy", 33 | "PreProcessor": "NoPreProcessor", 34 | "param": { 35 | "lr": 0.01, 36 | "decay": 0.0, 37 | "gamma": 0.99, 38 | "hidden_layers": [16], 39 | "hidden_layers_activation": "sigmoid", 40 | "exploration_anneal_episodes": 10 41 | }, 42 | "param_range": { 43 | "lr": [0.0001, 0.0005], 44 | "gamma": [0.97, 0.99] 45 | } 46 | }, 47 | "test_dqn_grid_search": { 48 | "problem": "DevCartPole-v0", 49 | "Agent": "DQN", 50 | "HyperOptimizer": "GridSearch", 51 | "Memory": "LinearMemoryWithForgetting", 52 | "Optimizer": "AdamOptimizer", 53 | "Policy": "BoltzmannPolicy", 54 | "PreProcessor": "NoPreProcessor", 55 | "param": { 56 | "lr": 0.01, 57 | "decay": 0.0, 58 | "gamma": 0.99, 59 | "hidden_layers": [16], 60 | "hidden_layers_activation": "sigmoid", 61 | "exploration_anneal_episodes": 10 62 | }, 63 | "param_range": { 64 | "lr": [0.0001, 0.0005], 65 | "gamma": [0.97, 0.99] 66 | } 67 | }, 68 | "test_dqn_random_search": { 69 | "problem": "DevCartPole-v0", 70 | "Agent": "DQN", 71 | "HyperOptimizer": "RandomSearch", 72 | "Memory": "LinearMemoryWithForgetting", 73 | "Optimizer": "AdamOptimizer", 74 | "Policy": "BoltzmannPolicy", 75 | "PreProcessor": "NoPreProcessor", 76 | "param": { 77 | "max_evals": 3, 78 | "lr": 0.01, 79 | "decay": 0.0, 80 | "gamma": 0.99, 81 | "hidden_layers": [16], 82 | "hidden_layers_activation": "sigmoid", 83 | "exploration_anneal_episodes": 10 84 | }, 85 | "param_range": { 86 | "lr": { 87 | "min": 0.0001, 88 | "max": 0.005 89 | }, 90 | "gamma": { 91 | "min": 0.90, 92 | "max": 0.999 93 | } 94 | } 95 | }, 96 | "dev_dqn": { 97 | "problem": "DevCartPole-v0", 98 | "Agent": "DQN", 99 | "HyperOptimizer": "GridSearch", 100 | "Memory": "PrioritizedExperienceReplay", 101 | "Optimizer": "AdamOptimizer", 102 | "Policy": "BoltzmannPolicy", 103 | "PreProcessor": "NoPreProcessor", 104 | "param": { 105 | "lr": 0.01, 106 | "decay": 0.0, 107 | "gamma": 0.99, 108 | "n_epoch": 1, 109 | "hidden_layers": [32], 110 | "hidden_layers_activation": "sigmoid", 111 | "exploration_anneal_episodes": 10, 112 | "auto_architecture": false, 113 | "num_hidden_layers": 3, 114 | "first_hidden_layer_size": 512, 115 | "e": 0.01, 116 | "alpha": 0.6, 117 | "max_mem_len": 7 118 | }, 119 | "param_range": { 120 | "gamma": [0.97, 0.99], 121 | "lr": [0.01, 0.1] 122 | } 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /rl/spec/problems.json: -------------------------------------------------------------------------------- 1 | { 2 | "DevCartPole-v0": { 3 | "GYM_ENV_NAME": "CartPole-v0", 4 | "SOLVED_MEAN_REWARD": 195.0, 5 | "MAX_EPISODES": 4, 6 | "REWARD_MEAN_LEN": 100 7 | }, 8 | "TestPassCartPole-v0": { 9 | "GYM_ENV_NAME": "CartPole-v0", 10 | "SOLVED_MEAN_REWARD": 50.0, 11 | "MAX_EPISODES": 20, 12 | "REWARD_MEAN_LEN": 100 13 | }, 14 | "CartPole-v0": { 15 | "GYM_ENV_NAME": "CartPole-v0", 16 | "SOLVED_MEAN_REWARD": 195.0, 17 | "MAX_EPISODES": 250, 18 | "REWARD_MEAN_LEN": 100 19 | }, 
20 | "CartPole-v1": { 21 | "GYM_ENV_NAME": "CartPole-v1", 22 | "SOLVED_MEAN_REWARD": 475.0, 23 | "MAX_EPISODES": 500, 24 | "REWARD_MEAN_LEN": 100 25 | }, 26 | "Acrobot-v1": { 27 | "GYM_ENV_NAME": "Acrobot-v1", 28 | "SOLVED_MEAN_REWARD": null, 29 | "MAX_EPISODES": 600, 30 | "REWARD_MEAN_LEN": 100 31 | }, 32 | "MountainCar-v0": { 33 | "GYM_ENV_NAME": "MountainCar-v0", 34 | "SOLVED_MEAN_REWARD": -110.0, 35 | "MAX_EPISODES": 1400, 36 | "REWARD_MEAN_LEN": 100 37 | }, 38 | "MountainCarContinuous-v0": { 39 | "GYM_ENV_NAME": "MountainCarContinuous-v0", 40 | "SOLVED_MEAN_REWARD": 90.0, 41 | "MAX_EPISODES": 5000, 42 | "REWARD_MEAN_LEN": 100 43 | }, 44 | "Pendulum-v0": { 45 | "GYM_ENV_NAME": "Pendulum-v0", 46 | "SOLVED_MEAN_REWARD": null, 47 | "MAX_EPISODES": 300, 48 | "REWARD_MEAN_LEN": 100 49 | }, 50 | "LunarLander-v2": { 51 | "GYM_ENV_NAME": "LunarLander-v2", 52 | "SOLVED_MEAN_REWARD": 200.0, 53 | "MAX_EPISODES": 600, 54 | "REWARD_MEAN_LEN": 100 55 | }, 56 | "LunarLanderContinuous-v2": { 57 | "GYM_ENV_NAME": "LunarLanderContinuous-v2", 58 | "SOLVED_MEAN_REWARD": 200.0, 59 | "MAX_EPISODES": 800, 60 | "REWARD_MEAN_LEN": 100 61 | }, 62 | "BipedalWalker-v2": { 63 | "GYM_ENV_NAME": "BipedalWalker-v2", 64 | "SOLVED_MEAN_REWARD": 300.0, 65 | "MAX_EPISODES": 5000, 66 | "REWARD_MEAN_LEN": 100 67 | }, 68 | "BipedalWalkerHardcore-v2": { 69 | "GYM_ENV_NAME": "BipedalWalkerHardcore-v2", 70 | "SOLVED_MEAN_REWARD": 300.0, 71 | "MAX_EPISODES": 5000, 72 | "REWARD_MEAN_LEN": 100 73 | }, 74 | "CarRacing-v0": { 75 | "GYM_ENV_NAME": "CarRacing-v0", 76 | "SOLVED_MEAN_REWARD": 900.0, 77 | "MAX_EPISODES": 5000, 78 | "REWARD_MEAN_LEN": 100 79 | }, 80 | "AirRaid-v0": { 81 | "GYM_ENV_NAME": "AirRaid-v0", 82 | "SOLVED_MEAN_REWARD": null, 83 | "MAX_EPISODES": 5000, 84 | "REWARD_MEAN_LEN": 100 85 | }, 86 | "Alien-v0": { 87 | "GYM_ENV_NAME": "Alien-v0", 88 | "SOLVED_MEAN_REWARD": null, 89 | "MAX_EPISODES": 5000, 90 | "REWARD_MEAN_LEN": 100 91 | }, 92 | "Assault-v0": { 93 | "GYM_ENV_NAME": "Assault-v0", 94 | "SOLVED_MEAN_REWARD": null, 95 | "MAX_EPISODES": 5000, 96 | "REWARD_MEAN_LEN": 100 97 | }, 98 | "DevBreakout-v0": { 99 | "GYM_ENV_NAME": "Breakout-v0", 100 | "SOLVED_MEAN_REWARD": null, 101 | "MAX_EPISODES": 1, 102 | "REWARD_MEAN_LEN": 100 103 | }, 104 | "Breakout-v0": { 105 | "GYM_ENV_NAME": "Breakout-v0", 106 | "SOLVED_MEAN_REWARD": null, 107 | "MAX_EPISODES": 5000, 108 | "REWARD_MEAN_LEN": 100 109 | }, 110 | "MsPacman-v0": { 111 | "GYM_ENV_NAME": "MsPacman-v0", 112 | "SOLVED_MEAN_REWARD": null, 113 | "MAX_EPISODES": 5000, 114 | "REWARD_MEAN_LEN": 100 115 | }, 116 | "Pong-v0": { 117 | "GYM_ENV_NAME": "Pong-v0", 118 | "SOLVED_MEAN_REWARD": null, 119 | "MAX_EPISODES": 5000, 120 | "REWARD_MEAN_LEN": 100 121 | }, 122 | "Qbert-v0": { 123 | "GYM_ENV_NAME": "Qbert-v0", 124 | "SOLVED_MEAN_REWARD": null, 125 | "MAX_EPISODES": 5000, 126 | "REWARD_MEAN_LEN": 100 127 | }, 128 | "SpaceInvader-v0": { 129 | "GYM_ENV_NAME": "SpaceInvader-v0", 130 | "SOLVED_MEAN_REWARD": null, 131 | "MAX_EPISODES": 5000, 132 | "REWARD_MEAN_LEN": 100 133 | }, 134 | "FlappyBird-v0": { 135 | "GYM_ENV_NAME": "FlappyBird-v0", 136 | "SOLVED_MEAN_REWARD": null, 137 | "MAX_EPISODES": 1000, 138 | "REWARD_MEAN_LEN": 100 139 | }, 140 | "Snake-v0": { 141 | "GYM_ENV_NAME": "Snake-v0", 142 | "SOLVED_MEAN_REWARD": null, 143 | "MAX_EPISODES": 1000, 144 | "REWARD_MEAN_LEN": 100 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /rl/spec/pygame_experiment_specs.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "flappy": { 3 | "problem": "FlappyBird-v0", 4 | "Agent": "ConvDQN", 5 | "HyperOptimizer": "GridSearch", 6 | "Memory": "LinearMemoryWithForgetting", 7 | "Optimizer": "AdamOptimizer", 8 | "Policy": "EpsilonGreedyPolicy", 9 | "PreProcessor": "NoPreProcessor", 10 | "param": { 11 | "train_per_n_new_exp": 4, 12 | "batch_size": 32, 13 | "lr": 0.001, 14 | "gamma": 0.99, 15 | "hidden_layers": [ 16 | [16, 8, 8, [4, 4]], 17 | [32, 4, 4, [2, 2]] 18 | ], 19 | "hidden_layers_activation": "relu", 20 | "exploration_anneal_episodes": 5000, 21 | "epi_change_lr": 5000 22 | }, 23 | "param_range": { 24 | "lr": [0.001, 0.0001], 25 | "gamma": [0.97, 0.99] 26 | } 27 | }, 28 | "snake": { 29 | "problem": "Snake-v0", 30 | "Agent": "ConvDQN", 31 | "HyperOptimizer": "GridSearch", 32 | "Memory": "LinearMemoryWithForgetting", 33 | "Optimizer": "AdamOptimizer", 34 | "Policy": "EpsilonGreedyPolicy", 35 | "PreProcessor": "NoPreProcessor", 36 | "param": { 37 | "train_per_n_new_exp": 4, 38 | "batch_size": 32, 39 | "lr": 0.001, 40 | "gamma": 0.99, 41 | "hidden_layers": [ 42 | [16, 8, 8, [4, 4]], 43 | [32, 4, 4, [2, 2]] 44 | ], 45 | "hidden_layers_activation": "relu", 46 | "exploration_anneal_episodes": 5000, 47 | "epi_change_lr": 5000 48 | }, 49 | "param_range": { 50 | "lr": [0.001, 0.0001], 51 | "gamma": [0.97, 0.99] 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /rl/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import inspect 4 | import json 5 | import logging 6 | import multiprocessing as mp 7 | import numpy as np 8 | import re 9 | import sys 10 | import zipfile 11 | from datetime import datetime, timedelta 12 | from os import path, listdir, environ, getpid 13 | from textwrap import wrap 14 | 15 | PARALLEL_PROCESS_NUM = mp.cpu_count() 16 | TIMESTAMP_REGEX = r'(\d{4}_\d{2}_\d{2}_\d{6})' 17 | SPEC_PATH = path.join(path.dirname(__file__), 'spec') 18 | COMPONENT_LOCKS = json.loads( 19 | open(path.join(SPEC_PATH, 'component_locks.json')).read()) 20 | LOCK_HEAD_REST_SIG = { 21 | # signature list of [head, rest] in component lock 22 | 'mutex': [[0, 0], [1, 1]], 23 | 'subset': [[0, 0], [1, 0], [1, 1]], 24 | } 25 | 26 | 27 | # parse_args to add flag 28 | parser = argparse.ArgumentParser(description='Set flags for functions') 29 | parser.add_argument("-b", "--blind", 30 | help="dont render graphics", 31 | action="store_const", 32 | dest="render", 33 | const=False, 34 | default=True) 35 | parser.add_argument("-d", "--debug", 36 | help="activate debug log", 37 | action="store_const", 38 | dest="loglevel", 39 | const=logging.DEBUG, 40 | default=logging.INFO) 41 | parser.add_argument("-e", "--experiment", 42 | help="specify experiment to run", 43 | action="store", 44 | type=str, 45 | nargs='?', 46 | dest="experiment", 47 | default="dev_dqn") 48 | parser.add_argument("-p", "--param_selection", 49 | help="run parameter selection if present", 50 | action="store_true", 51 | dest="param_selection", 52 | default=False) 53 | parser.add_argument("-q", "--quiet", 54 | help="change log to warning level", 55 | action="store_const", 56 | dest="loglevel", 57 | const=logging.WARNING, 58 | default=logging.INFO) 59 | parser.add_argument("-t", "--times", 60 | help="number of times session is run", 61 | action="store", 62 | nargs='?', 63 | type=int, 64 | dest="times", 65 | default=1) 66 | parser.add_argument("-x", 
"--max_episodes", 67 | help="manually set environment max episodes", 68 | action="store", 69 | nargs='?', 70 | type=int, 71 | dest="max_epis", 72 | default=-1) 73 | args = parser.parse_args([]) if environ.get('CI') else parser.parse_args() 74 | 75 | # Goddam python logger 76 | logger = logging.getLogger(__name__) 77 | handler = logging.StreamHandler(sys.stdout) 78 | handler.setFormatter( 79 | logging.Formatter('[%(asctime)s] %(levelname)s: %(message)s')) 80 | logger.setLevel(args.loglevel) 81 | logger.addHandler(handler) 82 | logger.propagate = False 83 | environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # mute tf warnings on optimized setup 84 | 85 | 86 | def check_equal(iterator): 87 | '''check if list contains all the same elements''' 88 | iterator = iter(iterator) 89 | try: 90 | first = next(iterator) 91 | except StopIteration: 92 | return True 93 | return all(first == rest for rest in iterator) 94 | 95 | 96 | def check_lock(lock_name, lock, experiment_spec): 97 | ''' 98 | refer to rl/spec/component_locks.json 99 | check a spec's component lock using binary signatures 100 | e.g. head = problem (discrete) 101 | rest = [Agent, Policy] (to be discrete too) 102 | first check if rest all has the same signature, i.e. same set 103 | then check pair [bin_head, bin_rest] in valid_lock_sig_list 104 | as specified by the lock's type 105 | ''' 106 | lock_type = lock['type'] 107 | valid_lock_sig_list = LOCK_HEAD_REST_SIG[lock_type] 108 | lock_head = lock['head'] 109 | bin_head = (experiment_spec[lock_head] in lock[lock_head]) 110 | bin_rest_list = [] 111 | for k, v_list in lock.items(): 112 | if k in experiment_spec and k != lock_head: 113 | bin_rest_list.append(experiment_spec[k] in v_list) 114 | # rest must all have the same signature 115 | rest_equal = check_equal(bin_rest_list) 116 | if not rest_equal: 117 | logger.warn( 118 | 'All components need to be of the same set, ' 119 | 'check component lock "{}" and your spec "{}"'.format( 120 | lock_name, experiment_spec['experiment_name'])) 121 | 122 | bin_rest = bin_rest_list[0] 123 | lock_sig = [bin_head, bin_rest] 124 | lock_valid = lock_sig in valid_lock_sig_list 125 | if not lock_valid: 126 | logger.warn( 127 | 'Component lock violated: "{}", spec: "{}"'.format( 128 | lock_name, experiment_spec['experiment_name'])) 129 | return lock_valid 130 | 131 | 132 | def check_component_locks(experiment_spec): 133 | ''' 134 | check the spec components for all locks 135 | to ensure no lock is violated 136 | refer to rl/spec/component_locks.json 137 | ''' 138 | for lock_name, lock in COMPONENT_LOCKS.items(): 139 | check_lock(lock_name, lock, experiment_spec) 140 | return 141 | 142 | 143 | # import and safeguard the PROBLEMS, EXPERIMENT_SPECS with checks 144 | def import_guard_asset(): 145 | PROBLEMS = json.loads(open(path.join(SPEC_PATH, 'problems.json')).read()) 146 | EXPERIMENT_SPECS = {} 147 | spec_files = [spec_json for spec_json in listdir( 148 | SPEC_PATH) if spec_json.endswith('experiment_specs.json')] 149 | for filename in spec_files: 150 | specs = json.loads(open(path.join(SPEC_PATH, filename)).read()) 151 | EXPERIMENT_SPECS.update(specs) 152 | 153 | REQUIRED_PROBLEM_KEYS = [ 154 | 'GYM_ENV_NAME', 'SOLVED_MEAN_REWARD', 155 | 'MAX_EPISODES', 'REWARD_MEAN_LEN'] 156 | REQUIRED_SPEC_KEYS = [ 157 | 'problem', 'Agent', 'HyperOptimizer', 158 | 'Memory', 'Optimizer', 'Policy', 'PreProcessor', 'param'] 159 | 160 | for problem_name, problem in PROBLEMS.items(): 161 | assert all(k in problem for k in REQUIRED_PROBLEM_KEYS), \ 162 | '{} needs all 
REQUIRED_PROBLEM_KEYS'.format( 163 | problem_name) 164 | 165 | for experiment_name, spec in EXPERIMENT_SPECS.items(): 166 | assert all(k in spec for k in REQUIRED_SPEC_KEYS), \ 167 | '{} needs all REQUIRED_SPEC_KEYS'.format(experiment_name) 168 | EXPERIMENT_SPECS[experiment_name]['experiment_name'] = experiment_name 169 | check_component_locks(spec) # check component_locks.json 170 | if 'param_range' not in EXPERIMENT_SPECS[experiment_name]: 171 | continue 172 | 173 | param_range = EXPERIMENT_SPECS[experiment_name]['param_range'] 174 | for param_key, param_val in param_range.items(): 175 | if isinstance(param_val, list): 176 | param_range[param_key] = sorted(param_val) 177 | elif isinstance(param_val, dict): 178 | pass 179 | else: 180 | assert False, \ 181 | 'param_range value must be list or dict: {}.{}:{}'.format( 182 | experiment_name, param_key, param_val) 183 | 184 | EXPERIMENT_SPECS[experiment_name]['param_range'] = param_range 185 | return PROBLEMS, EXPERIMENT_SPECS 186 | 187 | PROBLEMS, EXPERIMENT_SPECS = import_guard_asset() 188 | 189 | 190 | def log_self(subject): 191 | max_info_len = 300 192 | info = '{}, param: {}'.format( 193 | subject.__class__.__name__, 194 | to_json(subject.__dict__)) 195 | trunc_info = ( 196 | info[:max_info_len] + '...' if len(info) > max_info_len else info) 197 | logger.debug(trunc_info) 198 | 199 | 200 | def wrap_text(text): 201 | return '\n'.join(wrap(text, 60)) 202 | 203 | 204 | def make_line(line='-'): 205 | if environ.get('CI'): 206 | return 207 | columns = 80 208 | line_str = line*int(columns) 209 | return line_str 210 | 211 | 212 | def log_delimiter(msg, line='-'): 213 | delim_msg = '''\n{0}\n{1}\n{0}\n\n'''.format( 214 | make_line(line), msg) 215 | logger.info(delim_msg) 216 | 217 | 218 | def log_trial_delimiter(trial, action): 219 | log_delimiter('{} Trial #{}/{} on PID {}:\n{}'.format( 220 | action, trial.trial_num, trial.num_of_trials, 221 | getpid(), trial.trial_id), '=') 222 | 223 | 224 | def log_session_delimiter(sess, action): 225 | log_delimiter( 226 | '{} Session #{}/{} of Trial #{}/{} on PID {}:\n{}'.format( 227 | action, sess.session_num, sess.num_of_sessions, 228 | sess.trial.trial_num, sess.trial.num_of_trials, 229 | getpid(), sess.session_id)) 230 | 231 | 232 | def timestamp(): 233 | '''timestamp used for filename''' 234 | timestamp_str = '{:%Y_%m_%d_%H%M%S}'.format(datetime.now()) 235 | assert re.search(TIMESTAMP_REGEX, timestamp_str) 236 | return timestamp_str 237 | 238 | 239 | def timestamp_elapse(s1, s2): 240 | '''calculate the time elapsed between timestamps from s1 to s2''' 241 | FMT = '%Y_%m_%d_%H%M%S' 242 | delta_t = datetime.strptime(s2, FMT) - datetime.strptime(s1, FMT) 243 | return str(delta_t) 244 | 245 | 246 | def timestamp_elapse_to_seconds(s1): 247 | a = datetime.strptime(s1, '%H:%M:%S') 248 | secs = timedelta(hours=a.hour, minutes=a.minute, seconds=a.second).seconds 249 | return secs 250 | 251 | 252 | # own custom sorted json serializer, cuz python 253 | def to_json(o, level=0): 254 | INDENT = 2 255 | SPACE = " " 256 | NEWLINE = "\n" 257 | ret = "" 258 | if isinstance(o, dict): 259 | ret += "{" + NEWLINE 260 | comma = "" 261 | for k in sorted(o.keys()): 262 | v = o[k] 263 | ret += comma 264 | comma = ",\n" 265 | ret += SPACE * INDENT * (level+1) 266 | ret += '"' + str(k) + '":' + SPACE 267 | ret += to_json(v, level + 1) 268 | 269 | ret += NEWLINE + SPACE * INDENT * level + "}" 270 | elif isinstance(o, str): 271 | ret += '"' + o + '"' 272 | elif isinstance(o, list) or isinstance(o, tuple): 273 | ret += "[" + 
",".join([to_json(e, level+1) for e in o]) + "]" 274 | elif isinstance(o, bool): 275 | ret += "true" if o else "false" 276 | elif isinstance(o, int): 277 | ret += str(o) 278 | elif isinstance(o, float): 279 | ret += '%.7g' % o 280 | elif isinstance(o, np.ndarray) and np.issubdtype(o.dtype, np.integer): 281 | ret += "[" + ','.join(map(str, o.flatten().tolist())) + "]" 282 | elif isinstance(o, np.ndarray) and np.issubdtype(o.dtype, np.inexact): 283 | ret += "[" + \ 284 | ','.join(map(lambda x: '%.7g' % x, o.flatten().tolist())) + "]" 285 | elif o is None: 286 | ret += 'null' 287 | elif hasattr(o, '__class__'): 288 | ret += '"' + o.__class__.__name__ + '"' 289 | else: 290 | raise TypeError( 291 | "Unknown type '%s' for json serialization" % str(type(o))) 292 | return ret 293 | 294 | 295 | # format object and its properties into printable dict 296 | def format_obj_dict(obj, keys): 297 | if isinstance(obj, dict): 298 | return to_json( 299 | {k: obj.get(k) for k in keys if obj.get(k) is not None}) 300 | else: 301 | return to_json( 302 | {k: getattr(obj, k, None) for k in keys 303 | if getattr(obj, k, None) is not None}) 304 | 305 | 306 | # cast dict to have flat values (int, float, str) 307 | def flat_cast_dict(d): 308 | for k in d: 309 | v = d[k] 310 | if not isinstance(v, (int, float)): 311 | d[k] = str(v) 312 | return d 313 | 314 | 315 | def flatten_dict(d, parent_key='', sep='_'): 316 | items = [] 317 | for k, v in d.items(): 318 | new_key = parent_key + sep + k if parent_key else k 319 | if isinstance(v, collections.MutableMapping): 320 | items.extend(flatten_dict(v, new_key, sep=sep).items()) 321 | else: 322 | items.append((new_key, v)) 323 | return dict(items) 324 | 325 | 326 | def get_module(GREF, dot_path): 327 | # get module from globals() by string dot_path 328 | path_arr = dot_path.split('.') 329 | # base level from globals 330 | mod = GREF.get(path_arr.pop(0)) 331 | for deeper_path in path_arr: 332 | mod = getattr(mod, deeper_path) 333 | return mod 334 | 335 | 336 | def import_package_files(globals_, locals_, __file__): 337 | ''' 338 | Dynamically import all the public attributes of the python modules in this 339 | file's directory (the package directory) and return a list of their names. 
340 | ''' 341 | exports = [] 342 | # globals_, locals_ = globals(), locals() 343 | package_path = path.dirname(__file__) 344 | package_name = path.basename(package_path) 345 | 346 | for filename in listdir(package_path): 347 | modulename, ext = path.splitext(filename) 348 | if modulename[0] != '_' and ext in ('.py', '.pyw'): 349 | subpackage = '{}.{}'.format( 350 | package_name, modulename) # pkg relative 351 | module = __import__(subpackage, globals_, locals_, [modulename]) 352 | modict = module.__dict__ 353 | names = (modict['__all__'] if '__all__' in modict else 354 | [name for name in 355 | modict if inspect.isclass(modict[name])]) # all public 356 | exports.extend(names) 357 | globals_.update((name, modict[name]) for name in names) 358 | 359 | return exports 360 | 361 | 362 | def clean_id_str(id_str): 363 | return id_str.split('/').pop().split('.').pop(0) 364 | 365 | 366 | def parse_trial_id(id_str): 367 | c_id_str = clean_id_str(id_str) 368 | if re.search(TIMESTAMP_REGEX, c_id_str): 369 | name_time_trial = re.split(TIMESTAMP_REGEX, c_id_str) 370 | if len(name_time_trial) == 3: 371 | return c_id_str 372 | else: 373 | return None 374 | else: 375 | return None 376 | 377 | 378 | def parse_experiment_id(id_str): 379 | c_id_str = clean_id_str(id_str) 380 | if re.search(TIMESTAMP_REGEX, c_id_str): 381 | name_time_trial = re.split(TIMESTAMP_REGEX, c_id_str) 382 | name_time_trial.pop() 383 | experiment_id = ''.join(name_time_trial) 384 | return experiment_id 385 | else: 386 | return None 387 | 388 | 389 | def parse_experiment_name(id_str): 390 | c_id_str = clean_id_str(id_str) 391 | experiment_id = parse_experiment_id(c_id_str) 392 | if experiment_id is None: 393 | experiment_name = c_id_str 394 | else: 395 | experiment_name = re.sub(TIMESTAMP_REGEX, '', experiment_id).strip('-') 396 | assert experiment_name in EXPERIMENT_SPECS, \ 397 | '{} not in EXPERIMENT_SPECS'.format(experiment_name) 398 | return experiment_name 399 | 400 | 401 | def load_data_from_trial_id(id_str): 402 | experiment_id = parse_experiment_id(id_str) 403 | trial_id = parse_trial_id(id_str) 404 | data_filename = './data/{}/{}.json'.format(experiment_id, trial_id) 405 | try: 406 | data = json.loads(open(data_filename).read()) 407 | except (FileNotFoundError, json.JSONDecodeError): 408 | data = None 409 | return data 410 | 411 | 412 | def load_data_array_from_experiment_id(id_str): 413 | # to load all ./data files for a series of trials 414 | experiment_id = parse_experiment_id(id_str) 415 | data_path = './data/{}'.format(experiment_id) 416 | trial_id_array = [ 417 | f for f in listdir(data_path) 418 | if (path.isfile(path.join(data_path, f)) and 419 | f.startswith(experiment_id) and 420 | f.endswith('.json')) 421 | ] 422 | return list(filter(None, [load_data_from_trial_id(trial_id) 423 | for trial_id in trial_id_array])) 424 | 425 | 426 | def save_experiment_data(data_df, trial_id): 427 | experiment_id = parse_experiment_id(trial_id) 428 | filedir = './data/{0}'.format(experiment_id) 429 | filename = '{0}_analysis_data.csv'.format(experiment_id) 430 | filepath = '{}/{}'.format(filedir, filename) 431 | data_df.round(6).to_csv(filepath, index=False) 432 | 433 | # zip the csv and best trial json for upload to PR 434 | zipfile.ZipFile(filepath+'.zip', mode='w').write( 435 | filepath, arcname=filename) 436 | trial_filename = data_df.loc[0, 'trial_id'] + '.json' 437 | trial_filepath = '{}/{}'.format(filedir, trial_filename) 438 | zipfile.ZipFile(trial_filepath+'.zip', mode='w').write( 439 | trial_filepath, arcname=trial_filename) 
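# --- illustrative aside, not part of the original module ---
# For a hypothetical trial id string 'dqn-2017_01_01_123456_t0', the id helpers
# above resolve as follows:
#   parse_experiment_id('dqn-2017_01_01_123456_t0')   -> 'dqn-2017_01_01_123456'
#   parse_trial_id('dqn-2017_01_01_123456_t0')        -> 'dqn-2017_01_01_123456_t0'
#   parse_experiment_name('dqn-2017_01_01_123456_t0') -> 'dqn'  (asserted to be in EXPERIMENT_SPECS)
# load_data_from_trial_id then resolves './data/<experiment_id>/<trial_id>.json',
# and save_experiment_data (above) writes, under ./data/dqn-2017_01_01_123456/,
# the <experiment_id>_analysis_data.csv plus its .zip, and a .zip of the first
# (best) trial's .json for upload to a PR.
# --- end aside ---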
440 | 441 | logger.info( 442 | 'experiment data saved to {}'.format(filepath)) 443 | 444 | 445 | def configure_hardware(RAND_SEED): 446 | '''configure rand seed, GPU''' 447 | from keras import backend as K 448 | if K.backend() == 'tensorflow': 449 | K.tf.set_random_seed(RAND_SEED) 450 | else: 451 | K.theano.tensor.shared_randomstreams.RandomStreams(seed=RAND_SEED) 452 | 453 | if K.backend() != 'tensorflow': 454 | # GPU config for tf only 455 | return 456 | 457 | process_num = PARALLEL_PROCESS_NUM if args.param_selection else 1 458 | tf = K.tf 459 | gpu_options = tf.GPUOptions( 460 | allow_growth=True, 461 | per_process_gpu_memory_fraction=1./float(process_num)) 462 | config = tf.ConfigProto( 463 | gpu_options=gpu_options, 464 | allow_soft_placement=True) 465 | sess = tf.Session(config=config) 466 | K.set_session(sess) 467 | return sess 468 | 469 | 470 | def debug_mem_usage(): 471 | import psutil 472 | from mem_top import mem_top 473 | pid = getpid() 474 | logger.debug( 475 | 'MEM USAGE for PID {}, MEM_INFO: {}\n{}'.format( 476 | pid, psutil.Process().memory_info(), mem_top())) 477 | 478 | 479 | def del_self_attr(subject): 480 | self_attrs = list(subject.__dict__.keys()) 481 | for attr in self_attrs: 482 | delattr(subject, attr) 483 | import gc 484 | gc.collect() 485 | 486 | 487 | # clone a keras model without file I/O 488 | def clone_model(model, custom_objects=None): 489 | from keras.models import model_from_config 490 | custom_objects = custom_objects or {} 491 | config = { 492 | 'class_name': model.__class__.__name__, 493 | 'config': model.get_config(), 494 | } 495 | clone = model_from_config(config, custom_objects=custom_objects) 496 | clone.set_weights(model.get_weights()) 497 | return clone 498 | 499 | 500 | # clone a keras optimizer without file I/O 501 | def clone_optimizer(optimizer): 502 | from keras.optimizers import optimizer_from_config 503 | if isinstance(optimizer, str): 504 | return get(optimizer) 505 | params = dict([(k, v) for k, v in optimizer.get_config().items()]) 506 | config = { 507 | 'class_name': optimizer.__class__.__name__, 508 | 'config': params, 509 | } 510 | clone = optimizer_from_config(config) 511 | return clone 512 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from setuptools import setup 4 | from setuptools.command.test import test as TestCommand 5 | 6 | 7 | # explicitly config 8 | test_args = [ 9 | '-n 4', 10 | '--cov-report=term', 11 | '--cov-report=html', 12 | '--cov=rl', 13 | 'test' 14 | ] 15 | 16 | 17 | class PyTest(TestCommand): 18 | user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] 19 | 20 | def initialize_options(self): 21 | TestCommand.initialize_options(self) 22 | self.pytest_args = test_args 23 | 24 | def run_tests(self): 25 | # import here, cause outside the eggs aren't loaded 26 | import pytest 27 | errno = pytest.main(self.pytest_args) 28 | sys.exit(errno) 29 | 30 | 31 | # Utility function to read the README file. 
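# (illustrative aside) the PyTest command class above wires `python setup.py test`
# to py.test with test_args: 4 parallel workers (-n, a pytest-xdist flag),
# terminal + HTML coverage reports scoped to the rl package, run against test/.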
32 | def read(fname): 33 | return open(os.path.join(os.path.dirname(__file__), fname)).read() 34 | 35 | 36 | # the setup 37 | setup( 38 | name='openai_lab', 39 | version='1.0.0', 40 | description='An experimentation system for Reinforcement Learning using OpenAI and Keras', 41 | long_description=read('README.md'), 42 | keywords='openai gym', 43 | url='https://github.com/kengz/openai_lab', 44 | author='kengz,lgraesser', 45 | author_email='kengzwl@gmail.com', 46 | license='MIT', 47 | packages=[], 48 | zip_safe=False, 49 | include_package_data=True, 50 | install_requires=[], 51 | dependency_links=[], 52 | extras_require={ 53 | 'dev': [], 54 | 'docs': [], 55 | 'testing': [] 56 | }, 57 | classifiers=[], 58 | tests_require=['pytest', 'pytest-cov'], 59 | test_suite='test', 60 | cmdclass={'test': PyTest} 61 | ) 62 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kengz/openai_lab/d0669d89268f2dc01c1cf878e4879775c7b6eb3c/test/__init__.py -------------------------------------------------------------------------------- /test/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import rl 3 | from os import environ 4 | 5 | environ['CI'] = environ.get('CI') or 'true' 6 | 7 | 8 | def pytest_runtest_setup(item): 9 | for problem in rl.util.PROBLEMS: 10 | if problem == 'TestPassCartPole-v0': 11 | pass 12 | else: 13 | rl.util.PROBLEMS[problem]['MAX_EPISODES'] = 3 14 | -------------------------------------------------------------------------------- /test/test_atari.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pytest 3 | from os import environ 4 | from rl.experiment import run 5 | from . import conftest 6 | import pandas as pd 7 | 8 | 9 | class AtariTest(unittest.TestCase): 10 | 11 | @unittest.skipIf(environ.get('CI'), "Delay CI test until dev stable") 12 | @classmethod 13 | def test_breakout_dqn(cls): 14 | data_df = run('breakout_dqn') 15 | assert isinstance(data_df, pd.DataFrame) 16 | 17 | @unittest.skipIf(environ.get('CI'), "Delay CI test until dev stable") 18 | @classmethod 19 | def test_breakout_double_dqn(cls): 20 | data_df = run('breakout_double_dqn') 21 | assert isinstance(data_df, pd.DataFrame) 22 | -------------------------------------------------------------------------------- /test/test_box2d.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pytest 3 | from os import environ 4 | from rl.experiment import run 5 | from . 
import conftest 6 | import pandas as pd 7 | 8 | 9 | class Box2DTest(unittest.TestCase): 10 | 11 | @classmethod 12 | def test_lunar_dqn(cls): 13 | data_df = run('lunar_dqn') 14 | assert isinstance(data_df, pd.DataFrame) 15 | 16 | @classmethod 17 | def test_lunar_double_dqn(cls): 18 | data_df = run('lunar_double_dqn') 19 | assert isinstance(data_df, pd.DataFrame) 20 | 21 | @classmethod 22 | def test_lunar_freeze(cls): 23 | data_df = run('lunar_freeze') 24 | assert isinstance(data_df, pd.DataFrame) 25 | 26 | @classmethod 27 | def test_walker_ddpg_linearnoise(cls): 28 | data_df = run('walker_ddpg_linearnoise') 29 | assert isinstance(data_df, pd.DataFrame) 30 | -------------------------------------------------------------------------------- /test/test_classic.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pytest 3 | from os import environ 4 | from rl.experiment import run 5 | from . import conftest 6 | import pandas as pd 7 | 8 | 9 | class ClassicTest(unittest.TestCase): 10 | 11 | @classmethod 12 | def test_quickstart_dqn(cls): 13 | data_df = run('quickstart_dqn') 14 | assert isinstance(data_df, pd.DataFrame) 15 | 16 | @classmethod 17 | def test_dqn_epsilon(cls): 18 | data_df = run('dqn_epsilon') 19 | assert isinstance(data_df, pd.DataFrame) 20 | 21 | @classmethod 22 | def test_dqn(cls): 23 | data_df = run('dqn') 24 | assert isinstance(data_df, pd.DataFrame) 25 | 26 | @classmethod 27 | def test_dqn_per(cls): 28 | data_df = run('dqn_per') 29 | assert isinstance(data_df, pd.DataFrame) 30 | 31 | @classmethod 32 | def test_double_dqn(cls): 33 | data_df = run('double_dqn') 34 | assert isinstance(data_df, pd.DataFrame) 35 | 36 | @classmethod 37 | def test_sarsa(cls): 38 | data_df = run('sarsa') 39 | assert isinstance(data_df, pd.DataFrame) 40 | 41 | @classmethod 42 | def test_exp_sarsa(cls): 43 | data_df = run('exp_sarsa') 44 | assert isinstance(data_df, pd.DataFrame) 45 | 46 | @classmethod 47 | def test_offpol_sarsa(cls): 48 | data_df = run('offpol_sarsa') 49 | assert isinstance(data_df, pd.DataFrame) 50 | 51 | @classmethod 52 | def test_cartpole_ac_argmax(cls): 53 | data_df = run('cartpole_ac_argmax') 54 | assert isinstance(data_df, pd.DataFrame) 55 | 56 | @classmethod 57 | def test_dqn_v1(cls): 58 | data_df = run('dqn_v1') 59 | assert isinstance(data_df, pd.DataFrame) 60 | 61 | @classmethod 62 | def test_acrobot(cls): 63 | data_df = run('acrobot') 64 | assert isinstance(data_df, pd.DataFrame) 65 | 66 | @classmethod 67 | def test_pendulum_ddpg_linearnoise(cls): 68 | data_df = run('pendulum_ddpg_linearnoise') 69 | assert isinstance(data_df, pd.DataFrame) 70 | 71 | @classmethod 72 | def test_mountain_dqn(cls): 73 | data_df = run('mountain_dqn') 74 | assert isinstance(data_df, pd.DataFrame) 75 | -------------------------------------------------------------------------------- /test/test_dev.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pytest 3 | from os import environ 4 | from rl.experiment import run 5 | from . 
import conftest 6 | import pandas as pd 7 | 8 | 9 | class DevTest(unittest.TestCase): 10 | 11 | @classmethod 12 | def test_clean_import(cls): 13 | print(dir()) 14 | assert 'keras' not in dir( 15 | ), 'keras import should be contained within classes' 16 | assert 'matplotlib' not in dir( 17 | ), 'matplotlib import should be contained within classes' 18 | 19 | @classmethod 20 | def test_gym_tour(cls): 21 | data_df = run('dummy') 22 | assert isinstance(data_df, pd.DataFrame) 23 | 24 | @classmethod 25 | def test_q_table(cls): 26 | data_df = run('q_table') 27 | assert isinstance(data_df, pd.DataFrame) 28 | 29 | @unittest.skipIf(environ.get('CI'), 30 | "Causing build to crash since it's unstable.") 31 | @classmethod 32 | def test_dqn_pass(cls): 33 | data_df = run('test_dqn_pass') 34 | max_total_rewards = data_df['max_total_rewards_stats_mean'][0] 35 | print(max_total_rewards) 36 | assert max_total_rewards > 50, 'dqn failed to hit max_total_rewards' 37 | 38 | # TODO running this grid search together with the other tests hangs the suite; re-enable once stable 39 | # @classmethod 40 | # def test_dqn_grid_search(cls): 41 | # data_df = run('test_dqn_grid_search', param_selection=True) 42 | # assert isinstance(data_df, pd.DataFrame) 43 | 44 | # TODO running this random search together with the other tests hangs the suite; re-enable once stable 45 | # @classmethod 46 | # def test_dqn_random_search(cls): 47 | # data_df = run('test_dqn_random_search', param_selection=True) 48 | # assert isinstance(data_df, pd.DataFrame) 49 | --------------------------------------------------------------------------------
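Taken together, the tests above exercise the lab's single entry point: rl.experiment.run(experiment_name) resolves a spec from rl/spec/*_experiment_specs.json, runs it, and returns a pandas DataFrame of analysis data. A minimal usage sketch, assuming the dependencies from environment.yml/requirements.txt are installed (the flags parsed in rl/util.py, such as -e to pick an experiment, -t for repeat sessions, -b to skip rendering and -p for parameter selection, presumably feed main.py for command-line runs):

from rl.experiment import run

# run the 'dqn' experiment spec once, as test_classic.py does
data_df = run('dqn')
print(data_df.shape)  # analysis stats, e.g. 'max_total_rewards_stats_mean' per test_dev.py

# hyperparameter search over a spec's param_range uses the same entry point:
# run('test_dqn_grid_search', param_selection=True)  # see the commented-out dev tests above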