├── .gitignore ├── LICENSE ├── hw1 ├── README.md ├── demo.bash ├── experts │ ├── Ant-v2.pkl │ ├── HalfCheetah-v2.pkl │ ├── Hopper-v2.pkl │ ├── Humanoid-v2.pkl │ ├── Reacher-v2.pkl │ └── Walker2d-v2.pkl ├── load_policy.py ├── requirements.txt ├── run_expert.py └── tf_util.py ├── hw2 ├── README.md ├── hw2_instructions.pdf ├── hw2_instructions.tex ├── logz.py ├── lunar_lander.py ├── plot.py ├── requirements.txt └── train_pg_f18.py ├── hw3 ├── README.md ├── atari_wrappers.py ├── dqn.py ├── dqn_utils.py ├── logz.py ├── lunar_lander.py ├── plot.py ├── requirements.txt ├── run_dqn_atari.py ├── run_dqn_lander.py ├── run_dqn_ram.py └── train_ac_f18.py ├── hw4 ├── .gitignore ├── half_cheetah_env.py ├── logger.py ├── main.py ├── model_based_policy.py ├── model_based_rl.py ├── plot.py ├── requirements.txt ├── run_all.sh ├── tabulate.py ├── timer.py └── utils.py ├── hw5 ├── exp │ ├── README.md │ ├── density_model.py │ ├── ex_utils.py │ ├── exploration.py │ ├── hw5a.pdf │ ├── logz.py │ ├── plot.py │ ├── pointmass.py │ ├── replay.py │ ├── requirements.txt │ ├── run_all.sh │ ├── sparse_half_cheetah.py │ └── train_ac_exploration_f18.py ├── meta │ ├── README.md │ ├── logz.py │ ├── plot.py │ ├── point_mass.py │ ├── point_mass_observed.py │ ├── replay_buffer.py │ ├── requirements.txt │ └── train_policy.py └── sac │ ├── README.md │ ├── environment.yml │ ├── logz.py │ ├── nn.py │ ├── plot.py │ ├── sac.py │ ├── train_mujoco.py │ └── utils.py └── project └── project_assignment.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 berkeleydeeprlcourse 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.5** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * OpenAI Gym version **0.10.5** 9 | 10 | Once Python **3.5** is installed, you can install the remaining dependencies using `pip install -r requirements.txt`. 11 | 12 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 13 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 14 | 15 | **Note**: Students enrolled in the course will receive an email with their MuJoCo activation key. Please do **not** share this key. 16 | 17 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 
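For example, after running the expert you can load the saved roll-out data for behavioral cloning. A minimal sketch (assumes `run_expert.py` has already been run for `Hopper-v2`, so that `expert_data/Hopper-v2.pkl` exists):

```python
import pickle

# Load the roll-out data saved by run_expert.py.
with open('expert_data/Hopper-v2.pkl', 'rb') as f:
    expert_data = pickle.load(f)

obs = expert_data['observations']   # stacked observations from all roll-outs
acts = expert_data['actions']       # corresponding expert actions (note the extra batch dimension)
print(obs.shape, acts.shape)
```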
18 | 19 | In `experts/`, the provided expert policies are: 20 | * Ant-v2.pkl 21 | * HalfCheetah-v2.pkl 22 | * Hopper-v2.pkl 23 | * Humanoid-v2.pkl 24 | * Reacher-v2.pkl 25 | * Walker2d-v2.pkl 26 | 27 | The name of the pickle file corresponds to the name of the gym environment. 28 | -------------------------------------------------------------------------------- /hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v2 Ant-v2 HalfCheetah-v2 Humanoid-v2 Reacher-v2 Walker2d-v2 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /hw1/experts/Ant-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Ant-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/HalfCheetah-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Hopper-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Humanoid-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Reacher-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Walker2d-v2.pkl -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. 
observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | mujoco-py==1.50.1.56 3 | tensorflow 4 | numpy 5 | seaborn 6 | -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 
5 | Example usage: 6 | python run_expert.py experts/Humanoid-v2.pkl Humanoid-v2 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import os 13 | import pickle 14 | import tensorflow as tf 15 | import numpy as np 16 | import tf_util 17 | import gym 18 | import load_policy 19 | 20 | def main(): 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('expert_policy_file', type=str) 24 | parser.add_argument('envname', type=str) 25 | parser.add_argument('--render', action='store_true') 26 | parser.add_argument("--max_timesteps", type=int) 27 | parser.add_argument('--num_rollouts', type=int, default=20, 28 | help='Number of expert roll outs') 29 | args = parser.parse_args() 30 | 31 | print('loading and building expert policy') 32 | policy_fn = load_policy.load_policy(args.expert_policy_file) 33 | print('loaded and built') 34 | 35 | with tf.Session(): 36 | tf_util.initialize() 37 | 38 | import gym 39 | env = gym.make(args.envname) 40 | max_steps = args.max_timesteps or env.spec.timestep_limit 41 | 42 | returns = [] 43 | observations = [] 44 | actions = [] 45 | for i in range(args.num_rollouts): 46 | print('iter', i) 47 | obs = env.reset() 48 | done = False 49 | totalr = 0. 50 | steps = 0 51 | while not done: 52 | action = policy_fn(obs[None,:]) 53 | observations.append(obs) 54 | actions.append(action) 55 | obs, r, done, _ = env.step(action) 56 | totalr += r 57 | steps += 1 58 | if args.render: 59 | env.render() 60 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 61 | if steps >= max_steps: 62 | break 63 | returns.append(totalr) 64 | 65 | print('returns', returns) 66 | print('mean return', np.mean(returns)) 67 | print('std of return', np.std(returns)) 68 | 69 | expert_data = {'observations': np.array(observations), 70 | 'actions': np.array(actions)} 71 | os.makedirs('expert_data', exist_ok=True) # make sure the output directory exists before saving 72 | with open(os.path.join('expert_data', args.envname + '.pkl'), 'wb') as f: 73 | pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL) 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /hw2/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 2: Policy Gradient 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.0** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * OpenAI Gym version **0.10.5** 9 | * seaborn 10 | * Box2D==**2.3.2** 11 | 12 | Before doing anything, first replace `gym/envs/box2d/lunar_lander.py` with the provided `lunar_lander.py` file. 13 | 14 | The only file that you need to look at is `train_pg_f18.py`, which you will implement. 15 | 16 | See the [HW2 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw2.pdf) for further instructions. 17 | -------------------------------------------------------------------------------- /hw2/hw2_instructions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw2/hw2_instructions.pdf -------------------------------------------------------------------------------- /hw2/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging.
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw2/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
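A concrete invocation combining both features might look like this (the directory and
legend names below are illustrative; each --value entry must match a column header in log.txt):

    python plot.py data/pg_small_HalfCheetah data/pg_large_HalfCheetah --legend small_batch large_batch --value AverageReturn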
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw2/requirements.txt: -------------------------------------------------------------------------------- 1 | mujoco-py==1.50.1.56 2 | gym==0.10.5 3 | tensorflow==1.10.0 4 | numpy==1.14.5 5 | seaborn 6 | Box2D==2.3.2 7 | -------------------------------------------------------------------------------- /hw3/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 3: Q-Learning 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.5** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * OpenAI Gym version **0.10.5** 9 | * seaborn 10 | * Box2D==**2.3.2** 11 | * OpenCV 12 | * ffmpeg 13 | 14 | Before doing anything, first replace `gym/envs/box2d/lunar_lander.py` with the provided `lunar_lander.py` file. 15 | 16 | The only files that you need to look at are `dqn.py` and `train_ac_f18.py`, which you will implement. 17 | 18 | See the [HW3 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw3.pdf) for further instructions. 19 | 20 | The starter code was based on an implementation of Q-learning for Atari generously provided by Szymon Sidor from OpenAI. 
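To locate the installed copy of `lunar_lander.py` that you need to replace, one option is a short Python snippet (a sketch; assumes `gym` and `Box2D` are already installed in the active environment):

```python
# Print the path of gym's bundled lunar_lander.py so it can be overwritten with the provided file.
import os
import gym.envs.box2d

print(os.path.join(os.path.dirname(gym.envs.box2d.__file__), 'lunar_lander.py'))
```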
21 | -------------------------------------------------------------------------------- /hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | #import sys 2 | #sys.path.remove('/opt/ros/kinetic/lib/python2.7/dist-packages') 3 | 4 | import cv2 5 | import numpy as np 6 | from collections import deque 7 | import gym 8 | from gym import spaces 9 | 10 | 11 | class NoopResetEnv(gym.Wrapper): 12 | def __init__(self, env=None, noop_max=30): 13 | """Sample initial states by taking random number of no-ops on reset. 14 | No-op is assumed to be action 0. 15 | """ 16 | super(NoopResetEnv, self).__init__(env) 17 | self.noop_max = noop_max 18 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 19 | 20 | def _reset(self): 21 | """ Do no-op action for a number of steps in [1, noop_max].""" 22 | self.env.reset() 23 | noops = np.random.randint(1, self.noop_max + 1) 24 | for _ in range(noops): 25 | obs, _, _, _ = self.env.step(0) 26 | return obs 27 | 28 | class FireResetEnv(gym.Wrapper): 29 | def __init__(self, env=None): 30 | """Take action on reset for environments that are fixed until firing.""" 31 | super(FireResetEnv, self).__init__(env) 32 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 33 | assert len(env.unwrapped.get_action_meanings()) >= 3 34 | 35 | def _reset(self): 36 | self.env.reset() 37 | obs, _, _, _ = self.env.step(1) 38 | obs, _, _, _ = self.env.step(2) 39 | return obs 40 | 41 | class EpisodicLifeEnv(gym.Wrapper): 42 | def __init__(self, env=None): 43 | """Make end-of-life == end-of-episode, but only reset on true game over. 44 | Done by DeepMind for the DQN and co. since it helps value estimation. 45 | """ 46 | super(EpisodicLifeEnv, self).__init__(env) 47 | self.lives = 0 48 | self.was_real_done = True 49 | self.was_real_reset = False 50 | 51 | def _step(self, action): 52 | obs, reward, done, info = self.env.step(action) 53 | self.was_real_done = done 54 | # check current lives, make loss of life terminal, 55 | # then update lives to handle bonus lives 56 | lives = self.env.unwrapped.ale.lives() 57 | if lives < self.lives and lives > 0: 58 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 59 | # so its important to keep lives > 0, so that we only reset once 60 | # the environment advertises done. 61 | done = True 62 | self.lives = lives 63 | return obs, reward, done, info 64 | 65 | def _reset(self): 66 | """Reset only when lives are exhausted. 67 | This way all states are still reachable even though lives are episodic, 68 | and the learner need not know about any of this behind-the-scenes. 
69 | """ 70 | if self.was_real_done: 71 | obs = self.env.reset() 72 | self.was_real_reset = True 73 | else: 74 | # no-op step to advance from terminal/lost life state 75 | obs, _, _, _ = self.env.step(0) 76 | self.was_real_reset = False 77 | self.lives = self.env.unwrapped.ale.lives() 78 | return obs 79 | 80 | class MaxAndSkipEnv(gym.Wrapper): 81 | def __init__(self, env=None, skip=4): 82 | """Return only every `skip`-th frame""" 83 | super(MaxAndSkipEnv, self).__init__(env) 84 | # most recent raw observations (for max pooling across time steps) 85 | self._obs_buffer = deque(maxlen=2) 86 | self._skip = skip 87 | 88 | def _step(self, action): 89 | total_reward = 0.0 90 | done = None 91 | for _ in range(self._skip): 92 | obs, reward, done, info = self.env.step(action) 93 | self._obs_buffer.append(obs) 94 | total_reward += reward 95 | if done: 96 | break 97 | 98 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 99 | 100 | return max_frame, total_reward, done, info 101 | 102 | def _reset(self): 103 | """Clear past frame buffer and init. to first obs. from inner env.""" 104 | self._obs_buffer.clear() 105 | obs = self.env.reset() 106 | self._obs_buffer.append(obs) 107 | return obs 108 | 109 | def _process_frame84(frame): 110 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 111 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 112 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 113 | x_t = resized_screen[18:102, :] 114 | x_t = np.reshape(x_t, [84, 84, 1]) 115 | return x_t.astype(np.uint8) 116 | 117 | class ProcessFrame84(gym.Wrapper): 118 | def __init__(self, env=None): 119 | super(ProcessFrame84, self).__init__(env) 120 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 121 | 122 | def _step(self, action): 123 | obs, reward, done, info = self.env.step(action) 124 | return _process_frame84(obs), reward, done, info 125 | 126 | def _reset(self): 127 | return _process_frame84(self.env.reset()) 128 | 129 | class ClippedRewardsWrapper(gym.Wrapper): 130 | def _step(self, action): 131 | obs, reward, done, info = self.env.step(action) 132 | return obs, np.sign(reward), done, info 133 | 134 | def wrap_deepmind_ram(env): 135 | env = EpisodicLifeEnv(env) 136 | env = NoopResetEnv(env, noop_max=30) 137 | env = MaxAndSkipEnv(env, skip=4) 138 | if 'FIRE' in env.unwrapped.get_action_meanings(): 139 | env = FireResetEnv(env) 140 | env = ClippedRewardsWrapper(env) 141 | return env 142 | 143 | def wrap_deepmind(env): 144 | assert 'NoFrameskip' in env.spec.id 145 | env = EpisodicLifeEnv(env) 146 | env = NoopResetEnv(env, noop_max=30) 147 | env = MaxAndSkipEnv(env, skip=4) 148 | if 'FIRE' in env.unwrapped.get_action_meanings(): 149 | env = FireResetEnv(env) 150 | env = ProcessFrame84(env) 151 | env = ClippedRewardsWrapper(env) 152 | return env 153 | -------------------------------------------------------------------------------- /hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import time 3 | import pickle 4 | import sys 5 | import gym.spaces 6 | import itertools 7 | import numpy as np 8 | import random 9 | import tensorflow as tf 10 | import tensorflow.contrib.layers as layers 11 | from collections import namedtuple 12 | from dqn_utils import * 13 | 14 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 15 | 16 | class QLearner(object): 17 | 18 | def __init__( 19 | self, 20 | env, 21 | q_func, 22 | optimizer_spec, 23 | 
session, 24 | exploration=LinearSchedule(1000000, 0.1), 25 | stopping_criterion=None, 26 | replay_buffer_size=1000000, 27 | batch_size=32, 28 | gamma=0.99, 29 | learning_starts=50000, 30 | learning_freq=4, 31 | frame_history_len=4, 32 | target_update_freq=10000, 33 | grad_norm_clipping=10, 34 | rew_file=None, 35 | double_q=True, 36 | lander=False): 37 | """Run Deep Q-learning algorithm. 38 | 39 | You can specify your own convnet using q_func. 40 | 41 | All schedules are w.r.t. total number of steps taken in the environment. 42 | 43 | Parameters 44 | ---------- 45 | env: gym.Env 46 | gym environment to train on. 47 | q_func: function 48 | Model to use for computing the q function. It should accept the 49 | following named arguments: 50 | img_in: tf.Tensor 51 | tensorflow tensor representing the input image 52 | num_actions: int 53 | number of actions 54 | scope: str 55 | scope in which all the model related variables 56 | should be created 57 | reuse: bool 58 | whether previously created variables should be reused. 59 | optimizer_spec: OptimizerSpec 60 | Specifying the constructor and kwargs, as well as learning rate schedule 61 | for the optimizer 62 | session: tf.Session 63 | tensorflow session to use. 64 | exploration: rl_algs.deepq.utils.schedules.Schedule 65 | schedule for probability of chosing random action. 66 | stopping_criterion: (env, t) -> bool 67 | should return true when it's ok for the RL algorithm to stop. 68 | takes in env and the number of steps executed so far. 69 | replay_buffer_size: int 70 | How many memories to store in the replay buffer. 71 | batch_size: int 72 | How many transitions to sample each time experience is replayed. 73 | gamma: float 74 | Discount Factor 75 | learning_starts: int 76 | After how many environment steps to start replaying experiences 77 | learning_freq: int 78 | How many steps of environment to take between every experience replay 79 | frame_history_len: int 80 | How many past frames to include as input to the model. 81 | target_update_freq: int 82 | How many experience replay rounds (not steps!) to perform between 83 | each update to the target Q network 84 | grad_norm_clipping: float or None 85 | If not None gradients' norms are clipped to this value. 86 | double_q: bool 87 | If True, then use double Q-learning to compute target values. Otherwise, use vanilla DQN. 88 | https://papers.nips.cc/paper/3964-double-q-learning.pdf 89 | """ 90 | assert type(env.observation_space) == gym.spaces.Box 91 | assert type(env.action_space) == gym.spaces.Discrete 92 | 93 | self.target_update_freq = target_update_freq 94 | self.optimizer_spec = optimizer_spec 95 | self.batch_size = batch_size 96 | self.learning_freq = learning_freq 97 | self.learning_starts = learning_starts 98 | self.stopping_criterion = stopping_criterion 99 | self.env = env 100 | self.session = session 101 | self.exploration = exploration 102 | self.rew_file = str(uuid.uuid4()) + '.pkl' if rew_file is None else rew_file 103 | 104 | ############### 105 | # BUILD MODEL # 106 | ############### 107 | 108 | if len(self.env.observation_space.shape) == 1: 109 | # This means we are running on low-dimensional observations (e.g. 
RAM) 110 | input_shape = self.env.observation_space.shape 111 | else: 112 | img_h, img_w, img_c = self.env.observation_space.shape 113 | input_shape = (img_h, img_w, frame_history_len * img_c) 114 | self.num_actions = self.env.action_space.n 115 | 116 | # set up placeholders 117 | # placeholder for current observation (or state) 118 | self.obs_t_ph = tf.placeholder( 119 | tf.float32 if lander else tf.uint8, [None] + list(input_shape)) 120 | # placeholder for current action 121 | self.act_t_ph = tf.placeholder(tf.int32, [None]) 122 | # placeholder for current reward 123 | self.rew_t_ph = tf.placeholder(tf.float32, [None]) 124 | # placeholder for next observation (or state) 125 | self.obs_tp1_ph = tf.placeholder( 126 | tf.float32 if lander else tf.uint8, [None] + list(input_shape)) 127 | # placeholder for end of episode mask 128 | # this value is 1 if the next state corresponds to the end of an episode, 129 | # in which case there is no Q-value at the next state; at the end of an 130 | # episode, only the current state reward contributes to the target, not the 131 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 132 | self.done_mask_ph = tf.placeholder(tf.float32, [None]) 133 | 134 | # casting to float on GPU ensures lower data transfer times. 135 | if lander: 136 | obs_t_float = self.obs_t_ph 137 | obs_tp1_float = self.obs_tp1_ph 138 | else: 139 | obs_t_float = tf.cast(self.obs_t_ph, tf.float32) / 255.0 140 | obs_tp1_float = tf.cast(self.obs_tp1_ph, tf.float32) / 255.0 141 | 142 | # Here, you should fill in your own code to compute the Bellman error. This requires 143 | # evaluating the current and next Q-values and constructing the corresponding error. 144 | # TensorFlow will differentiate this error for you, you just need to pass it to the 145 | # optimizer. See assignment text for details. 146 | # Your code should produce one scalar-valued tensor: total_error 147 | # This will be passed to the optimizer in the provided code below. 148 | # Your code should also produce two collections of variables: 149 | # q_func_vars 150 | # target_q_func_vars 151 | # These should hold all of the variables of the Q-function network and target network, 152 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 
153 | # For example, you can create your Q-function network with the scope "q_func" like this: 154 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 155 | # And then you can obtain the variables like this: 156 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 157 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 158 | # Tip: use huber_loss (from dqn_utils) instead of squared error when defining self.total_error 159 | ###### 160 | 161 | # YOUR CODE HERE 162 | 163 | ###### 164 | 165 | # construct optimization op (with gradient clipping) 166 | self.learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 167 | optimizer = self.optimizer_spec.constructor(learning_rate=self.learning_rate, **self.optimizer_spec.kwargs) 168 | self.train_fn = minimize_and_clip(optimizer, self.total_error, 169 | var_list=q_func_vars, clip_val=grad_norm_clipping) 170 | 171 | # update_target_fn will be called periodically to copy Q network to target Q network 172 | update_target_fn = [] 173 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 174 | sorted(target_q_func_vars, key=lambda v: v.name)): 175 | update_target_fn.append(var_target.assign(var)) 176 | self.update_target_fn = tf.group(*update_target_fn) 177 | 178 | # construct the replay buffer 179 | self.replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len, lander=lander) 180 | self.replay_buffer_idx = None 181 | 182 | ############### 183 | # RUN ENV # 184 | ############### 185 | self.model_initialized = False 186 | self.num_param_updates = 0 187 | self.mean_episode_reward = -float('nan') 188 | self.best_mean_episode_reward = -float('inf') 189 | self.last_obs = self.env.reset() 190 | self.log_every_n_steps = 10000 191 | 192 | self.start_time = None 193 | self.t = 0 194 | 195 | def stopping_criterion_met(self): 196 | return self.stopping_criterion is not None and self.stopping_criterion(self.env, self.t) 197 | 198 | def step_env(self): 199 | ### 2. Step the env and store the transition 200 | # At this point, "self.last_obs" contains the latest observation that was 201 | # recorded from the simulator. Here, your code needs to store this 202 | # observation and its outcome (reward, next observation, etc.) into 203 | # the replay buffer while stepping the simulator forward one step. 204 | # At the end of this block of code, the simulator should have been 205 | # advanced one step, and the replay buffer should contain one more 206 | # transition. 207 | # Specifically, self.last_obs must point to the new latest observation. 208 | # Useful functions you'll need to call: 209 | # obs, reward, done, info = env.step(action) 210 | # this steps the environment forward one step 211 | # obs = env.reset() 212 | # this resets the environment if you reached an episode boundary. 213 | # Don't forget to call env.reset() to get a new observation if done 214 | # is true!! 215 | # Note that you cannot use "self.last_obs" directly as input 216 | # into your network, since it needs to be processed to include context 217 | # from previous frames. You should check out the replay buffer 218 | # implementation in dqn_utils.py to see what functionality the replay 219 | # buffer exposes. 
The replay buffer has a function called 220 | # encode_recent_observation that will take the latest observation 221 | # that you pushed into the buffer and compute the corresponding 222 | # input that should be given to a Q network by appending some 223 | # previous frames. 224 | # Don't forget to include epsilon greedy exploration! 225 | # And remember that the first time you enter this loop, the model 226 | # may not yet have been initialized (but of course, the first step 227 | # might as well be random, since you haven't trained your net...) 228 | 229 | ##### 230 | 231 | # YOUR CODE HERE 232 | 233 | def update_model(self): 234 | ### 3. Perform experience replay and train the network. 235 | # note that this is only done if the replay buffer contains enough samples 236 | # for us to learn something useful -- until then, the model will not be 237 | # initialized and random actions should be taken 238 | if (self.t > self.learning_starts and \ 239 | self.t % self.learning_freq == 0 and \ 240 | self.replay_buffer.can_sample(self.batch_size)): 241 | # Here, you should perform training. Training consists of four steps: 242 | # 3.a: use the replay buffer to sample a batch of transitions (see the 243 | # replay buffer code for function definition, each batch that you sample 244 | # should consist of current observations, current actions, rewards, 245 | # next observations, and done indicator). 246 | # 3.b: initialize the model if it has not been initialized yet; to do 247 | # that, call 248 | # initialize_interdependent_variables(self.session, tf.global_variables(), { 249 | # self.obs_t_ph: obs_t_batch, 250 | # self.obs_tp1_ph: obs_tp1_batch, 251 | # }) 252 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 253 | # the current and next time step. The boolean variable model_initialized 254 | # indicates whether or not the model has been initialized. 255 | # Remember that you have to update the target network too (see 3.d)! 256 | # 3.c: train the model. To do this, you'll need to use the self.train_fn and 257 | # self.total_error ops that were created earlier: self.total_error is what you 258 | # created to compute the total Bellman error in a batch, and self.train_fn 259 | # will actually perform a gradient step and update the network parameters 260 | # to reduce total_error. 
When calling self.session.run on these you'll need to 261 | # populate the following placeholders: 262 | # self.obs_t_ph 263 | # self.act_t_ph 264 | # self.rew_t_ph 265 | # self.obs_tp1_ph 266 | # self.done_mask_ph 267 | # (this is needed for computing self.total_error) 268 | # self.learning_rate -- you can get this from self.optimizer_spec.lr_schedule.value(t) 269 | # (this is needed by the optimizer to choose the learning rate) 270 | # 3.d: periodically update the target network by calling 271 | # self.session.run(self.update_target_fn) 272 | # you should update every target_update_freq steps, and you may find the 273 | # variable self.num_param_updates useful for this (it was initialized to 0) 274 | ##### 275 | 276 | # YOUR CODE HERE 277 | 278 | self.num_param_updates += 1 279 | 280 | self.t += 1 281 | 282 | def log_progress(self): 283 | episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards() 284 | 285 | if len(episode_rewards) > 0: 286 | self.mean_episode_reward = np.mean(episode_rewards[-100:]) 287 | 288 | if len(episode_rewards) > 100: 289 | self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward) 290 | 291 | if self.t % self.log_every_n_steps == 0 and self.model_initialized: 292 | print("Timestep %d" % (self.t,)) 293 | print("mean reward (100 episodes) %f" % self.mean_episode_reward) 294 | print("best mean reward %f" % self.best_mean_episode_reward) 295 | print("episodes %d" % len(episode_rewards)) 296 | print("exploration %f" % self.exploration.value(self.t)) 297 | print("learning_rate %f" % self.optimizer_spec.lr_schedule.value(self.t)) 298 | if self.start_time is not None: 299 | print("running time %f" % ((time.time() - self.start_time) / 60.)) 300 | 301 | self.start_time = time.time() 302 | 303 | sys.stdout.flush() 304 | 305 | with open(self.rew_file, 'wb') as f: 306 | pickle.dump(episode_rewards, f, pickle.HIGHEST_PROTOCOL) 307 | 308 | def learn(*args, **kwargs): 309 | alg = QLearner(*args, **kwargs) 310 | while not alg.stopping_criterion_met(): 311 | alg.step_env() 312 | # at this point, the environment should have been advanced one step (and 313 | # reset if done was true), and self.last_obs should point to the new latest 314 | # observation 315 | alg.update_model() 316 | alg.log_progress() 317 | 318 | -------------------------------------------------------------------------------- /hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.where( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 
35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 
125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimize `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensuring the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happen if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or external precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(env, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len, lander=False): 176 | """This is a memory-efficient implementation of the replay buffer. 177 | 178 | The specific memory optimizations used here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (actually it is most time-efficient 182 | to cast them back to float32 on the GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the typical Atari Deep RL use case of a buffer with 1M frames, the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning a frame of zeros at the beginning 190 | of the episode, when there are fewer frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of memories to be retrieved for each observation.
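Example
-------
A minimal usage sketch (illustrative only; `frame` is an (84, 84, 1) uint8
array and `action`, `reward`, `done` come from stepping the wrapped env):

    buf = ReplayBuffer(size=100000, frame_history_len=4)
    idx = buf.store_frame(frame)
    q_input = buf.encode_recent_observation()  # (84, 84, 4); zero-padded early in an episode
    buf.store_effect(idx, action, reward, done)
    if buf.can_sample(32):
        obs_b, act_b, rew_b, next_obs_b, done_mask = buf.sample(32)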
200 | """ 201 | self.lander = lander 202 | 203 | self.size = size 204 | self.frame_history_len = frame_history_len 205 | 206 | self.next_idx = 0 207 | self.num_in_buffer = 0 208 | 209 | self.obs = None 210 | self.action = None 211 | self.reward = None 212 | self.done = None 213 | 214 | def can_sample(self, batch_size): 215 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 216 | return batch_size + 1 <= self.num_in_buffer 217 | 218 | def _encode_sample(self, idxes): 219 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 220 | act_batch = self.action[idxes] 221 | rew_batch = self.reward[idxes] 222 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 223 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 224 | 225 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 226 | 227 | 228 | def sample(self, batch_size): 229 | """Sample `batch_size` different transitions. 230 | 231 | i-th sample transition is the following: 232 | 233 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 234 | after which reward `rew_batch[i]` was received and subsequent 235 | observation next_obs_batch[i] was observed, unless the epsiode 236 | was done which is represented by `done_mask[i]` which is equal 237 | to 1 if episode has ended as a result of that action. 238 | 239 | Parameters 240 | ---------- 241 | batch_size: int 242 | How many transitions to sample. 243 | 244 | Returns 245 | ------- 246 | obs_batch: np.array 247 | Array of shape 248 | (batch_size, img_h, img_w, img_c * frame_history_len) 249 | and dtype np.uint8 250 | act_batch: np.array 251 | Array of shape (batch_size,) and dtype np.int32 252 | rew_batch: np.array 253 | Array of shape (batch_size,) and dtype np.float32 254 | next_obs_batch: np.array 255 | Array of shape 256 | (batch_size, img_h, img_w, img_c * frame_history_len) 257 | and dtype np.uint8 258 | done_mask: np.array 259 | Array of shape (batch_size,) and dtype np.float32 260 | """ 261 | assert self.can_sample(batch_size) 262 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 263 | return self._encode_sample(idxes) 264 | 265 | def encode_recent_observation(self): 266 | """Return the most recent `frame_history_len` frames. 267 | 268 | Returns 269 | ------- 270 | observation: np.array 271 | Array of shape (img_h, img_w, img_c * frame_history_len) 272 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 273 | encodes frame at time `t - frame_history_len + i` 274 | """ 275 | assert self.num_in_buffer > 0 276 | return self._encode_observation((self.next_idx - 1) % self.size) 277 | 278 | def _encode_observation(self, idx): 279 | end_idx = idx + 1 # make noninclusive 280 | start_idx = end_idx - self.frame_history_len 281 | # this checks if we are using low-dimensional observations, such as RAM 282 | # state, in which case we just directly return the latest RAM. 
283 | if len(self.obs.shape) == 2: 284 | return self.obs[end_idx-1] 285 | # if there weren't enough frames ever in the buffer for context 286 | if start_idx < 0 and self.num_in_buffer != self.size: 287 | start_idx = 0 288 | for idx in range(start_idx, end_idx - 1): 289 | if self.done[idx % self.size]: 290 | start_idx = idx + 1 291 | missing_context = self.frame_history_len - (end_idx - start_idx) 292 | # if zero padding is needed for missing context 293 | # or we are on the boundry of the buffer 294 | if start_idx < 0 or missing_context > 0: 295 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 296 | for idx in range(start_idx, end_idx): 297 | frames.append(self.obs[idx % self.size]) 298 | return np.concatenate(frames, 2) 299 | else: 300 | # this optimization has potential to saves about 30% compute time \o/ 301 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 302 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 303 | 304 | def store_frame(self, frame): 305 | """Store a single frame in the buffer at the next available index, overwriting 306 | old frames if necessary. 307 | 308 | Parameters 309 | ---------- 310 | frame: np.array 311 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 312 | the frame to be stored 313 | 314 | Returns 315 | ------- 316 | idx: int 317 | Index at which the frame is stored. To be used for `store_effect` later. 318 | """ 319 | if self.obs is None: 320 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.float32 if self.lander else np.uint8) 321 | self.action = np.empty([self.size], dtype=np.int32) 322 | self.reward = np.empty([self.size], dtype=np.float32) 323 | self.done = np.empty([self.size], dtype=np.bool) 324 | self.obs[self.next_idx] = frame 325 | 326 | ret = self.next_idx 327 | self.next_idx = (self.next_idx + 1) % self.size 328 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 329 | 330 | return ret 331 | 332 | def store_effect(self, idx, action, reward, done): 333 | """Store effects of action taken after obeserving frame stored 334 | at index idx. The reason `store_frame` and `store_effect` is broken 335 | up into two functions is so that once can call `encode_recent_observation` 336 | in between. 337 | 338 | Paramters 339 | --------- 340 | idx: int 341 | Index in buffer of recently observed frame (returned by `store_frame`). 342 | action: int 343 | Action that was performed upon observing this frame. 344 | reward: float 345 | Reward that was received when the actions was performed. 346 | done: bool 347 | True if episode was finished after performing that action. 348 | """ 349 | self.action[idx] = action 350 | self.reward[idx] = reward 351 | self.done[idx] = done 352 | 353 | -------------------------------------------------------------------------------- /hw3/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key
70 | G.log_current_row[key] = val
71 |
72 | def save_params(params):
73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out:
74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True))
75 |
76 | def pickle_tf_vars():
77 | """
78 | Saves TensorFlow variables
79 | Requires them to be initialized first; a default session must also exist
80 | """
81 | _dict = {v.name : v.eval() for v in tf.global_variables()}
82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f:
83 | pickle.dump(_dict, f)
84 |
85 |
86 | def dump_tabular():
87 | """
88 | Write all of the diagnostics from the current iteration
89 | """
90 | vals = []
91 | key_lens = [len(key) for key in G.log_headers]
92 | max_key_len = max(15,max(key_lens))
93 | keystr = '%'+'%d'%max_key_len
94 | fmt = "| " + keystr + "s | %15s |"
95 | n_slashes = 22 + max_key_len
96 | print("-"*n_slashes)
97 | for key in G.log_headers:
98 | val = G.log_current_row.get(key, "")
99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val
100 | else: valstr = val
101 | print(fmt%(key, valstr))
102 | vals.append(val)
103 | print("-"*n_slashes)
104 | if G.output_file is not None:
105 | if G.first_row:
106 | G.output_file.write("\t".join(G.log_headers))
107 | G.output_file.write("\n")
108 | G.output_file.write("\t".join(map(str,vals)))
109 | G.output_file.write("\n")
110 | G.output_file.flush()
111 | G.log_current_row.clear()
112 | G.first_row=False
113 |
-------------------------------------------------------------------------------- /hw3/plot.py: --------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import json
5 | import os
6 |
7 | """
8 | Using the plotter:
9 |
10 | Call it from the command line, and supply it with logdirs to experiments.
11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10
12 | random seeds. The runner code stored it in the directory structure
13 |
14 | data
15 | L test_EnvName_DateTime
16 | L 0
17 | L log.txt
18 | L params.json
19 | L 1
20 | L log.txt
21 | L params.json
22 | .
23 | .
24 | .
25 | L 9
26 | L log.txt
27 | L params.json
28 |
29 | To plot learning curves from the experiment, averaged over all random
30 | seeds, call
31 |
32 | python plot.py data/test_EnvName_DateTime --value AverageReturn
33 |
34 | and voila. To see different statistics, change what you put in for
35 | the keyword --value. You can also enter /multiple/ values, and it will
36 | plot all of them in order.
37 |
38 |
39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
40 | a different set of hyperparameters from 'test1', and now you would like
41 | to compare them -- see their learning curves side-by-side. Just call
42 |
43 | python plot.py data/test1 data/test2
44 |
45 | and it will plot them both! They will be given titles in the legend according
46 | to their exp_name parameters. If you want to use custom legend titles, use
47 | the --legend flag and then provide a title for each logdir.
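For example (the directory names and legend titles below are placeholders --
substitute whatever your own runs are called):

    python plot.py data/test1_EnvName_DateTime data/test2_EnvName_DateTime \
        --legend run1 run2 --value AverageReturn

plots the AverageReturn curves of both experiments in one figure, with the
curves labeled 'run1' and 'run2'. When passing --legend, provide exactly one
title per logdir.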
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw3/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | gym[atari] 3 | box2d 4 | mujoco-py==1.50.1.56 5 | tensorflow 6 | numpy 7 | seaborn 8 | opencv-python 9 | -------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, 
activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env=env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10, 78 | double_q=True 79 | ) 80 | env.close() 81 | 82 | def get_available_gpus(): 83 | from tensorflow.python.client import device_lib 84 | local_device_protos = device_lib.list_local_devices() 85 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 86 | 87 | def set_global_seeds(i): 88 | try: 89 | import tensorflow as tf 90 | except ImportError: 91 | pass 92 | else: 93 | tf.set_random_seed(i) 94 | np.random.seed(i) 95 | random.seed(i) 96 | 97 | def get_session(): 98 | tf.reset_default_graph() 99 | tf_config = tf.ConfigProto( 100 | inter_op_parallelism_threads=1, 101 | intra_op_parallelism_threads=1) 102 | session = tf.Session(config=tf_config) 103 | print("AVAILABLE GPUS: ", get_available_gpus()) 104 | return session 105 | 106 | def get_env(task, seed): 107 | env = gym.make('PongNoFrameskip-v4') 108 | 109 | set_global_seeds(seed) 110 | env.seed(seed) 111 | 112 | expt_dir = '/tmp/hw3_vid_dir2/' 113 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 114 | env = wrap_deepmind(env) 115 | 116 | return env 117 | 118 | def main(): 119 | # Get Atari games. 
120 | task = gym.make('PongNoFrameskip-v4') 121 | 122 | # Run training 123 | seed = random.randint(0, 9999) 124 | print('random seed = %d' % seed) 125 | env = get_env(task, seed) 126 | session = get_session() 127 | atari_learn(env, session, num_timesteps=2e8) 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /hw3/run_dqn_lander.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | 13 | def lander_model(obs, num_actions, scope, reuse=False): 14 | with tf.variable_scope(scope, reuse=reuse): 15 | out = obs 16 | with tf.variable_scope("action_value"): 17 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 18 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 19 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 20 | 21 | return out 22 | 23 | def lander_optimizer(): 24 | return dqn.OptimizerSpec( 25 | constructor=tf.train.AdamOptimizer, 26 | lr_schedule=ConstantSchedule(1e-3), 27 | kwargs={} 28 | ) 29 | 30 | def lander_stopping_criterion(num_timesteps): 31 | def stopping_criterion(env, t): 32 | # notice that here t is the number of steps of the wrapped env, 33 | # which is different from the number of steps in the underlying env 34 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 35 | return stopping_criterion 36 | 37 | def lander_exploration_schedule(num_timesteps): 38 | return PiecewiseSchedule( 39 | [ 40 | (0, 1), 41 | (num_timesteps * 0.1, 0.02), 42 | ], outside_value=0.02 43 | ) 44 | 45 | def lander_kwargs(): 46 | return { 47 | 'optimizer_spec': lander_optimizer(), 48 | 'q_func': lander_model, 49 | 'replay_buffer_size': 50000, 50 | 'batch_size': 32, 51 | 'gamma': 1.00, 52 | 'learning_starts': 1000, 53 | 'learning_freq': 1, 54 | 'frame_history_len': 1, 55 | 'target_update_freq': 3000, 56 | 'grad_norm_clipping': 10, 57 | 'lander': True 58 | } 59 | 60 | def lander_learn(env, 61 | session, 62 | num_timesteps, 63 | seed): 64 | 65 | optimizer = lander_optimizer() 66 | stopping_criterion = lander_stopping_criterion(num_timesteps) 67 | exploration_schedule = lander_exploration_schedule(num_timesteps) 68 | 69 | dqn.learn( 70 | env=env, 71 | session=session, 72 | exploration=lander_exploration_schedule(num_timesteps), 73 | stopping_criterion=lander_stopping_criterion(num_timesteps), 74 | double_q=True, 75 | **lander_kwargs() 76 | ) 77 | env.close() 78 | 79 | def set_global_seeds(i): 80 | tf.set_random_seed(i) 81 | np.random.seed(i) 82 | random.seed(i) 83 | 84 | def get_session(): 85 | tf.reset_default_graph() 86 | tf_config = tf.ConfigProto( 87 | inter_op_parallelism_threads=1, 88 | intra_op_parallelism_threads=1, 89 | device_count={'GPU': 0}) 90 | # GPUs don't significantly speed up deep Q-learning for lunar lander, 91 | # since the observations are low-dimensional 92 | session = tf.Session(config=tf_config) 93 | return session 94 | 95 | def get_env(seed): 96 | env = gym.make('LunarLander-v2') 97 | 98 | set_global_seeds(seed) 99 | env.seed(seed) 100 | 101 | expt_dir = '/tmp/hw3_vid_dir/' 102 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 103 | 104 | return env 105 | 106 | def main(): 107 | # 
Run training 108 | seed = 4565 # you may want to randomize this 109 | print('random seed = %d' % seed) 110 | env = get_env(seed) 111 | session = get_session() 112 | set_global_seeds(seed) 113 | lander_learn(env, session, num_timesteps=500000, seed=seed) 114 | 115 | if __name__ == "__main__": 116 | main() 117 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return 
session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw4/.gitignore: -------------------------------------------------------------------------------- 1 | plots/ 2 | data/ 3 | -------------------------------------------------------------------------------- /hw4/half_cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, action): 12 | xposbefore = self.sim.data.qpos[0] 13 | self.do_simulation(action, self.frame_skip) 14 | xposafter = self.sim.data.qpos[0] 15 | ob = self._get_obs() 16 | reward_ctrl = - 0.1 * np.square(action).sum() 17 | reward_run = (xposafter - xposbefore)/self.dt 18 | reward = reward_ctrl + reward_run 19 | done = False 20 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 21 | 22 | def _get_obs(self): 23 | return np.concatenate([ 24 | self.sim.data.qpos.flat[1:], 25 | self.sim.data.qvel.flat, 26 | self.get_body_com("torso").flat, 27 | # self.get_body_comvel("torso").flat, 28 | ]) 29 | 30 | def reset_model(self): 31 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 32 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 33 | self.set_state(qpos, qvel) 34 | return self._get_obs() 35 | 36 | def viewer_setup(self): 37 | self.viewer.cam.distance = self.model.stat.extent * 0.5 38 | 39 | @staticmethod 40 | def cost_fn(states, actions, next_states): 41 | is_tf = tf.contrib.framework.is_tensor(states) 42 | is_single_state = (len(states.get_shape()) == 1) if is_tf else (len(states.shape) == 1) 43 | 44 | if is_single_state: 45 | states = states[None, ...] 46 | actions = actions[None, ...] 47 | next_states = next_states[None, ...] 
48 | 49 | scores = tf.zeros(actions.get_shape()[0].value) if is_tf else np.zeros(actions.shape[0]) 50 | 51 | heading_penalty_factor = 10 52 | 53 | # dont move front shin back so far that you tilt forward 54 | front_leg = states[:, 5] 55 | my_range = 0.2 56 | if is_tf: 57 | scores += tf.cast(front_leg >= my_range, tf.float32) * heading_penalty_factor 58 | else: 59 | scores += (front_leg >= my_range) * heading_penalty_factor 60 | 61 | front_shin = states[:, 6] 62 | my_range = 0 63 | if is_tf: 64 | scores += tf.cast(front_shin >= my_range, tf.float32) * heading_penalty_factor 65 | else: 66 | scores += (front_shin >= my_range) * heading_penalty_factor 67 | 68 | front_foot = states[:, 7] 69 | my_range = 0 70 | if is_tf: 71 | scores += tf.cast(front_foot >= my_range, tf.float32) * heading_penalty_factor 72 | else: 73 | scores += (front_foot >= my_range) * heading_penalty_factor 74 | 75 | scores -= (next_states[:, 17] - states[:, 17]) / 0.01 76 | 77 | if is_single_state: 78 | scores = scores[0] 79 | 80 | return scores 81 | -------------------------------------------------------------------------------- /hw4/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | import logging 4 | from colorlog import ColoredFormatter 5 | 6 | import pandas 7 | import numpy as np 8 | 9 | from tabulate import tabulate 10 | 11 | 12 | class LoggerClass(object): 13 | GLOBAL_LOGGER_NAME = '_global_logger' 14 | 15 | _color_formatter = ColoredFormatter( 16 | "%(asctime)s %(log_color)s%(name)-10s %(levelname)-8s%(reset)s %(white)s%(message)s", 17 | datefmt='%m-%d %H:%M:%S', 18 | reset=True, 19 | log_colors={ 20 | 'DEBUG': 'cyan', 21 | 'INFO': 'green', 22 | 'WARNING': 'yellow', 23 | 'ERROR': 'red', 24 | 'CRITICAL': 'red,bg_white', 25 | }, 26 | secondary_log_colors={}, 27 | style='%' 28 | ) 29 | 30 | _normal_formatter = logging.Formatter( 31 | '%(asctime)s %(name)-10s %(levelname)-8s %(message)s', 32 | datefmt='%m-%d %H:%M:%S', 33 | style='%' 34 | ) 35 | 36 | def __init__(self): 37 | self._dir = None 38 | self._logger = None 39 | self._log_path = None 40 | self._csv_path = None 41 | self._tabular = defaultdict(list) 42 | self._curr_recorded = list() 43 | self._num_dump_tabular_calls = 0 44 | 45 | @property 46 | def dir(self): 47 | return self._dir 48 | 49 | ############# 50 | ### Setup ### 51 | ############# 52 | 53 | def setup(self, display_name, log_path, lvl): 54 | self._dir = os.path.dirname(log_path) 55 | self._logger = self._get_logger(LoggerClass.GLOBAL_LOGGER_NAME, 56 | log_path, 57 | lvl=lvl, 58 | display_name=display_name) 59 | self._csv_path = os.path.splitext(log_path)[0] + '.csv' 60 | 61 | ### load csv if exists 62 | if os.path.exists(self._csv_path): 63 | self._tabular = {k: list(v) for k, v in pandas.read_csv(self._csv_path).items()} 64 | self._num_dump_tabular_calls = len(tuple(self._tabular.values())[0]) 65 | 66 | def _get_logger(self, name, log_path, lvl=logging.INFO, display_name=None): 67 | if isinstance(lvl, str): 68 | lvl = lvl.lower().strip() 69 | if lvl == 'debug': 70 | lvl = logging.DEBUG 71 | elif lvl == 'info': 72 | lvl = logging.INFO 73 | elif lvl == 'warn' or lvl == 'warning': 74 | lvl = logging.WARN 75 | elif lvl == 'error': 76 | lvl = logging.ERROR 77 | elif lvl == 'fatal' or lvl == 'critical': 78 | lvl = logging.CRITICAL 79 | else: 80 | raise ValueError('unknown logging level') 81 | 82 | file_handler = logging.FileHandler(log_path) 83 | file_handler.setLevel(logging.DEBUG) 84 | 
file_handler.setFormatter(LoggerClass._normal_formatter) 85 | console_handler = logging.StreamHandler() 86 | console_handler.setLevel(lvl) 87 | console_handler.setFormatter(LoggerClass._color_formatter) 88 | if display_name is None: 89 | display_name = name 90 | logger = logging.getLogger(display_name) 91 | logger.setLevel(logging.DEBUG) 92 | logger.addHandler(console_handler) 93 | logger.addHandler(file_handler) 94 | 95 | return logger 96 | 97 | ############### 98 | ### Logging ### 99 | ############### 100 | 101 | def debug(self, s): 102 | assert (self._logger is not None) 103 | self._logger.debug(s) 104 | 105 | def info(self, s): 106 | assert (self._logger is not None) 107 | self._logger.info(s) 108 | 109 | def warn(self, s): 110 | assert (self._logger is not None) 111 | self._logger.warn(s) 112 | 113 | def error(self, s): 114 | assert (self._logger is not None) 115 | self._logger.error(s) 116 | 117 | def critical(self, s): 118 | assert (self._logger is not None) 119 | self._logger.critical(s) 120 | 121 | #################### 122 | ### Data logging ### 123 | #################### 124 | 125 | def record_tabular(self, key, val): 126 | assert (str(key) not in self._curr_recorded) 127 | self._curr_recorded.append(str(key)) 128 | 129 | if key in self._tabular: 130 | self._tabular[key].append(val) 131 | else: 132 | self._tabular[key] = [np.nan] * self._num_dump_tabular_calls + [val] 133 | 134 | def dump_tabular(self, print_func=None): 135 | if len(self._curr_recorded) == 0: 136 | return '' 137 | 138 | ### reset 139 | self._curr_recorded = list() 140 | self._num_dump_tabular_calls += 1 141 | 142 | ### make sure all same length 143 | for k, v in self._tabular.items(): 144 | if len(v) == self._num_dump_tabular_calls: 145 | pass 146 | elif len(v) == self._num_dump_tabular_calls - 1: 147 | self._tabular[k].append(np.nan) 148 | else: 149 | raise ValueError('key {0} should not have {1} items when {2} calls have been made'.format( 150 | k, len(v), self._num_dump_tabular_calls)) 151 | 152 | ### print 153 | if print_func is not None: 154 | log_str = tabulate(sorted([(k, v[-1]) for k, v in self._tabular.items()], key=lambda kv: kv[0])) 155 | for line in log_str.split('\n'): 156 | print_func(line) 157 | 158 | ### write to file 159 | tabular_pandas = pandas.DataFrame({k: pandas.Series(v) for k, v in self._tabular.items()}) 160 | tabular_pandas.to_csv(self._csv_path) 161 | 162 | 163 | logger = LoggerClass() 164 | -------------------------------------------------------------------------------- /hw4/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import time 4 | 5 | from half_cheetah_env import HalfCheetahEnv 6 | from logger import logger 7 | from model_based_rl import ModelBasedRL 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('question', type=str, choices=('q1, q2, q3')) 11 | parser.add_argument('--exp_name', type=str, default=None) 12 | parser.add_argument('--env', type=str, default='HalfCheetah', choices=('HalfCheetah',)) 13 | parser.add_argument('--render', action='store_true') 14 | parser.add_argument('--mpc_horizon', type=int, default=15) 15 | parser.add_argument('--num_random_action_selection', type=int, default=4096) 16 | parser.add_argument('--nn_layers', type=int, default=1) 17 | args = parser.parse_args() 18 | 19 | data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data') 20 | exp_name = '{0}_{1}_{2}'.format(args.env, 21 | args.question, 22 | args.exp_name if args.exp_name else 
time.strftime("%d-%m-%Y_%H-%M-%S")) 23 | exp_dir = os.path.join(data_dir, exp_name) 24 | assert not os.path.exists(exp_dir),\ 25 | 'Experiment directory {0} already exists. Either delete the directory, or run the experiment with a different name'.format(exp_dir) 26 | os.makedirs(exp_dir, exist_ok=True) 27 | logger.setup(exp_name, os.path.join(exp_dir, 'log.txt'), 'debug') 28 | 29 | env = { 30 | 'HalfCheetah': HalfCheetahEnv() 31 | }[args.env] 32 | 33 | mbrl = ModelBasedRL(env=env, 34 | render=args.render, 35 | mpc_horizon=args.mpc_horizon, 36 | num_random_action_selection=args.num_random_action_selection, 37 | nn_layers=args.nn_layers) 38 | 39 | run_func = { 40 | 'q1': mbrl.run_q1, 41 | 'q2': mbrl.run_q2, 42 | 'q3': mbrl.run_q3 43 | }[args.question] 44 | run_func() 45 | -------------------------------------------------------------------------------- /hw4/model_based_policy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | import utils 5 | 6 | 7 | class ModelBasedPolicy(object): 8 | 9 | def __init__(self, 10 | env, 11 | init_dataset, 12 | horizon=15, 13 | num_random_action_selection=4096, 14 | nn_layers=1): 15 | self._cost_fn = env.cost_fn 16 | self._state_dim = env.observation_space.shape[0] 17 | self._action_dim = env.action_space.shape[0] 18 | self._action_space_low = env.action_space.low 19 | self._action_space_high = env.action_space.high 20 | self._init_dataset = init_dataset 21 | self._horizon = horizon 22 | self._num_random_action_selection = num_random_action_selection 23 | self._nn_layers = nn_layers 24 | self._learning_rate = 1e-3 25 | 26 | self._sess, self._state_ph, self._action_ph, self._next_state_ph,\ 27 | self._next_state_pred, self._loss, self._optimizer, self._best_action = self._setup_graph() 28 | 29 | def _setup_placeholders(self): 30 | """ 31 | Creates the placeholders used for training, prediction, and action selection 32 | 33 | returns: 34 | state_ph: current state 35 | action_ph: current_action 36 | next_state_ph: next state 37 | 38 | implementation details: 39 | (a) the placeholders should have 2 dimensions, 40 | in which the 1st dimension is variable length (i.e., None) 41 | """ 42 | ### PROBLEM 1 43 | ### YOUR CODE HERE 44 | raise NotImplementedError 45 | 46 | return state_ph, action_ph, next_state_ph 47 | 48 | def _dynamics_func(self, state, action, reuse): 49 | """ 50 | Takes as input a state and action, and predicts the next state 51 | 52 | returns: 53 | next_state_pred: predicted next state 54 | 55 | implementation details (in order): 56 | (a) Normalize both the state and action by using the statistics of self._init_dataset and 57 | the utils.normalize function 58 | (b) Concatenate the normalized state and action 59 | (c) Pass the concatenated, normalized state-action tensor through a neural network with 60 | self._nn_layers number of layers using the function utils.build_mlp. 
The resulting output 61 | is the normalized predicted difference between the next state and the current state 62 | (d) Unnormalize the delta state prediction, and add it to the current state in order to produce 63 | the predicted next state 64 | 65 | """ 66 | ### PROBLEM 1 67 | ### YOUR CODE HERE 68 | raise NotImplementedError 69 | 70 | return next_state_pred 71 | 72 | def _setup_training(self, state_ph, next_state_ph, next_state_pred): 73 | """ 74 | Takes as input the current state, next state, and predicted next state, and returns 75 | the loss and optimizer for training the dynamics model 76 | 77 | returns: 78 | loss: Scalar loss tensor 79 | optimizer: Operation used to perform gradient descent 80 | 81 | implementation details (in order): 82 | (a) Compute both the actual state difference and the predicted state difference 83 | (b) Normalize both of these state differences by using the statistics of self._init_dataset and 84 | the utils.normalize function 85 | (c) The loss function is the mean-squared-error between the normalized state difference and 86 | normalized predicted state difference 87 | (d) Create the optimizer by minimizing the loss using the Adam optimizer with self._learning_rate 88 | 89 | """ 90 | ### PROBLEM 1 91 | ### YOUR CODE HERE 92 | raise NotImplementedError 93 | 94 | return loss, optimizer 95 | 96 | def _setup_action_selection(self, state_ph): 97 | """ 98 | Computes the best action from the current state by using randomly sampled action sequences 99 | to predict future states, evaluating these predictions according to a cost function, 100 | selecting the action sequence with the lowest cost, and returning the first action in that sequence 101 | 102 | returns: 103 | best_action: the action that minimizes the cost function (tensor with shape [self._action_dim]) 104 | 105 | implementation details (in order): 106 | (a) We will assume state_ph has a batch size of 1 whenever action selection is performed 107 | (b) Randomly sample uniformly self._num_random_action_selection number of action sequences, 108 | each of length self._horizon 109 | (c) Starting from the input state, unroll each action sequence using your neural network 110 | dynamics model 111 | (d) While unrolling the action sequences, keep track of the cost of each action sequence 112 | using self._cost_fn 113 | (e) Find the action sequence with the lowest cost, and return the first action in that sequence 114 | 115 | Hints: 116 | (i) self._cost_fn takes three arguments: states, actions, and next states. These arguments are 117 | 2-dimensional tensors, where the 1st dimension is the batch size and the 2nd dimension is the 118 | state or action size 119 | (ii) You should call self._dynamics_func and self._cost_fn a total of self._horizon times 120 | (iii) Use tf.random_uniform(...) 
to generate the random action sequences 121 | 122 | """ 123 | ### PROBLEM 2 124 | ### YOUR CODE HERE 125 | raise NotImplementedError 126 | 127 | return best_action 128 | 129 | def _setup_graph(self): 130 | """ 131 | Sets up the tensorflow computation graph for training, prediction, and action selection 132 | 133 | The variables returned will be set as class attributes (see __init__) 134 | """ 135 | sess = tf.Session() 136 | 137 | ### PROBLEM 1 138 | ### YOUR CODE HERE 139 | raise NotImplementedError 140 | ### PROBLEM 2 141 | ### YOUR CODE HERE 142 | best_action = None 143 | 144 | sess.run(tf.global_variables_initializer()) 145 | 146 | return sess, state_ph, action_ph, next_state_ph, \ 147 | next_state_pred, loss, optimizer, best_action 148 | 149 | def train_step(self, states, actions, next_states): 150 | """ 151 | Performs one step of gradient descent 152 | 153 | returns: 154 | loss: the loss from performing gradient descent 155 | """ 156 | ### PROBLEM 1 157 | ### YOUR CODE HERE 158 | raise NotImplementedError 159 | 160 | return loss 161 | 162 | def predict(self, state, action): 163 | """ 164 | Predicts the next state given the current state and action 165 | 166 | returns: 167 | next_state_pred: predicted next state 168 | 169 | implementation detils: 170 | (i) The state and action arguments are 1-dimensional vectors (NO batch dimension) 171 | """ 172 | assert np.shape(state) == (self._state_dim,) 173 | assert np.shape(action) == (self._action_dim,) 174 | 175 | ### PROBLEM 1 176 | ### YOUR CODE HERE 177 | raise NotImplementedError 178 | 179 | assert np.shape(next_state_pred) == (self._state_dim,) 180 | return next_state_pred 181 | 182 | def get_action(self, state): 183 | """ 184 | Computes the action that minimizes the cost function given the current state 185 | 186 | returns: 187 | best_action: the best action 188 | """ 189 | assert np.shape(state) == (self._state_dim,) 190 | 191 | ### PROBLEM 2 192 | ### YOUR CODE HERE 193 | raise NotImplementedError 194 | 195 | assert np.shape(best_action) == (self._action_dim,) 196 | return best_action 197 | -------------------------------------------------------------------------------- /hw4/model_based_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from model_based_policy import ModelBasedPolicy 7 | import utils 8 | from logger import logger 9 | from timer import timeit 10 | 11 | 12 | class ModelBasedRL(object): 13 | 14 | def __init__(self, 15 | env, 16 | num_init_random_rollouts=10, 17 | max_rollout_length=500, 18 | num_onplicy_iters=10, 19 | num_onpolicy_rollouts=10, 20 | training_epochs=60, 21 | training_batch_size=512, 22 | render=False, 23 | mpc_horizon=15, 24 | num_random_action_selection=4096, 25 | nn_layers=1): 26 | self._env = env 27 | self._max_rollout_length = max_rollout_length 28 | self._num_onpolicy_iters = num_onplicy_iters 29 | self._num_onpolicy_rollouts = num_onpolicy_rollouts 30 | self._training_epochs = training_epochs 31 | self._training_batch_size = training_batch_size 32 | self._render = render 33 | 34 | logger.info('Gathering random dataset') 35 | self._random_dataset = self._gather_rollouts(utils.RandomPolicy(env), 36 | num_init_random_rollouts) 37 | 38 | logger.info('Creating policy') 39 | self._policy = ModelBasedPolicy(env, 40 | self._random_dataset, 41 | horizon=mpc_horizon, 42 | num_random_action_selection=num_random_action_selection) 43 | 44 | timeit.reset() 45 | timeit.start('total') 46 | 47 | 
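# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original assignment scaffold, and not
# the reference solution): the ModelBasedPolicy._dynamics_func docstring above
# describes a normalize -> concatenate -> MLP -> unnormalize-delta pipeline.
# The standalone function below shows one way that recipe could look, assuming
# the helpers in hw4/utils.py (build_mlp, normalize, unnormalize) and a Dataset
# exposing the state/action/delta statistics defined there.
# ---------------------------------------------------------------------------
import tensorflow as tf
import utils

def dynamics_func_sketch(state, action, init_dataset, n_layers, reuse=False):
    # (a) normalize the state and action with the initial dataset's statistics
    state_norm = utils.normalize(state, init_dataset.state_mean, init_dataset.state_std)
    action_norm = utils.normalize(action, init_dataset.action_mean, init_dataset.action_std)
    # (b) concatenate the normalized state and action along the feature axis
    inputs = tf.concat([state_norm, action_norm], axis=1)
    # (c) predict the *normalized* state difference with an MLP
    delta_norm = utils.build_mlp(inputs,
                                 output_dim=state.get_shape()[1].value,
                                 scope='dynamics',
                                 n_layers=n_layers,
                                 reuse=reuse)
    # (d) unnormalize the predicted difference and add it to the current state
    delta = utils.unnormalize(delta_norm,
                              init_dataset.delta_state_mean,
                              init_dataset.delta_state_std)
    return state + delta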
def _gather_rollouts(self, policy, num_rollouts): 48 | dataset = utils.Dataset() 49 | 50 | for _ in range(num_rollouts): 51 | state = self._env.reset() 52 | done = False 53 | t = 0 54 | while not done: 55 | if self._render: 56 | timeit.start('render') 57 | self._env.render() 58 | timeit.stop('render') 59 | timeit.start('get action') 60 | action = policy.get_action(state) 61 | timeit.stop('get action') 62 | timeit.start('env step') 63 | next_state, reward, done, _ = self._env.step(action) 64 | timeit.stop('env step') 65 | done = done or (t >= self._max_rollout_length) 66 | dataset.add(state, action, next_state, reward, done) 67 | 68 | state = next_state 69 | t += 1 70 | 71 | return dataset 72 | 73 | def _train_policy(self, dataset): 74 | """ 75 | Train the model-based policy 76 | 77 | implementation details: 78 | (a) Train for self._training_epochs number of epochs 79 | (b) The dataset.random_iterator(...) method will iterate through the dataset once in a random order 80 | (c) Use self._training_batch_size for iterating through the dataset 81 | (d) Keep track of the loss values by appending them to the losses array 82 | """ 83 | timeit.start('train policy') 84 | 85 | losses = [] 86 | ### PROBLEM 1 87 | ### YOUR CODE HERE 88 | raise NotImplementedError 89 | 90 | logger.record_tabular('TrainingLossStart', losses[0]) 91 | logger.record_tabular('TrainingLossFinal', losses[-1]) 92 | 93 | timeit.stop('train policy') 94 | 95 | def _log(self, dataset): 96 | timeit.stop('total') 97 | dataset.log() 98 | logger.dump_tabular(print_func=logger.info) 99 | logger.debug('') 100 | for line in str(timeit).split('\n'): 101 | logger.debug(line) 102 | timeit.reset() 103 | timeit.start('total') 104 | 105 | def run_q1(self): 106 | """ 107 | Train on a dataset, and see how good the learned dynamics model's predictions are. 108 | 109 | implementation details: 110 | (i) Train using the self._random_dataset 111 | (ii) For each rollout, use the initial state and all actions to predict the future states. 112 | Store these predicted states in the pred_states list. 113 | NOTE: you should *not* be using any of the states in states[1:]. Only use states[0] 114 | (iii) After predicting the future states, we have provided plotting code that plots the actual vs 115 | predicted states and saves these to the experiment's folder. You do not need to modify this code. 
116 | """ 117 | logger.info('Training policy....') 118 | ### PROBLEM 1 119 | ### YOUR CODE HERE 120 | raise NotImplementedError 121 | 122 | logger.info('Evaluating predictions...') 123 | for r_num, (states, actions, _, _, _) in enumerate(self._random_dataset.rollout_iterator()): 124 | pred_states = [] 125 | 126 | ### PROBLEM 1 127 | ### YOUR CODE HERE 128 | raise NotImplementedError 129 | 130 | states = np.asarray(states) 131 | pred_states = np.asarray(pred_states) 132 | 133 | state_dim = states.shape[1] 134 | rows = int(np.sqrt(state_dim)) 135 | cols = state_dim // rows 136 | f, axes = plt.subplots(rows, cols, figsize=(3*cols, 3*rows)) 137 | f.suptitle('Model predictions (red) versus ground truth (black) for open-loop predictions') 138 | for i, (ax, state_i, pred_state_i) in enumerate(zip(axes.ravel(), states.T, pred_states.T)): 139 | ax.set_title('state {0}'.format(i)) 140 | ax.plot(state_i, color='k') 141 | ax.plot(pred_state_i, color='r') 142 | plt.tight_layout() 143 | plt.subplots_adjust(top=0.90) 144 | f.savefig(os.path.join(logger.dir, 'prediction_{0:03d}.jpg'.format(r_num)), bbox_inches='tight') 145 | 146 | logger.info('All plots saved to folder') 147 | 148 | def run_q2(self): 149 | """ 150 | Train the model-based policy on a random dataset, and evaluate the performance of the resulting policy 151 | """ 152 | logger.info('Random policy') 153 | self._log(self._random_dataset) 154 | 155 | logger.info('Training policy....') 156 | ### PROBLEM 2 157 | ### YOUR CODE HERE 158 | raise NotImplementedError 159 | 160 | logger.info('Evaluating policy...') 161 | ### PROBLEM 2 162 | ### YOUR CODE HERE 163 | raise NotImplementedError 164 | 165 | logger.info('Trained policy') 166 | self._log(eval_dataset) 167 | 168 | def run_q3(self): 169 | """ 170 | Starting with the random dataset, train the policy on the dataset, gather rollouts with the policy, 171 | append the new rollouts to the existing dataset, and repeat 172 | """ 173 | dataset = self._random_dataset 174 | 175 | itr = -1 176 | logger.info('Iteration {0}'.format(itr)) 177 | logger.record_tabular('Itr', itr) 178 | self._log(dataset) 179 | 180 | for itr in range(self._num_onpolicy_iters + 1): 181 | logger.info('Iteration {0}'.format(itr)) 182 | logger.record_tabular('Itr', itr) 183 | 184 | ### PROBLEM 3 185 | ### YOUR CODE HERE 186 | logger.info('Training policy...') 187 | raise NotImplementedError 188 | 189 | ### PROBLEM 3 190 | ### YOUR CODE HERE 191 | logger.info('Gathering rollouts...') 192 | raise NotImplementedError 193 | 194 | ### PROBLEM 3 195 | ### YOUR CODE HERE 196 | logger.info('Appending dataset...') 197 | raise NotImplementedError 198 | 199 | self._log(new_dataset) 200 | -------------------------------------------------------------------------------- /hw4/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | import pandas 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--exps', nargs='+', type=str) 11 | parser.add_argument('--save', type=str, default=None) 12 | args = parser.parse_args() 13 | 14 | f, ax = plt.subplots(1, 1) 15 | for i, exp in enumerate(args.exps): 16 | log_fname = os.path.join('data', exp, 'log.csv') 17 | csv = pandas.read_csv(log_fname) 18 | 19 | color = cm.viridis(i / float(len(args.exps))) 20 | ax.plot(csv['Itr'], csv['ReturnAvg'], color=color, label=exp) 21 | ax.fill_between(csv['Itr'], csv['ReturnAvg'] - csv['ReturnStd'], csv['ReturnAvg'] 
+ csv['ReturnStd'], 22 | color=color, alpha=0.2) 23 | 24 | ax.legend() 25 | ax.set_xlabel('Iteration') 26 | ax.set_ylabel('Return') 27 | 28 | if args.save: 29 | os.makedirs('plots', exist_ok=True) 30 | f.savefig(os.path.join('plots', args.save + '.jpg')) 31 | else: 32 | plt.show() 33 | -------------------------------------------------------------------------------- /hw4/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | matplotlib 3 | colorlog -------------------------------------------------------------------------------- /hw4/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ########## 4 | ### Q1 ### 5 | ########## 6 | 7 | python main.py q1 --exp_name exp 8 | 9 | ########## 10 | ### Q2 ### 11 | ########## 12 | 13 | python main.py q2 --exp_name exp 14 | 15 | ########### 16 | ### Q3a ### 17 | ########### 18 | 19 | python main.py q3 --exp_name default 20 | python plot.py --exps HalfCheetah_q3_default --save HalfCheetah_q3_default 21 | 22 | ########### 23 | ### Q3b ### 24 | ########### 25 | 26 | python main.py q3 --exp_name action128 --num_random_action_selection 128 27 | python main.py q3 --exp_name action4096 --num_random_action_selection 4096 28 | python main.py q3 --exp_name action16384 --num_random_action_selection 16384 29 | python plot.py --exps HalfCheetah_q3_action128 HalfCheetah_q3_action4096 HalfCheetah_q3_action16384 --save HalfCheetah_q3_actions 30 | 31 | python main.py q3 --exp_name horizon10 --mpc_horizon 10 32 | python main.py q3 --exp_name horizon15 --mpc_horizon 15 33 | python main.py q3 --exp_name horizon20 --mpc_horizon 20 34 | python plot.py --exps HalfCheetah_q3_horizon10 HalfCheetah_q3_horizon15 HalfCheetah_q3_horizon20 --save HalfCheetah_q3_mpc_horizon 35 | 36 | python main.py q3 --exp_name layers1 --nn_layers 1 37 | python main.py q3 --exp_name layers2 --nn_layers 2 38 | python main.py q3 --exp_name layers3 --nn_layers 3 39 | python plot.py --exps HalfCheetah_q3_layers1 HalfCheetah_q3_layers2 HalfCheetah_q3_layers3 --save HalfCheetah_q3_nn_layers 40 | -------------------------------------------------------------------------------- /hw4/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import defaultdict 3 | 4 | class TimeIt(object): 5 | def __init__(self, prefix=''): 6 | self.prefix = prefix 7 | self.start_times = dict() 8 | self.elapsed_times = defaultdict(int) 9 | 10 | def start(self, name): 11 | assert(name not in self.start_times) 12 | self.start_times[name] = time.time() 13 | 14 | def stop(self, name): 15 | assert(name in self.start_times) 16 | self.elapsed_times[name] += time.time() - self.start_times[name] 17 | self.start_times.pop(name) 18 | 19 | def elapsed(self, name): 20 | return self.elapsed_times[name] 21 | 22 | def reset(self): 23 | self.start_times = dict() 24 | self.elapsed_times = defaultdict(int) 25 | 26 | def __str__(self): 27 | s = '' 28 | names_elapsed = sorted(self.elapsed_times.items(), key=lambda x: x[1], reverse=True) 29 | for name, elapsed in names_elapsed: 30 | if 'total' not in self.elapsed_times: 31 | s += '{0}: {1: <10} {2:.1f}\n'.format(self.prefix, name, elapsed) 32 | else: 33 | assert(self.elapsed_times['total'] >= max(self.elapsed_times.values())) 34 | pct = 100. 
* elapsed / self.elapsed_times['total'] 35 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, name, elapsed, pct) 36 | if 'total' in self.elapsed_times: 37 | times_summed = sum([t for k, t in self.elapsed_times.items() if k != 'total']) 38 | other_time = self.elapsed_times['total'] - times_summed 39 | assert(other_time >= 0) 40 | pct = 100. * other_time / self.elapsed_times['total'] 41 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, 'other', other_time, pct) 42 | return s 43 | 44 | timeit = TimeIt() 45 | -------------------------------------------------------------------------------- /hw4/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from logger import logger 5 | 6 | 7 | ############ 8 | ### Data ### 9 | ############ 10 | 11 | class Dataset(object): 12 | 13 | def __init__(self): 14 | self._states = [] 15 | self._actions = [] 16 | self._next_states = [] 17 | self._rewards = [] 18 | self._dones = [] 19 | 20 | @property 21 | def is_empty(self): 22 | return len(self) == 0 23 | 24 | def __len__(self): 25 | return len(self._states) 26 | 27 | ################## 28 | ### Statistics ### 29 | ################## 30 | 31 | @property 32 | def state_mean(self): 33 | return np.mean(self._states, axis=0) 34 | 35 | @property 36 | def state_std(self): 37 | return np.std(self._states, axis=0) 38 | 39 | @property 40 | def action_mean(self): 41 | return np.mean(self._actions, axis=0) 42 | 43 | @property 44 | def action_std(self): 45 | return np.std(self._actions, axis=0) 46 | 47 | @property 48 | def delta_state_mean(self): 49 | return np.mean(np.array(self._next_states) - np.array(self._states), axis=0) 50 | 51 | @property 52 | def delta_state_std(self): 53 | return np.std(np.array(self._next_states) - np.array(self._states), axis=0) 54 | 55 | ################### 56 | ### Adding data ### 57 | ################### 58 | 59 | def add(self, state, action, next_state, reward, done): 60 | """ 61 | Add (s, a, r, s') to this dataset 62 | """ 63 | if not self.is_empty: 64 | # ensure the state, action, next_state are of the same dimension 65 | assert len(self._states[-1]) == len(np.ravel(state)) 66 | assert len(self._actions[-1]) == len(np.ravel(action)) 67 | assert len(self._next_states[-1]) == len(np.ravel(next_state)) 68 | 69 | self._states.append(np.ravel(state)) 70 | self._actions.append(np.ravel(action)) 71 | self._next_states.append(np.ravel(next_state)) 72 | self._rewards.append(reward) 73 | self._dones.append(done) 74 | 75 | def append(self, other_dataset): 76 | """ 77 | Append other_dataset to this dataset 78 | """ 79 | if not self.is_empty and not other_dataset.is_empty: 80 | # ensure the state, action, next_state are of the same dimension 81 | assert len(self._states[-1]) == len(other_dataset._states[-1]) 82 | assert len(self._actions[-1]) == len(other_dataset._actions[-1]) 83 | assert len(self._next_states[-1]) == len(other_dataset._next_states[-1]) 84 | 85 | self._states += other_dataset._states 86 | self._actions += other_dataset._actions 87 | self._next_states += other_dataset._next_states 88 | self._rewards += other_dataset._rewards 89 | self._dones += other_dataset._dones 90 | 91 | ############################ 92 | ### Iterate through data ### 93 | ############################ 94 | 95 | def rollout_iterator(self): 96 | """ 97 | Iterate through all the rollouts in the dataset sequentially 98 | """ 99 | end_indices = np.nonzero(self._dones)[0] + 1 100 | 101 | states = 
np.asarray(self._states) 102 | actions = np.asarray(self._actions) 103 | next_states = np.asarray(self._next_states) 104 | rewards = np.asarray(self._rewards) 105 | dones = np.asarray(self._dones) 106 | 107 | start_idx = 0 108 | for end_idx in end_indices: 109 | indices = np.arange(start_idx, end_idx) 110 | yield states[indices], actions[indices], next_states[indices], rewards[indices], dones[indices] 111 | start_idx = end_idx 112 | 113 | def random_iterator(self, batch_size): 114 | """ 115 | Iterate once through all (s, a, r, s') in batches in a random order 116 | """ 117 | all_indices = np.nonzero(np.logical_not(self._dones))[0] 118 | np.random.shuffle(all_indices) 119 | 120 | states = np.asarray(self._states) 121 | actions = np.asarray(self._actions) 122 | next_states = np.asarray(self._next_states) 123 | rewards = np.asarray(self._rewards) 124 | dones = np.asarray(self._dones) 125 | 126 | i = 0 127 | while i < len(all_indices): 128 | indices = all_indices[i:i+batch_size] 129 | 130 | yield states[indices], actions[indices], next_states[indices], rewards[indices], dones[indices] 131 | 132 | i += batch_size 133 | 134 | ############### 135 | ### Logging ### 136 | ############### 137 | 138 | def log(self): 139 | end_idxs = np.nonzero(self._dones)[0] + 1 140 | 141 | returns = [] 142 | 143 | start_idx = 0 144 | for end_idx in end_idxs: 145 | rewards = self._rewards[start_idx:end_idx] 146 | returns.append(np.sum(rewards)) 147 | 148 | start_idx = end_idx 149 | 150 | logger.record_tabular('ReturnAvg', np.mean(returns)) 151 | logger.record_tabular('ReturnStd', np.std(returns)) 152 | logger.record_tabular('ReturnMin', np.min(returns)) 153 | logger.record_tabular('ReturnMax', np.max(returns)) 154 | 155 | ################## 156 | ### Tensorflow ### 157 | ################## 158 | 159 | def build_mlp(input_layer, 160 | output_dim, 161 | scope, 162 | n_layers=1, 163 | hidden_dim=500, 164 | activation=tf.nn.relu, 165 | output_activation=None, 166 | reuse=False): 167 | layer = input_layer 168 | with tf.variable_scope(scope, reuse=reuse): 169 | for _ in range(n_layers): 170 | layer = tf.layers.dense(layer, hidden_dim, activation=activation) 171 | layer = tf.layers.dense(layer, output_dim, activation=output_activation) 172 | return layer 173 | 174 | def normalize(x, mean, std, eps=1e-8): 175 | return (x - mean) / (std + eps) 176 | 177 | def unnormalize(x, mean, std): 178 | return x * std + mean 179 | 180 | ################ 181 | ### Policies ### 182 | ################ 183 | 184 | class RandomPolicy(object): 185 | 186 | def __init__(self, env): 187 | self._action_space_low = env.action_space.low 188 | self._action_space_high = env.action_space.high 189 | 190 | def get_action(self, state): 191 | return np.random.uniform(self._action_space_low, self._action_space_high) 192 | 193 | -------------------------------------------------------------------------------- /hw5/exp/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5a: Exploration 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.5** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * seaborn 9 | * tqdm==**4.26.0** 10 | 11 | Before doing anything, first replace `gym/envs/mujoco/half_cheetah.py` with the provided `sparse_half_cheetah.py` file. It is always a good idea to keep a copy of the original `gym/envs/mujoco/half_cheetah.py` just in case you need it for something else. 
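If you are not sure where your `gym` installation keeps that file, one way to locate it (assuming `gym` is importable from the Python environment you use for this homework) is:

    python -c "import os, gym; print(os.path.join(os.path.dirname(gym.__file__), 'envs', 'mujoco', 'half_cheetah.py'))"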
12 | 13 | You will implement `density_model.py`, `exploration.py`, and `train_ac_exploration_f18.py`. 14 | 15 | See the hw5a.pdf in this folder for further instructions. 16 | . 17 | -------------------------------------------------------------------------------- /hw5/exp/density_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow_probability as tfp 4 | from ex_utils import build_mlp 5 | 6 | class Density_Model(object): 7 | def __init__(self): 8 | super(Density_Model, self).__init__() 9 | 10 | def receive_tf_sess(self, sess): 11 | self.sess = sess 12 | 13 | def get_prob(self, state): 14 | raise NotImplementedError 15 | 16 | class Histogram(Density_Model): 17 | def __init__(self, nbins, preprocessor): 18 | super(Histogram, self).__init__() 19 | self.nbins = nbins 20 | self.total = 0. 21 | self.hist = {} 22 | for i in range(int(self.nbins)): 23 | self.hist[i] = 0 24 | self.preprocessor = preprocessor 25 | 26 | def update_count(self, state, increment): 27 | """ 28 | ### PROBLEM 1 29 | ### YOUR CODE HERE 30 | 31 | args: 32 | state: numpy array 33 | increment: int 34 | 35 | TODO: 36 | 1. increment the entry "bin_name" in self.hist by "increment" 37 | 2. increment self.total by "increment" 38 | """ 39 | bin_name = self.preprocessor(state) 40 | raise NotImplementedError 41 | 42 | def get_count(self, states): 43 | """ 44 | ### PROBLEM 1 45 | ### YOUR CODE HERE 46 | 47 | args: 48 | states: numpy array (bsize, ob_dim) 49 | 50 | returns: 51 | counts: numpy_array (bsize) 52 | 53 | TODO: 54 | For each state in states: 55 | 1. get the bin_name using self.preprocessor 56 | 2. get the value of self.hist with key bin_name 57 | """ 58 | raise NotImplementedError 59 | return counts 60 | 61 | def get_prob(self, states): 62 | """ 63 | ### PROBLEM 1 64 | ### YOUR CODE HERE 65 | 66 | args: 67 | states: numpy array (bsize, ob_dim) 68 | 69 | returns: 70 | return the probabilities of the state (bsize) 71 | 72 | NOTE: 73 | remember to normalize by float(self.total) 74 | """ 75 | raise NotImplementedError 76 | return probs 77 | 78 | class RBF(Density_Model): 79 | """ 80 | https://en.wikipedia.org/wiki/Radial_basis_function_kernel 81 | https://en.wikipedia.org/wiki/Kernel_density_estimation 82 | """ 83 | def __init__(self, sigma): 84 | super(RBF, self).__init__() 85 | self.sigma = sigma 86 | self.means = None 87 | 88 | def fit_data(self, data): 89 | """ 90 | ### PROBLEM 2 91 | ### YOUR CODE HERE 92 | 93 | args: 94 | data: list of states of shape (ob_dim) 95 | 96 | TODO: 97 | We simply assign self.means to be equal to the data points. 98 | Let the length of the data be B 99 | self.means: np array (B, ob_dim) 100 | """ 101 | B, ob_dim = len(data), len(data[0]) 102 | raise NotImplementedError 103 | self.means = None 104 | assert self.means.shape == (B, ob_dim) 105 | 106 | def get_prob(self, states): 107 | """ 108 | ### PROBLEM 2 109 | ### YOUR CODE HERE 110 | 111 | given: 112 | states: (b, ob_dim) 113 | where b is the number of states we wish to get the 114 | probability of 115 | 116 | self.means: (B, ob_dim) 117 | where B is the number of states in the replay buffer 118 | we will plop a Gaussian distribution on top of each 119 | of self.means with a std of self.sigma 120 | 121 | TODO: 122 | 1. Compute deltas: for each state in states, compute the 123 | difference between that state and every mean in self.means. 124 | 2. Euclidean distance: sum the squared deltas 125 | 3. 
Gaussian: evaluate the probability of the state under the 126 | gaussian centered around each mean. The hyperparameters 127 | for the reference solution assume that you do not normalize 128 | the gaussian. This is fine since the rewards will be 129 | normalized later when we compute advantages anyways. 130 | 4. Average: average the probabilities from each gaussian 131 | """ 132 | b, ob_dim = states.shape 133 | if self.means is None: 134 | # Return a uniform distribution if we don't have samples in the 135 | # replay buffer yet. 136 | return (1.0/len(states))*np.ones(len(states)) 137 | else: 138 | B, replay_dim = self.means.shape 139 | assert states.ndim == self.means.ndim and ob_dim == replay_dim 140 | 141 | # 1. Compute deltas 142 | deltas = raise NotImplementedError 143 | assert deltas.shape == (b, B, ob_dim) 144 | 145 | # 2. Euclidean distance 146 | euc_dists = raise NotImplementedError 147 | assert euc_dists.shape == (b, B) 148 | 149 | # Gaussian 150 | gaussians = raise NotImplementedError 151 | assert gaussians.shape == (b, B) 152 | 153 | # 4. Average 154 | densities = raise NotImplementedError 155 | assert densities.shape == (b,) 156 | 157 | return densities 158 | 159 | class Exemplar(Density_Model): 160 | def __init__(self, ob_dim, hid_dim, learning_rate, kl_weight): 161 | super(Exemplar, self).__init__() 162 | self.ob_dim = ob_dim 163 | self.hid_dim = hid_dim 164 | self.learning_rate = learning_rate 165 | self.kl_weight = kl_weight 166 | 167 | def build_computation_graph(self): 168 | """ 169 | ### PROBLEM 3 170 | ### YOUR CODE HERE 171 | 172 | TODO: 173 | 1. self.log_likelihood. shape: (batch_size) 174 | - use tf.squeeze 175 | - use the discriminator to get the log prob of the discrim_target 176 | 2. self.likelihood. shape: (batch_size) 177 | - use tf.squeeze 178 | - use the discriminator to get the prob of the discrim_target 179 | 3. self.kl. shape: (batch_size) 180 | - simply add the kl divergence between self.encoder1 and 181 | the prior and the kl divergence between self.encoder2 182 | and the prior. Do not average. 183 | 4. self.elbo: 184 | - subtract the kl (weighted by self.kl_weight) from the 185 | log_likelihood, and average over the batch 186 | 5. self.update_op: use the AdamOptimizer with self.learning_rate 187 | to minimize the -self.elbo (Note the negative sign!) 
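            One way to assemble these pieces with tensorflow_probability is
            sketched below (an illustration under the assumption that
            self.discriminator, self.encoder1, self.encoder2, and self.prior
            are built exactly as described above -- not necessarily the
            reference solution):

                log_likelihood = tf.squeeze(self.discriminator.log_prob(self.discrim_target), axis=1)
                likelihood = tf.squeeze(self.discriminator.prob(self.discrim_target), axis=1)
                kl = self.encoder1.kl_divergence(self.prior) + self.encoder2.kl_divergence(self.prior)
                elbo = tf.reduce_mean(log_likelihood - self.kl_weight * kl)
                update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(-elbo)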
188 | 189 | Hint: 190 | https://www.tensorflow.org/probability/api_docs/python/tfp/distributions 191 | """ 192 | self.state1, self.state2 = self.define_placeholders() 193 | self.encoder1, self.encoder2, self.prior, self.discriminator = self.forward_pass(self.state1, self.state2) 194 | self.discrim_target = tf.placeholder(shape=[None, 1], name="discrim_target", dtype=tf.float32) 195 | 196 | raise NotImplementedError 197 | self.log_likelihood = None 198 | self.likelihood = None 199 | self.kl = None 200 | assert len(self.log_likelihood.shape) == len(self.likelihood.shape) == len(self.kl.shape) == 1 201 | 202 | raise NotImplementedError 203 | self.elbo = None 204 | self.update_op = None 205 | 206 | def define_placeholders(self): 207 | state1 = tf.placeholder(shape=[None, self.ob_dim], name="s1", dtype=tf.float32) 208 | state2 = tf.placeholder(shape=[None, self.ob_dim], name="s2", dtype=tf.float32) 209 | return state1, state2 210 | 211 | def make_encoder(self, state, z_size, scope, n_layers, hid_size): 212 | """ 213 | ### PROBLEM 3 214 | ### YOUR CODE HERE 215 | 216 | args: 217 | state: tf variable 218 | z_size: output dimension of the encoder network 219 | scope: scope name 220 | n_layers: number of layers of the encoder network 221 | hid_size: hidden dimension of encoder network 222 | 223 | TODO: 224 | 1. z_mean: the output of a neural network that takes the state as input, 225 | has output dimension z_size, n_layers layers, and hidden 226 | dimension hid_size 227 | 2. z_logstd: a trainable variable, initialized to 0 228 | shape (z_size,) 229 | 230 | Hint: use build_mlp 231 | """ 232 | z_mean = raise NotImplementedError 233 | z_logstd = raise NotImplementedError 234 | return tfp.distributions.MultivariateNormalDiag(loc=z_mean, scale_diag=tf.exp(z_logstd)) 235 | 236 | def make_prior(self, z_size): 237 | """ 238 | ### PROBLEM 3 239 | ### YOUR CODE HERE 240 | 241 | args: 242 | z_size: output dimension of the encoder network 243 | 244 | TODO: 245 | prior_mean and prior_logstd are for a standard normal distribution 246 | both have dimension z_size 247 | """ 248 | prior_mean = raise NotImplementedError 249 | prior_logstd = raise NotImplementedError 250 | return tfp.distributions.MultivariateNormalDiag(loc=prior_mean, scale_diag=tf.exp(prior_logstd)) 251 | 252 | def make_discriminator(self, z, output_size, scope, n_layers, hid_size): 253 | """ 254 | ### PROBLEM 3 255 | ### YOUR CODE HERE 256 | 257 | args: 258 | z: input to to discriminator network 259 | output_size: output dimension of discriminator network 260 | scope: scope name 261 | n_layers: number of layers of discriminator network 262 | hid_size: hidden dimension of discriminator network 263 | 264 | TODO: 265 | 1. logit: the output of a neural network that takes z as input, 266 | has output size output_size, n_layers layers, and hidden 267 | dimension hid_size 268 | 269 | Hint: use build_mlp 270 | """ 271 | logit = raise NotImplementedError 272 | return tfp.distributions.Bernoulli(logit) 273 | 274 | def forward_pass(self, state1, state2): 275 | """ 276 | ### PROBLEM 3 277 | ### YOUR CODE HERE 278 | 279 | args: 280 | state1: tf variable 281 | state2: tf variable 282 | 283 | encoder1: tfp.distributions.MultivariateNormalDiag distribution 284 | encoder2: tfp.distributions.MultivariateNormalDiag distribution 285 | prior: tfp.distributions.MultivariateNormalDiag distribution 286 | discriminator: tfp.distributions.Bernoulli distribution 287 | 288 | TODO: 289 | 1. z1: sample from encoder1 290 | 2. z2: sample from encoder2 291 | 3. 
z: concatenate z1 and z2 292 | 293 | Hint: 294 | https://www.tensorflow.org/probability/api_docs/python/tfp/distributions 295 | """ 296 | # Reuse 297 | make_encoder1 = tf.make_template('encoder1', self.make_encoder) 298 | make_encoder2 = tf.make_template('encoder2', self.make_encoder) 299 | make_discriminator = tf.make_template('decoder', self.make_discriminator) 300 | 301 | # Encoder 302 | encoder1 = make_encoder1(state1, self.hid_dim/2, 'z1', n_layers=2, hid_size=self.hid_dim) 303 | encoder2 = make_encoder2(state2, self.hid_dim/2, 'z2', n_layers=2, hid_size=self.hid_dim) 304 | 305 | # Prior 306 | prior = self.make_prior(self.hid_dim/2) 307 | 308 | # Sampled Latent 309 | z1 = raise NotImplementedError 310 | z2 = raise NotImplementedError 311 | z = raise NotImplementedError 312 | 313 | # Discriminator 314 | discriminator = make_discriminator(z, 1, 'discriminator', n_layers=2, hid_size=self.hid_dim) 315 | return encoder1, encoder2, prior, discriminator 316 | 317 | def update(self, state1, state2, target): 318 | """ 319 | ### PROBLEM 3 320 | ### YOUR CODE HERE 321 | 322 | args: 323 | state1: np array (batch_size, ob_dim) 324 | state2: np array (batch_size, ob_dim) 325 | target: np array (batch_size, 1) 326 | 327 | TODO: 328 | train the density model and return 329 | ll: log_likelihood 330 | kl: kl divergence 331 | elbo: elbo 332 | """ 333 | assert state1.ndim == state2.ndim == target.ndim 334 | assert state1.shape[1] == state2.shape[1] == self.ob_dim 335 | assert state1.shape[0] == state2.shape[0] == target.shape[0] 336 | raise NotImplementedError 337 | return ll, kl, elbo 338 | 339 | def get_likelihood(self, state1, state2): 340 | """ 341 | ### PROBLEM 3 342 | ### YOUR CODE HERE 343 | 344 | args: 345 | state1: np array (batch_size, ob_dim) 346 | state2: np array (batch_size, ob_dim) 347 | 348 | TODO: 349 | likelihood of state1 == state2 350 | 351 | Hint: 352 | what should be the value of self.discrim_target? 
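        One way to read this hint (illustrative, not the reference solution):
        the discriminator was trained with a target of 1 for "positive" pairs
        whose two inputs come from the same state, so the likelihood that
        state1 == state2 corresponds to evaluating self.likelihood with a
        target of all ones, e.g.

            feed = {self.state1: state1,
                    self.state2: state2,
                    self.discrim_target: np.ones((state1.shape[0], 1))}
            likelihood = self.sess.run(self.likelihood, feed_dict=feed)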
353 | """ 354 | assert state1.ndim == state2.ndim 355 | assert state1.shape[1] == state2.shape[1] == self.ob_dim 356 | assert state1.shape[0] == state2.shape[0] 357 | raise NotImplementedError 358 | return likelihood 359 | 360 | def get_prob(self, state): 361 | """ 362 | ### PROBLEM 3 363 | ### YOUR CODE HERE 364 | 365 | args: 366 | state: np array (batch_size, ob_dim) 367 | 368 | TODO: 369 | likelihood: 370 | evaluate the discriminator D(x,x) on the same input 371 | prob: 372 | compute the probability density of x from the discriminator 373 | likelihood (see homework doc) 374 | """ 375 | likelihood = raise NotImplementedError 376 | # avoid divide by 0 and log(0) 377 | likelihood = np.clip(np.squeeze(likelihood), 1e-5, 1-1e-5) 378 | prob = raise NotImplementedError 379 | return prob 380 | -------------------------------------------------------------------------------- /hw5/exp/ex_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=tf.tanh, output_activation=None): 4 | """ 5 | Builds a feedforward neural network 6 | 7 | arguments: 8 | input_placeholder: placeholder variable for the state (batch_size, input_size) 9 | output_size: size of the output layer 10 | scope: variable scope of the network 11 | n_layers: number of hidden layers 12 | size: dimension of the hidden layer 13 | activation: activation of the hidden layers 14 | output_activation: activation of the ouput layers 15 | 16 | returns: 17 | output placeholder of the network (the result of a forward pass) 18 | 19 | Hint: use tf.layers.dense 20 | """ 21 | output_placeholder = input_placeholder 22 | with tf.variable_scope(scope): 23 | for _ in range(n_layers): 24 | output_placeholder = tf.layers.dense(output_placeholder, size, activation=activation) 25 | output_placeholder = tf.layers.dense(output_placeholder, output_size, activation=output_activation) 26 | return output_placeholder -------------------------------------------------------------------------------- /hw5/exp/exploration.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | from density_model import Density_Model 6 | from replay import Replay_Buffer 7 | 8 | class Exploration(object): 9 | def __init__(self, density_model, bonus_coeff): 10 | super(Exploration, self).__init__() 11 | self.density_model = density_model 12 | self.bonus_coeff = bonus_coeff 13 | 14 | def receive_tf_sess(self, sess): 15 | self.density_model.receive_tf_sess(sess) 16 | self.sess = sess 17 | 18 | def bonus_function(self, x): 19 | # You do not need to do anything here 20 | raise NotImplementedError 21 | 22 | def fit_density_model(self, states): 23 | # You do not need to do anything here 24 | raise NotImplementedError 25 | 26 | def compute_reward_bonus(self, states): 27 | # You do not need to do anything here 28 | raise NotImplementedError 29 | 30 | def modify_reward(self, rewards, states): 31 | """ 32 | ### PROBLEM 1 33 | ### YOUR CODE HERE 34 | 35 | args: 36 | states: (bsize, ob_dim) 37 | 38 | TODO: 39 | Use self.compute_reward_bonus to compute the reward 40 | bonus and then modify the rewards with the bonus 41 | and store that in new_rewards, which you will return 42 | """ 43 | raise NotImplementedError 44 | bonus = None 45 | new_rewards = None 46 | return new_rewards 47 | 48 | class DiscreteExploration(Exploration): 49 | def __init__(self, 
density_model, bonus_coeff): 50 | super(DiscreteExploration, self).__init__(density_model, bonus_coeff) 51 | 52 | def fit_density_model(self, states): 53 | """ 54 | ### PROBLEM 1 55 | ### YOUR CODE HERE 56 | 57 | args: 58 | states: (bsize, ob_dim) 59 | """ 60 | raise NotImplementedError 61 | 62 | def bonus_function(self, count): 63 | """ 64 | ### PROBLEM 1 65 | ### YOUR CODE HERE 66 | 67 | args: 68 | count: np array (bsize) 69 | """ 70 | raise NotImplementedError 71 | 72 | def compute_reward_bonus(self, states): 73 | """ 74 | ### PROBLEM 1 75 | ### YOUR CODE HERE 76 | 77 | args: 78 | states: (bsize, ob_dim) 79 | """ 80 | count = raise NotImplementedError 81 | bonus = raise NotImplementedError 82 | return bonus 83 | 84 | 85 | class ContinuousExploration(Exploration): 86 | def __init__(self, density_model, bonus_coeff, replay_size): 87 | super(ContinuousExploration, self).__init__(density_model, bonus_coeff) 88 | self.replay_buffer = Replay_Buffer(max_size=replay_size) 89 | 90 | def fit_density_model(self, states): 91 | # You do not need to do anything here 92 | raise NotImplementedError 93 | 94 | def bonus_function(self, prob): 95 | """ 96 | ### PROBLEM 2 97 | ### YOUR CODE HERE 98 | 99 | args: 100 | prob: np array (bsize,) 101 | """ 102 | raise NotImplementedError 103 | 104 | def compute_reward_bonus(self, states): 105 | """ 106 | ### PROBLEM 2 107 | ### YOUR CODE HERE 108 | 109 | args: 110 | states: (bsize, ob_dim) 111 | """ 112 | raise NotImplementedError 113 | prob = None 114 | bonus = None 115 | return bonus 116 | 117 | 118 | class RBFExploration(ContinuousExploration): 119 | def __init__(self, density_model, bonus_coeff, replay_size): 120 | super(RBFExploration, self).__init__(density_model, bonus_coeff, replay_size) 121 | 122 | def fit_density_model(self, states): 123 | """ 124 | args: 125 | states: (bsize, ob_dim) 126 | """ 127 | self.replay_buffer.prepend(states) 128 | self.density_model.fit_data(self.replay_buffer.get_memory()) 129 | 130 | 131 | class ExemplarExploration(ContinuousExploration): 132 | def __init__(self, density_model, bonus_coeff, train_iters, bsize, replay_size): 133 | super(ExemplarExploration, self).__init__(density_model, bonus_coeff, replay_size) 134 | self.train_iters = train_iters 135 | self.bsize = bsize 136 | 137 | def sample_idxs(self, states, batch_size): 138 | states = copy.deepcopy(states) 139 | data_size = len(states) 140 | pos_idxs = np.random.randint(data_size, size=batch_size) 141 | continue_sampling = True 142 | while continue_sampling: 143 | neg_idxs = np.random.randint(data_size, size=batch_size) 144 | if np.all(pos_idxs != neg_idxs): 145 | continue_sampling = False 146 | positives = np.concatenate([states[pos_idxs], states[pos_idxs]], axis=0) 147 | negatives = np.concatenate([states[pos_idxs], states[neg_idxs]], axis=0) 148 | return positives, negatives 149 | 150 | def sample_idxs_replay(self, states, batch_size): 151 | states = copy.deepcopy(states) 152 | data_size = len(states) 153 | pos_idxs = np.random.randint(data_size, size=batch_size) 154 | neg_idxs = np.random.randint(data_size, len(self.replay_buffer), size=batch_size) 155 | positives = np.concatenate([states[pos_idxs], states[pos_idxs]], axis=0) 156 | negatives = np.concatenate([states[pos_idxs], self.replay_buffer[neg_idxs]], axis=0) 157 | return positives, negatives 158 | 159 | def fit_density_model(self, states): 160 | """ 161 | args: 162 | states: (bsize, ob_dim) 163 | """ 164 | self.replay_buffer.prepend(states) 165 | for i in range(self.train_iters): 166 | if 
len(self.replay_buffer) >= 2*len(states): 167 | positives, negatives = self.sample_idxs_replay(states, self.bsize) 168 | else: 169 | positives, negatives = self.sample_idxs(states, self.bsize) 170 | labels = np.concatenate([np.ones((self.bsize, 1)), np.zeros((self.bsize, 1))], axis=0) 171 | ll, kl, elbo = self.density_model.update(positives, negatives, labels) 172 | if i % (self.train_iters/10) == 0: 173 | print('log likelihood\t{}\tkl divergence\t{}\t-elbo\t{}'.format(np.mean(ll), np.mean(kl), -elbo)) 174 | return ll, kl, elbo 175 | -------------------------------------------------------------------------------- /hw5/exp/hw5a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw5/exp/hw5a.pdf -------------------------------------------------------------------------------- /hw5/exp/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/exp/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
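For example, two logdirs with custom legend titles (one title per logdir,
in the same order):

    python plot.py data/test1 data/test2 --legend run1 run2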
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | # plt.legend(loc='best', bbox_to_anchor=(1, 1), fontsize=8).draggable() 59 | plt.show() 60 | 61 | 62 | def get_datasets(fpath, condition=None): 63 | unit = 0 64 | datasets = [] 65 | for root, dir, files in os.walk(fpath): 66 | if 'log.txt' in files: 67 | param_path = open(os.path.join(root,'params.json')) 68 | params = json.load(param_path) 69 | exp_name = params['exp_name'] 70 | 71 | log_path = os.path.join(root,'log.txt') 72 | experiment_data = pd.read_table(log_path) 73 | 74 | experiment_data.insert( 75 | len(experiment_data.columns), 76 | 'Unit', 77 | unit 78 | ) 79 | experiment_data.insert( 80 | len(experiment_data.columns), 81 | 'Condition', 82 | condition or exp_name 83 | ) 84 | 85 | datasets.append(experiment_data) 86 | unit += 1 87 | 88 | return datasets 89 | 90 | 91 | def main(): 92 | import argparse 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('logdir', nargs='*') 95 | parser.add_argument('--legend', nargs='*') 96 | parser.add_argument('--value', default='AverageReturn', nargs='*') 97 | args = parser.parse_args() 98 | 99 | use_legend = False 100 | if args.legend is not None: 101 | assert len(args.legend) == len(args.logdir), \ 102 | "Must give a legend title for each set of experiments." 103 | use_legend = True 104 | 105 | data = [] 106 | if use_legend: 107 | for logdir, legend_title in zip(args.logdir, args.legend): 108 | data += get_datasets(logdir, legend_title) 109 | else: 110 | for logdir in args.logdir: 111 | data += get_datasets(logdir) 112 | 113 | if isinstance(args.value, list): 114 | values = args.value 115 | else: 116 | values = [args.value] 117 | for value in values: 118 | plot_data(data, value=value) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw5/exp/pointmass.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.envs.registration import EnvSpec 3 | import imageio 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import os 7 | import seaborn as sns 8 | from tqdm import tqdm 9 | 10 | class Env(object): 11 | def __init__(self): 12 | super(Env, self).__init__() 13 | 14 | def reset(self): 15 | raise NotImplementedError 16 | 17 | def step(self, action): 18 | raise NotImplementedError 19 | 20 | def seed(self, seed): 21 | raise NotImplementedError 22 | 23 | class PointMass(Env): 24 | def __init__(self, max_episode_steps_coeff=1, scale=20, goal_padding=2.0): 25 | super(PointMass, self).__init__() 26 | # define scale such that the each square in the grid is 1 x 1 27 | self.scale = int(scale) 28 | self.grid_size = self.scale * self.scale 29 | self.observation_space = gym.spaces.Box( 30 | low=np.array([0.0, 0.0]), 31 | high=np.array([1.0, 1.0])) 32 | self.action_space = gym.spaces.Box( 33 | low=np.array([-np.inf, -np.inf]), 34 | high=np.array([np.inf, np.inf])) 35 | self.goal_padding = goal_padding 36 | self.spec = EnvSpec(id='PointMass-v0', max_episode_steps=int(max_episode_steps_coeff*self.scale)) 37 | 38 | def reset(self): 39 | plt.close() 40 | self.state = np.array([self.goal_padding, self.goal_padding]) 41 | state = self.state/self.scale 42 | return state 
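    # Note on the dynamics and reward implemented in step() below: actions
    # translate the point directly in (x, y), positions are clipped to
    # [0, scale], and observations are rescaled to [0, 1]^2. The reward is a
    # small action-magnitude penalty (-0.01 * ||action||^2) everywhere, plus
    # +10 whenever both coordinates exceed scale - goal_padding (the top-right
    # corner of the grid), so the agent has to explore to ever see the bonus.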
43 | 44 | def step(self, action): 45 | x, y = action 46 | 47 | # next state 48 | new_x = self.state[0]+x 49 | new_y = self.state[1]+y 50 | if new_x < 0: 51 | new_x = 0 52 | if new_x > self.scale: 53 | new_x = self.scale 54 | if new_y < 0: 55 | new_y = 0 56 | if new_y > self.scale: 57 | new_y = self.scale 58 | self.state = np.array([new_x, new_y]) 59 | state = self.state/self.scale 60 | 61 | # reward 62 | reg_term = -0.01*np.sum(action**2) 63 | 64 | threshold = self.scale - self.goal_padding 65 | if new_x > threshold and new_y > threshold: 66 | reward = 10 + reg_term 67 | else: 68 | reward = 0 + reg_term 69 | 70 | # done 71 | done = False 72 | 73 | return state, reward, done, None 74 | 75 | def preprocess(self, state): 76 | scaled_state = self.scale * state 77 | x_floor, y_floor = np.floor(scaled_state) 78 | assert x_floor <= self.scale 79 | assert y_floor <= self.scale 80 | if x_floor == self.scale: 81 | x_floor -= 1 82 | if y_floor == self.scale: 83 | y_floor -= 1 84 | index = self.scale*x_floor + y_floor 85 | return index 86 | 87 | def unprocess(self, index): 88 | x_floor = index // self.scale 89 | y_floor = index % self.scale 90 | unscaled_state = np.array([x_floor, y_floor])/self.scale 91 | return unscaled_state 92 | 93 | def seed(self, seed): 94 | pass 95 | 96 | def render(self): 97 | # create a grid 98 | states = [self.state/self.scale] 99 | indices = np.array([int(self.preprocess(s)) for s in states]) 100 | a = np.zeros(self.grid_size) 101 | for i in indices: 102 | a[i] += 1 103 | max_freq = np.max(a) 104 | a/=float(max_freq) # normalize 105 | a = np.reshape(a, (self.scale, self.scale)) 106 | ax = sns.heatmap(a) 107 | plt.draw() 108 | plt.pause(0.001) 109 | plt.clf() 110 | 111 | def visualize(self, states, itr, dirname): 112 | if states is None: 113 | states = np.load(os.path.join(dirname, '{}.npy'.format(itr))) 114 | indices = np.array([int(self.preprocess(s)) for s in states]) 115 | a = np.zeros(int(self.grid_size)) 116 | for i in indices: 117 | a[i] += 1 118 | max_freq = np.max(a) 119 | a/=float(max_freq) # normalize 120 | a = np.reshape(a, (self.scale, self.scale)) 121 | ax = sns.heatmap(a) 122 | plt.savefig(os.path.join(dirname, '{}.png'.format(itr))) 123 | plt.close() 124 | 125 | def create_gif(self, dirname, density=False): 126 | images = [] 127 | if density: 128 | filenames = [x for x in os.listdir(dirname) if '_density.png' in x] 129 | sorted_fnames = sorted(filenames, key=lambda x: int(x.split('_density.png')[0])) 130 | else: 131 | filenames = [x for x in os.listdir(dirname) if ('.png' in x and 'density' not in x)] 132 | sorted_fnames = sorted(filenames, key=lambda x: int(x.split('.png')[0])) 133 | for f in sorted_fnames: 134 | images.append(imageio.imread(os.path.join(dirname, f))) 135 | imageio.mimsave(os.path.join(dirname, 'exploration.gif'), images) 136 | 137 | def create_visualization(self, dirname, density=False): 138 | for s in os.listdir(dirname): 139 | for i in tqdm(range(100)): 140 | self.visualize(None, i, os.path.join(dirname, s)) 141 | self.create_gif(os.path.join(dirname, str(s))) 142 | 143 | def debug(): 144 | logdir = 'pm_debug' 145 | os.mkdir(logdir) 146 | num_episodes = 10 147 | num_steps_per_epoch = 20 148 | 149 | env = PointMass() 150 | for epoch in range(num_episodes): 151 | states = [] 152 | state = env.reset() 153 | for i in range(num_steps_per_epoch): 154 | action = np.random.rand(2) 155 | state, reward, done, _ = env.step(action) 156 | states.append(state) 157 | env.visualize(np.array(states), epoch, logdir) 158 | env.create_gif(logdir) 159 | 160 | 
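# Illustrative sketch of how this environment plugs into the count-based
# exploration bonus. The exact wiring lives in train_ac_exploration_f18.py,
# which is not shown here, so the snippet below is an assumption; Histogram
# and DiscreteExploration are the skeleton classes from density_model.py and
# exploration.py. Kept as comments so importing this module stays side-effect
# free.
#
#   from density_model import Histogram
#   from exploration import DiscreteExploration
#
#   env = PointMass(scale=20)
#   density = Histogram(nbins=env.grid_size, preprocessor=env.preprocess)
#   exploration = DiscreteExploration(density, bonus_coeff=0.01)
#   # each iteration, after collecting a batch of (normalized) states:
#   exploration.fit_density_model(states)
#   new_rewards = exploration.modify_reward(rewards, states)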
161 | if __name__ == "__main__": 162 | # debug() # run this if you want to get a feel for how the PointMass environment works (make sure to comment out the code below) 163 | import argparse 164 | parser = argparse.ArgumentParser() 165 | parser.add_argument('dirname', type=str) 166 | args = parser.parse_args() 167 | env = PointMass() 168 | env.create_visualization(args.dirname) 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /hw5/exp/replay.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import copy 4 | 5 | class Replay_Buffer(object): 6 | def __init__(self, max_size=np.inf): 7 | self.memory = [] 8 | self.max_size = int(max_size) 9 | 10 | def adjust_size(self): 11 | if len(self.memory) > self.max_size: 12 | diff = int(len(self.memory) - self.max_size) 13 | self.memory = self.memory[:-diff] # FIFO 14 | print('Adjusted replay size') 15 | 16 | def prepend(self, x): 17 | # assume x is a list of states 18 | self.memory = list(x) + self.memory 19 | self.adjust_size() 20 | 21 | def sample(self, batch_size): 22 | random_batch = random.sample(self.memory, batch_size) 23 | return random_batch 24 | 25 | def __len__(self): 26 | return len(self.memory) 27 | 28 | def __getitem__(self, indices): 29 | return copy.deepcopy(np.array([self.memory[i] for i in indices])) 30 | 31 | def get_memory(self): 32 | return copy.deepcopy(self.memory) 33 | 34 | def clear_buffer(self): 35 | del self.memory[:] -------------------------------------------------------------------------------- /hw5/exp/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | mujoco-py==1.50.1.56 3 | tensorflow 4 | numpy 5 | seaborn 6 | tqdm -------------------------------------------------------------------------------- /hw5/exp/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ########################## 4 | ### P1 Hist PointMass ### 5 | ########################## 6 | 7 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model none -s 8 --exp_name PM_bc0_s8 8 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model hist -bc 0.01 -s 8 --exp_name PM_hist_bc0.01_s8 9 | 10 | ########################## 11 | ### P2 RBF PointMass ### 12 | ########################## 13 | 14 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model rbf -bc 0.01 -s 8 -sig 0.2 --exp_name PM_rbf_bc0.01_s8_sig0.2 15 | 16 | ########################## 17 | ### P3 EX2 PointMass ### 18 | ########################## 19 | 20 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model ex2 -s 8 -bc 0.05 -kl 0.1 -dlr 0.001 -dh 8 -dti 1000 --exp_name PM_ex2_s8_bc0.05_kl0.1_dlr0.001_dh8_dti1000 21 | 22 | ########################### 23 | ### P4 HalfCheetah ### 24 | ########################### 25 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model none --exp_name HC_bc0 26 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.001 -kl 0.1 -dlr 0.005 -dti 1000 --exp_name HC_bc0.001_kl0.1_dlr0.005_dti1000 27 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model 
ex2 -bc 0.0001 -kl 0.1 -dlr 0.005 -dti 10000 --exp_name HC_bc0.0001_kl0.1_dlr0.005_dti10000 28 | -------------------------------------------------------------------------------- /hw5/exp/sparse_half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 8 | utils.EzPickle.__init__(self) 9 | 10 | def step(self, action): 11 | ################################################# 12 | ctrl = False 13 | relu = False 14 | threshold = 10.0 15 | ################################################# 16 | xposbefore = self.sim.data.qpos[0] 17 | self.do_simulation(action, self.frame_skip) 18 | xposafter = self.sim.data.qpos[0] 19 | ob = self._get_obs() 20 | # reward_ctrl = - 0.1 * np.square(action).sum() 21 | # reward_run = (xposafter - xposbefore)/self.dt 22 | ################################################# 23 | if ctrl: 24 | reward_ctrl = - 0.1 * np.square(action).sum() 25 | else: 26 | reward_ctrl = 0 27 | if abs(xposafter) <= threshold: 28 | reward_run = 0.0 29 | else: 30 | if relu: 31 | reward_run = np.sign(xposafter)*(xposafter - xposbefore)/self.dt 32 | else: 33 | reward_run = 1.0 34 | ################################################# 35 | reward = reward_ctrl + reward_run 36 | done = False 37 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 38 | 39 | def _get_obs(self): 40 | return np.concatenate([ 41 | self.sim.data.qpos.flat[1:], 42 | self.sim.data.qvel.flat, 43 | ]) 44 | 45 | def reset_model(self): 46 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 47 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 48 | self.set_state(qpos, qvel) 49 | return self._get_obs() 50 | 51 | def viewer_setup(self): 52 | self.viewer.cam.distance = self.model.stat.extent * 0.5 53 | -------------------------------------------------------------------------------- /hw5/meta/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5c: Meta-Learning 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version 1.14.5 6 | * TensorFlow version 1.10.5 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * OpenAI Gym version **0.10.5** 9 | * seaborn 10 | * Box2D==2.3.2 11 | 12 | See the [HW5c PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5c.pdf) for further instructions. 13 | -------------------------------------------------------------------------------- /hw5/meta/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/meta/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw5/meta/point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from gym import Env 4 | 5 | 6 | class PointEnv(Env): 7 | """ 8 | point mass on a 2-D plane 9 | goals are sampled randomly from a square 10 | """ 11 | 12 | def __init__(self, num_tasks=1): 13 | self.reset_task() 14 | self.reset() 15 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,)) 16 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,)) 17 | 18 | def reset_task(self, is_evaluation=False): 19 | ''' 20 | sample a new task randomly 21 | 22 | Problem 3: make training and evaluation goals disjoint sets 23 | if `is_evaluation` is true, sample from the evaluation set, 24 | otherwise sample from the training set 25 | ''' 26 | #====================================================================================# 27 | # ----------PROBLEM 3---------- 28 | #====================================================================================# 29 | # YOUR CODE HERE 30 | x = np.random.uniform(-10, 10) 31 | y = np.random.uniform(-10, 10) 32 | self._goal = np.array([x, y]) 33 | 34 | def reset(self): 35 | self._state = np.array([0, 0], dtype=np.float32) 36 | return self._get_obs() 37 | 38 | def _get_obs(self): 39 | return np.copy(self._state) 40 | 41 | def reward_function(self, x, y): 42 | return - (x ** 2 + y ** 2) ** 0.5 43 | 44 | def step(self, action): 45 | x, y = self._state 46 
| # compute reward, add penalty for large actions instead of clipping them 47 | x -= self._goal[0] 48 | y -= self._goal[1] 49 | # check if task is complete 50 | done = abs(x) < .01 and abs(y) < .01 51 | reward = self.reward_function(x, y) 52 | # move to next state 53 | self._state = self._state + action 54 | ob = self._get_obs() 55 | return ob, reward, done, dict() 56 | 57 | def viewer_setup(self): 58 | print('no viewer') 59 | pass 60 | 61 | def render(self): 62 | print('current state:', self._state) 63 | 64 | def seed(self, seed): 65 | np.random.seed = seed 66 | -------------------------------------------------------------------------------- /hw5/meta/point_mass_observed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from gym import Env 4 | 5 | 6 | class ObservedPointEnv(Env): 7 | """ 8 | point mass on a 2-D plane 9 | four tasks: move to (-10, -10), (-10, 10), (10, -10), (10, 10) 10 | 11 | Problem 1: augment the observation with a one-hot vector encoding the task ID 12 | - change the dimension of the observation space 13 | - augment the observation with a one-hot vector that encodes the task ID 14 | """ 15 | #====================================================================================# 16 | # ----------PROBLEM 1---------- 17 | #====================================================================================# 18 | # YOUR CODE SOMEWHERE HERE 19 | def __init__(self, num_tasks=1): 20 | self.tasks = [0, 1, 2, 3][:num_tasks] 21 | self.task_idx = -1 22 | self.reset_task() 23 | self.reset() 24 | 25 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,)) 26 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,)) 27 | 28 | def reset_task(self, is_evaluation=False): 29 | # for evaluation, cycle deterministically through all tasks 30 | if is_evaluation: 31 | self.task_idx = (self.task_idx + 1) % len(self.tasks) 32 | # during training, sample tasks randomly 33 | else: 34 | self.task_idx = np.random.randint(len(self.tasks)) 35 | self._task = self.tasks[self.task_idx] 36 | goals = [[-1, -1], [-1, 1], [1, -1], [1, 1]] 37 | self._goal = np.array(goals[self.task_idx])*10 38 | 39 | def reset(self): 40 | self._state = np.array([0, 0], dtype=np.float32) 41 | return self._get_obs() 42 | 43 | def _get_obs(self): 44 | return np.copy(self._state) 45 | 46 | def step(self, action): 47 | x, y = self._state 48 | # compute reward, add penalty for large actions instead of clipping them 49 | x -= self._goal[0] 50 | y -= self._goal[1] 51 | reward = - (x ** 2 + y ** 2) ** 0.5 52 | # check if task is complete 53 | done = abs(x) < 0.01 and abs(y) < 0.01 54 | # move to next state 55 | self._state = self._state + action 56 | ob = self._get_obs() 57 | return ob, reward, done, dict() 58 | 59 | def viewer_setup(self): 60 | print('no viewer') 61 | pass 62 | 63 | def render(self): 64 | print('current state:', self._state) 65 | 66 | def seed(self, seed): 67 | np.random.seed = seed 68 | -------------------------------------------------------------------------------- /hw5/meta/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | ''' 5 | minimalistic replay buffer 6 | 7 | a sample consists of 8 | - observation 9 | - action 10 | - reward 11 | - terminal 12 | - hidden state for recurrent policy 13 | 14 | it is memory inefficient to store windowed observations this way 15 | so do not run on tasks 
with large observations (e.g. from vision) 16 | ''' 17 | 18 | def __init__(self, max_size, ob_dim, ac_dim, hidden_dim, task_dim): 19 | self.max_size = max_size 20 | self.ob_dim = ob_dim 21 | self.ac_dim = ac_dim 22 | self.hidden_dim = hidden_dim 23 | self.task_dim = task_dim 24 | self.flush() 25 | 26 | def flush(self): 27 | ''' 28 | set buffer to empty 29 | ''' 30 | self._observations = np.zeros((self.max_size, *self.ob_dim)) 31 | self._actions = np.zeros((self.max_size, *self.ac_dim)) 32 | self._rewards = np.zeros((self.max_size, 1)) 33 | self._terminals = np.zeros((self.max_size, 1)) 34 | self._hiddens = np.zeros((self.max_size, self.hidden_dim)) 35 | self._tasks = np.zeros((self.max_size, self.task_dim)) 36 | self._top = 0 37 | self._size = 0 38 | 39 | def _advance(self): 40 | ''' 41 | move pointer to top of buffer 42 | if end of buffer is reached, overwrite oldest data 43 | ''' 44 | self._top = (self._top + 1) % self.max_size 45 | if self._size < self.max_size: 46 | self._size += 1 47 | 48 | def add_sample(self, ob, ac, re, te, hi, task): 49 | ''' 50 | add sample to buffer 51 | ''' 52 | self._observations[self._top] = ob 53 | self._actions[self._top] = ac 54 | self._rewards[self._top] = re 55 | self._terminals[self._top] = te 56 | self._hiddens[self._top] = hi 57 | self._tasks[self._top] = task 58 | 59 | self._advance() 60 | 61 | def get_samples(self, indices): 62 | ''' 63 | return buffer data indexed by `indices` 64 | ''' 65 | return dict( 66 | observations=self._observations[indices], 67 | actions=self._actions[indices], 68 | rewards=self._rewards[indices], 69 | terminals=self._terminals[indices], 70 | hiddens=self._hiddens[indices], 71 | tasks=self._tasks[indices], 72 | ) 73 | 74 | def random_batch(self, batch_size): 75 | ''' 76 | return random sample of `batch_size` transitions 77 | ''' 78 | indices = np.random.randint(0, self._size, batch_size) 79 | return self.get_samples(indices) 80 | 81 | def all_batch(self): 82 | ''' 83 | return all data in the buffer 84 | ''' 85 | indices = list(range(self._size)) 86 | return self.get_samples(indices) 87 | 88 | def num_steps_can_sample(self): 89 | return self._size 90 | 91 | 92 | 93 | class PPOReplayBuffer(object): 94 | ''' 95 | replay buffer for PPO algorithm 96 | store fixed log probs, advantages, and returns for use in multiple updates 97 | 98 | n.b. 
samples must be added as a batch, and we assume that the 99 | batch is the same size as that of the simple buffer 100 | ''' 101 | 102 | def __init__(self, simple_buffer): 103 | self.simple_buffer = simple_buffer 104 | self.max_size = self.simple_buffer.max_size 105 | self.flush() 106 | 107 | def flush(self): 108 | self.simple_buffer.flush() 109 | self._log_probs = np.zeros((self.max_size, 1)) 110 | self._advantages = np.zeros((self.max_size, 1)) 111 | self._returns = np.zeros((self.max_size, 1)) 112 | 113 | def add_samples(self, lp, adv, ret): 114 | self._log_probs = lp 115 | self._advantages = adv 116 | self._returns = ret 117 | 118 | def get_samples(self, indices): 119 | return dict( 120 | log_probs = self._log_probs[indices], 121 | advantages = self._advantages[indices], 122 | returns = self._returns[indices], 123 | ) 124 | 125 | def random_batch(self, batch_size): 126 | indices = np.random.randint(0, self.simple_buffer._size, batch_size) 127 | simple = self.simple_buffer.get_samples(indices) 128 | ppo = self.get_samples(indices) 129 | return {**simple, **ppo} 130 | -------------------------------------------------------------------------------- /hw5/meta/requirements.txt: -------------------------------------------------------------------------------- 1 | mujoco-py==1.50.1.56 2 | gym==0.10.5 3 | tensorflow==1.10.0 4 | numpy==1.14.5 5 | scipy==1.1.0 6 | tensorflow-probability==0.3.0 7 | seaborn 8 | Box2D==2.3.2 9 | -------------------------------------------------------------------------------- /hw5/sac/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5b: Soft Actor Critic 2 | Original code from Tuomas Haarnoja, Soroush Nasiriany, and Aurick Zhou for CS294-112 Fall 2018 3 | 4 | Dependencies: 5 | * Python **3.4.5** 6 | * Numpy version **1.15.2** 7 | * TensorFlow version **1.10.0** 8 | * tensorflow-probability version **0.4.0** 9 | * OpenAI Gym version **0.10.8** 10 | * MuJoCo version **1.50** and mujoco-py **1.50.1.59** 11 | * seaborn version **0.9.0** 12 | 13 | You will implement `sac.py`, and `nn.py`. 14 | 15 | See the [HW5 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5b.pdf) for further instructions. 16 | -------------------------------------------------------------------------------- /hw5/sac/environment.yml: -------------------------------------------------------------------------------- 1 | name: hw5-sac 2 | dependencies: 3 | - python==3.4.5 4 | - pip: 5 | - gym==0.10.8 6 | - numpy==1.15.2 7 | - tensorflow==1.10.0 8 | - tensorflow-probability==0.4.0 9 | - mujoco-py==1.50.1.59 10 | - seaborn==0.9.0 11 | -------------------------------------------------------------------------------- /hw5/sac/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, indent=2, separators=(',', ': '), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/sac/nn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras import layers 4 | from tensorflow_probability import distributions 5 | from tensorflow.python import keras 6 | from tensorflow.python.keras.engine.network import Network 7 | 8 | 9 | class QFunction(Network): 10 | def __init__(self, hidden_layer_sizes, **kwargs): 11 | super(QFunction, self).__init__(**kwargs) 12 | self._hidden_layer_sizes = hidden_layer_sizes 13 | 14 | def build(self, input_shape): 15 | inputs = [ 16 | layers.Input(batch_shape=input_shape[0], name='observations'), 17 | layers.Input(batch_shape=input_shape[1], name='actions') 18 | ] 19 | 20 | x = layers.Concatenate(axis=1)(inputs) 21 | for hidden_units in self._hidden_layer_sizes: 22 | x = layers.Dense(hidden_units, activation='relu')(x) 23 | q_values = layers.Dense(1, activation=None)(x) 24 | 25 | self._init_graph_network(inputs, q_values) 26 | super(QFunction, self).build(input_shape) 27 | 28 | 29 | class ValueFunction(Network): 30 | def __init__(self, hidden_layer_sizes, **kwargs): 31 | super(ValueFunction, self).__init__(**kwargs) 32 | self._hidden_layer_sizes = hidden_layer_sizes 33 | 34 | def build(self, input_shape): 35 | inputs = layers.Input(batch_shape=input_shape, name='observations') 36 | 37 | x = inputs 38 | for hidden_units in self._hidden_layer_sizes: 39 | x = layers.Dense(hidden_units, activation='relu')(x) 40 | values = layers.Dense(1, activation=None)(x) 41 | 42 | self._init_graph_network(inputs, values) 43 | super(ValueFunction, self).build(input_shape) 44 | 45 | 46 | class GaussianPolicy(Network): 47 | def __init__(self, action_dim, hidden_layer_sizes, reparameterize, **kwargs): 48 | super(GaussianPolicy, self).__init__(**kwargs) 49 | self._action_dim = action_dim 50 | self._f = None 51 | self._hidden_layer_sizes = hidden_layer_sizes 52 | self._reparameterize = reparameterize 53 | 54 | def 
build(self, input_shape): 55 | inputs = layers.Input(batch_shape=input_shape, name='observations') 56 | 57 | x = inputs 58 | for hidden_units in self._hidden_layer_sizes: 59 | x = layers.Dense(hidden_units, activation='relu')(x) 60 | 61 | mean_and_log_std = layers.Dense( 62 | self._action_dim * 2, activation=None)(x) 63 | 64 | def create_distribution_layer(mean_and_log_std): 65 | mean, log_std = tf.split( 66 | mean_and_log_std, num_or_size_splits=2, axis=1) 67 | log_std = tf.clip_by_value(log_std, -20., 2.) 68 | 69 | distribution = distributions.MultivariateNormalDiag( 70 | loc=mean, 71 | scale_diag=tf.exp(log_std)) 72 | 73 | raw_actions = distribution.sample() 74 | if not self._reparameterize: 75 | ### Problem 1.3.A 76 | ### YOUR CODE HERE 77 | raise NotImplementedError 78 | log_probs = distribution.log_prob(raw_actions) 79 | log_probs -= self._squash_correction(raw_actions) 80 | 81 | actions = None 82 | ### Problem 2.A 83 | ### YOUR CODE HERE 84 | raise NotImplementedError 85 | 86 | return actions, log_probs 87 | 88 | samples, log_probs = layers.Lambda(create_distribution_layer)( 89 | mean_and_log_std) 90 | 91 | self._init_graph_network(inputs=inputs, outputs=[samples, log_probs]) 92 | super(GaussianPolicy, self).build(input_shape) 93 | 94 | def _squash_correction(self, raw_actions): 95 | ### Problem 2.B 96 | ### YOUR CODE HERE 97 | raise NotImplementedError 98 | 99 | def eval(self, observation): 100 | assert self.built and observation.ndim == 1 101 | 102 | if self._f is None: 103 | self._f = keras.backend.function(self.inputs, [self.outputs[0]]) 104 | 105 | action, = self._f([observation[None]]) 106 | return action.flatten() 107 | -------------------------------------------------------------------------------- /hw5/sac/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
--------------------------------------------------------------------------------
/hw5/sac/plot.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import json
5 | import os
6 | 
7 | """
8 | Using the plotter:
9 | 
10 | Call it from the command line, and supply it with logdirs to experiments.
11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10
12 | random seeds. The runner code stored it in the directory structure
13 | 
14 |     data
15 |     L test_EnvName_DateTime
16 |         L  0
17 |             L log.txt
18 |             L params.json
19 |         L  1
20 |             L log.txt
21 |             L params.json
22 |         .
23 |         .
24 |         .
25 |         L  9
26 |             L log.txt
27 |             L params.json
28 | 
29 | To plot learning curves from the experiment, averaged over all random
30 | seeds, call
31 | 
32 |     python plot.py data/test_EnvName_DateTime --value AverageReturn
33 | 
34 | and voila. To see a different statistic, change what you put in for
35 | the keyword --value. You can also enter /multiple/ values, and it will
36 | plot all of them in order.
37 | 
38 | 
39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
40 | a different set of hyperparameters from 'test1', and now you would like
41 | to compare them -- see their learning curves side-by-side. Just call
42 | 
43 |     python plot.py data/test1 data/test2
44 | 
45 | and it will plot them both! They will be given titles in the legend according
46 | to their exp_name parameters. If you want to use custom legend titles, use
47 | the --legend flag and then provide a title for each logdir.
48 | 
49 | """
50 | 
51 | def plot_data(data, value="AverageReturn"):
52 |     if isinstance(data, list):
53 |         data = pd.concat(data, ignore_index=True)
54 | 
55 |     sns.set(style="darkgrid", font_scale=1.5)
56 |     sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
57 |     plt.legend(loc='best').draggable()
58 |     plt.show()
59 | 
60 | 
61 | def get_datasets(fpath, condition=None):
62 |     unit = 0
63 |     datasets = []
64 |     for root, dir, files in os.walk(fpath):
65 |         if 'log.txt' in files:
66 |             param_path = open(os.path.join(root,'params.json'))
67 |             params = json.load(param_path)
68 |             exp_name = params['exp_name']
69 | 
70 |             log_path = os.path.join(root,'log.txt')
71 |             experiment_data = pd.read_table(log_path)
72 | 
73 |             experiment_data.insert(
74 |                 len(experiment_data.columns),
75 |                 'Unit',
76 |                 unit
77 |                 )
78 |             experiment_data.insert(
79 |                 len(experiment_data.columns),
80 |                 'Condition',
81 |                 condition or exp_name
82 |                 )
83 | 
84 |             datasets.append(experiment_data)
85 |             unit += 1
86 | 
87 |     return datasets
88 | 
89 | 
90 | def main():
91 |     import argparse
92 |     parser = argparse.ArgumentParser()
93 |     parser.add_argument('logdir', nargs='*')
94 |     parser.add_argument('--legend', nargs='*')
95 |     parser.add_argument('--value', default='LastEpReturn', nargs='*')
96 |     args = parser.parse_args()
97 | 
98 |     use_legend = False
99 |     if args.legend is not None:
100 |         assert len(args.legend) == len(args.logdir), \
101 |             "Must give a legend title for each set of experiments."
102 |         use_legend = True
103 | 
104 |     data = []
105 |     if use_legend:
106 |         for logdir, legend_title in zip(args.logdir, args.legend):
107 |             data += get_datasets(logdir, legend_title)
108 |     else:
109 |         for logdir in args.logdir:
110 |             data += get_datasets(logdir)
111 | 
112 |     if isinstance(args.value, list):
113 |         values = args.value
114 |     else:
115 |         values = [args.value]
116 |     for value in values:
117 |         plot_data(data, value=value)
118 | 
119 | if __name__ == "__main__":
120 |     main()
121 | 
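The plotter can also be driven programmatically with the same two functions. A minimal sketch, where the log directory names are placeholders for folders produced by train_mujoco.py rather than real paths:

    from plot import get_datasets, plot_data

    data = []
    data += get_datasets('data/sac_HalfCheetah-v2_run1_01-01-2019_00-00-00', condition='run1')
    data += get_datasets('data/sac_HalfCheetah-v2_run2_01-01-2019_00-00-00', condition='run2')

    # Plot two of the statistics that the SAC training loop logs each iteration.
    for value in ['LastEpReturn', 'MaxEpReturn']:
        plot_data(data, value=value)

Each directory is walked recursively, so passing the parent experiment directory picks up every seed's log.txt and averages over them in the plot.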
--------------------------------------------------------------------------------
/hw5/sac/sac.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import time
3 | 
4 | class SAC:
5 |     """Soft Actor-Critic (SAC)
6 |     Original code from Tuomas Haarnoja, Soroush Nasiriany, and Aurick Zhou for CS294-112 Fall 2018
7 | 
8 |     References
9 |     ----------
10 |     [1] Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine, "Soft
11 |         Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning
12 |         with a Stochastic Actor," ICML 2018.
13 |     """
14 | 
15 |     def __init__(self,
16 |                  alpha=1.0,
17 |                  batch_size=256,
18 |                  discount=0.99,
19 |                  epoch_length=1000,
20 |                  learning_rate=3e-3,
21 |                  reparameterize=False,
22 |                  tau=0.01,
23 |                  **kwargs):
24 |         """
25 |         Args:
26 |         """
27 | 
28 |         self._alpha = alpha
29 |         self._batch_size = batch_size
30 |         self._discount = discount
31 |         self._epoch_length = epoch_length
32 |         self._learning_rate = learning_rate
33 |         self._reparameterize = reparameterize
34 |         self._tau = tau
35 | 
36 |         self._training_ops = []
37 | 
38 |     def build(self, env, policy, q_function, q_function2, value_function,
39 |               target_value_function):
40 | 
41 |         self._create_placeholders(env)
42 | 
43 |         policy_loss = self._policy_loss_for(policy, q_function, q_function2, value_function)
44 |         value_function_loss = self._value_function_loss_for(
45 |             policy, q_function, q_function2, value_function)
46 |         q_function_loss = self._q_function_loss_for(q_function,
47 |                                                     target_value_function)
48 |         if q_function2 is not None:
49 |             q_function2_loss = self._q_function_loss_for(q_function2,
50 |                                                          target_value_function)
51 | 
52 |         optimizer = tf.train.AdamOptimizer(
53 |             self._learning_rate, name='optimizer')
54 |         policy_training_op = optimizer.minimize(
55 |             loss=policy_loss, var_list=policy.trainable_variables)
56 |         value_training_op = optimizer.minimize(
57 |             loss=value_function_loss,
58 |             var_list=value_function.trainable_variables)
59 |         q_function_training_op = optimizer.minimize(
60 |             loss=q_function_loss, var_list=q_function.trainable_variables)
61 |         if q_function2 is not None:
62 |             q_function2_training_op = optimizer.minimize(
63 |                 loss=q_function2_loss, var_list=q_function2.trainable_variables)
64 | 
65 |         self._training_ops = [
66 |             policy_training_op, value_training_op, q_function_training_op
67 |         ]
68 |         if q_function2 is not None:
69 |             self._training_ops += [q_function2_training_op]
70 |         self._target_update_ops = self._create_target_update(
71 |             source=value_function, target=target_value_function)
72 | 
73 |         tf.get_default_session().run(tf.global_variables_initializer())
74 | 
75 |     def _create_placeholders(self, env):
76 |         observation_dim = env.observation_space.shape[0]
77 |         action_dim = env.action_space.shape[0]
78 | 
79 |         self._observations_ph = tf.placeholder(
80 |             tf.float32,
81 |             shape=(None, observation_dim),
82 |             name='observation',
83 |         )
84 |         self._next_observations_ph = tf.placeholder(
85 |             tf.float32,
86 |             shape=(None, observation_dim),
87 |             name='next_observation',
88 |         )
89 |         self._actions_ph = tf.placeholder(
90 |             tf.float32,
91 |             shape=(None, action_dim),
92 |             name='actions',
93 |         )
94 |         self._rewards_ph = tf.placeholder(
95 |             tf.float32,
96 |             shape=(None, ),
97 |             name='rewards',
98 |         )
99 |         self._terminals_ph = tf.placeholder(
100 |             tf.float32,
101 |             shape=(None, ),
102 |             name='terminals',
103 |         )
104 | 
105 |     def _policy_loss_for(self, policy, q_function, q_function2, value_function):
106 |         if not self._reparameterize:
107 |             ### Problem 1.3.A
108 |             ### YOUR CODE HERE
109 |             raise NotImplementedError
110 |         else:
111 |             ### Problem 1.3.B
112 |             ### YOUR CODE HERE
113 |             raise NotImplementedError
114 | 
115 |     def _value_function_loss_for(self, policy, q_function, q_function2, value_function):
116 |         ### Problem 1.2.A
117 |         ### YOUR CODE HERE
118 |         raise NotImplementedError
119 | 
120 |     def _q_function_loss_for(self, q_function, target_value_function):
121 |         ### Problem 1.1.A
122 |         ### YOUR CODE HERE
123 |         raise NotImplementedError
124 | 
125 |     def _create_target_update(self, source, target):
126 |         """Create tensorflow operations for updating target value function."""
127 | 
128 |         return [
129 |             tf.assign(target, (1 - self._tau) * target + self._tau * source)
130 |             for target, source in zip(target.trainable_variables, source.
131 |                                       trainable_variables)
132 |         ]
133 | 
134 |     def train(self, sampler, n_epochs=1000):
135 |         """Return a generator that performs RL training.
136 | 
137 |         Args:
138 |             sampler (`utils.Sampler`): Sampler used to step the environment
139 |                 with the current policy and to draw training batches from
140 |                 the replay pool it was initialized with.
141 |             n_epochs (`int`): Number of training epochs; each epoch runs
142 |                 `epoch_length` environment steps with one gradient update
143 |                 per step, then yields the epoch index for logging.
144 |         """
145 |         self._start = time.time()
146 |         for epoch in range(n_epochs):
147 |             for t in range(self._epoch_length):
148 |                 sampler.sample()
149 | 
150 |                 batch = sampler.random_batch(self._batch_size)
151 |                 feed_dict = {
152 |                     self._observations_ph: batch['observations'],
153 |                     self._actions_ph: batch['actions'],
154 |                     self._next_observations_ph: batch['next_observations'],
155 |                     self._rewards_ph: batch['rewards'],
156 |                     self._terminals_ph: batch['terminals'],
157 |                 }
158 |                 tf.get_default_session().run(self._training_ops, feed_dict)
159 |                 tf.get_default_session().run(self._target_update_ops)
160 | 
161 |             yield epoch
162 | 
163 |     def get_statistics(self):
164 |         statistics = {
165 |             'Time': time.time() - self._start,
166 |             'TimestepsThisBatch': self._epoch_length,
167 |         }
168 | 
169 |         return statistics
170 | 
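For orientation, the stubs in _q_function_loss_for, _value_function_loss_for, and _policy_loss_for correspond to the standard SAC objectives from Haarnoja et al. (2018). The sketch below states them over generic callables (q, v, v_target for the networks, policy returning sampled actions and log-probabilities); it is an assumption-laden outline of the single-Q, reparameterized case, not the assignment's reference solution. With two Q-functions, the minimum of the two Q-values is typically used in the value and policy terms, and targets are soft-updated as V_target <- (1 - tau) * V_target + tau * V, exactly as _create_target_update does above.

    import tensorflow as tf

    def sac_losses(obs, actions, next_obs, rewards, terminals,
                   policy, q, v, v_target, alpha=0.2, discount=0.99):
        # Q-function loss: fit Q(s, a) to r + gamma * (1 - done) * V_target(s').
        q_backup = rewards + discount * (1.0 - terminals) * tf.squeeze(v_target(next_obs), axis=1)
        q_loss = tf.reduce_mean(
            (tf.squeeze(q(obs, actions), axis=1) - tf.stop_gradient(q_backup)) ** 2)

        # Value-function loss: fit V(s) to E_pi[Q(s, a~) - alpha * log pi(a~|s)].
        sampled_actions, log_probs = policy(obs)
        v_backup = tf.squeeze(q(obs, sampled_actions), axis=1) - alpha * log_probs
        v_loss = tf.reduce_mean(
            (tf.squeeze(v(obs), axis=1) - tf.stop_gradient(v_backup)) ** 2)

        # Policy loss (reparameterized): minimize E[alpha * log pi(a~|s) - Q(s, a~)].
        policy_loss = tf.reduce_mean(
            alpha * log_probs - tf.squeeze(q(obs, sampled_actions), axis=1))

        return q_loss, v_loss, policy_loss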
--------------------------------------------------------------------------------
/hw5/sac/train_mujoco.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | import logz
4 | import numpy as np
5 | import os
6 | import tensorflow as tf
7 | import time
8 | 
9 | import nn
10 | from sac import SAC
11 | import utils
12 | 
13 | from multiprocessing import Process
14 | 
15 | def train_SAC(env_name, exp_name, seed, logdir):
16 |     alpha = {
17 |         'Ant-v2': 0.1,
18 |         'HalfCheetah-v2': 0.2,
19 |         'Hopper-v2': 0.2,
20 |         'Humanoid-v2': 0.05,
21 |         'Walker2d-v2': 0.2,
22 |     }.get(env_name, 0.2)
23 | 
24 |     algorithm_params = {
25 |         'alpha': alpha,
26 |         'batch_size': 256,
27 |         'discount': 0.99,
28 |         'learning_rate': 1e-3,
29 |         'reparameterize': False,
30 |         'tau': 0.01,
31 |         'epoch_length': 1000,
32 |         'n_epochs': 500,
33 |         'two_qf': False,
34 |     }
35 |     sampler_params = {
36 |         'max_episode_length': 1000,
37 |         'prefill_steps': 1000,
38 |     }
39 |     replay_pool_params = {
40 |         'max_size': 1e6,
41 |     }
42 | 
43 |     value_function_params = {
44 |         'hidden_layer_sizes': (128, 128),
45 |     }
46 | 
47 |     q_function_params = {
48 |         'hidden_layer_sizes': (128, 128),
49 |     }
50 | 
51 |     policy_params = {
52 |         'hidden_layer_sizes': (128, 128),
53 |     }
54 | 
55 |     logz.configure_output_dir(logdir)
56 |     params = {
57 |         'exp_name': exp_name,
58 |         'env_name': env_name,
59 |         'algorithm_params': algorithm_params,
60 |         'sampler_params': sampler_params,
61 |         'replay_pool_params': replay_pool_params,
62 |         'value_function_params': value_function_params,
63 |         'q_function_params': q_function_params,
64 |         'policy_params': policy_params
65 |     }
66 |     logz.save_params(params)
67 | 
68 |     env = gym.envs.make(env_name)
69 |     # Set random seeds
70 |     tf.set_random_seed(seed)
71 |     np.random.seed(seed)
72 |     env.seed(seed)
73 | 
74 |     sampler = utils.SimpleSampler(**sampler_params)
75 |     replay_pool = utils.SimpleReplayPool(
76 |         observation_shape=env.observation_space.shape,
77 |         action_shape=env.action_space.shape,
78 |         **replay_pool_params)
79 | 
80 |     q_function = nn.QFunction(name='q_function', **q_function_params)
81 |     if algorithm_params.get('two_qf', False):
82 |         q_function2 = nn.QFunction(name='q_function2', **q_function_params)
83 |     else:
84 |         q_function2 = None
85 |     value_function = nn.ValueFunction(
86 |         name='value_function', **value_function_params)
87 |     target_value_function = nn.ValueFunction(
88 |         name='target_value_function', **value_function_params)
89 |     policy = nn.GaussianPolicy(
90 |         action_dim=env.action_space.shape[0],
91 |         reparameterize=algorithm_params['reparameterize'],
92 |         **policy_params)
93 | 
94 |     sampler.initialize(env, policy, replay_pool)
95 | 
96 |     algorithm = SAC(**algorithm_params)
97 | 
98 |     tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
99 |     tf_config.gpu_options.allow_growth = True  # may need if using GPU
100 |     with tf.Session(config=tf_config):
101 |         algorithm.build(
102 |             env=env,
103 |             policy=policy,
104 |             q_function=q_function,
105 |             q_function2=q_function2,
106 |             value_function=value_function,
107 |             target_value_function=target_value_function)
108 | 
109 |         for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
110 |             logz.log_tabular('Iteration', epoch)
111 |             for k, v in algorithm.get_statistics().items():
112 |                 logz.log_tabular(k, v)
113 |             for k, v in replay_pool.get_statistics().items():
114 |                 logz.log_tabular(k, v)
115 |             for k, v in sampler.get_statistics().items():
116 |                 logz.log_tabular(k, v)
117 |             logz.dump_tabular()
118 | 
119 | def main():
120 |     parser = argparse.ArgumentParser()
121 |     parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
122 |     parser.add_argument('--exp_name', type=str, default=None)
123 |     parser.add_argument('--seed', type=int, default=1)
124 |     parser.add_argument('--n_experiments', '-e', type=int, default=1)
125 |     args = parser.parse_args()
126 | 
127 |     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
128 | 
129 |     if not (os.path.exists(data_path)):
130 |         os.makedirs(data_path)
131 |     logdir = 'sac_' + args.env_name + '_' + args.exp_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
132 |     logdir = os.path.join(data_path, logdir)
133 | 
134 |     processes = []
135 | 
136 |     for e in range(args.n_experiments):
137 |         seed = args.seed + 10*e
138 |         print('Running experiment with seed %d'%seed)
139 | 
140 |         def train_func():
141 |             train_SAC(
142 |                 env_name=args.env_name,
143 |                 exp_name=args.exp_name,
144 |                 seed=seed,
145 |                 logdir=os.path.join(logdir, '%d' % seed),
146 |                 )
147 |         # # Awkward hacky process runs, because Tensorflow does not like
148 |         # # repeatedly calling train_SAC in the same thread.
149 |         p = Process(target=train_func, args=tuple())
150 |         p.start()
151 |         processes.append(p)
152 |         # if you comment in the line below, then the loop will block
153 |         # until this process finishes
154 |         # p.join()
155 | 
156 |     for p in processes:
157 |         p.join()
158 | 
159 | if __name__ == '__main__':
160 |     main()
161 | 
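From the command line this script is typically launched along the lines of `python train_mujoco.py --env_name HalfCheetah-v2 --exp_name myrun -e 3`, which spawns one process per seed. For a quick single-seed smoke test one might also call train_SAC directly and skip the multiprocessing wrapper; the log directory below is an arbitrary example path, not one the script creates for you:

    from train_mujoco import train_SAC

    train_SAC(
        env_name='HalfCheetah-v2',
        exp_name='smoke_test',
        seed=1,
        logdir='data/sac_HalfCheetah-v2_smoke_test/1',
    )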
--------------------------------------------------------------------------------
/hw5/sac/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import tensorflow as tf
4 | 
5 | 
6 | class Logger:
7 |     def __init__(self, log_dir):
8 |         self._summary_writer = tf.summary.FileWriter(
9 |             os.path.expanduser(log_dir))
10 | 
11 |         self._rows = []
12 | 
13 |     def log_value(self, tag, value, step):
14 |         summary = tf.Summary()
15 |         summary.value.add(tag=tag, simple_value=value)
16 |         self._summary_writer.add_summary(summary, step)
17 | 
18 |         self._rows.append("{tag:.<25} {value}".format(tag=tag, value=value))
19 | 
20 |     def log_values(self, dictionary, step):
21 |         for tag, value in dictionary.items():
22 |             self.log_value(tag, value, step)
23 | 
24 |     def flush(self):
25 |         self._summary_writer.flush()
26 |         print(format("", "_<25"))
27 |         print("\n".join(self._rows))
28 | 
29 |         self._rows = []
30 | 
31 | 
32 | class ReplayPool:
33 |     def __init__(self, max_size, fields):
34 |         max_size = int(max_size)
35 |         self._max_size = max_size
36 | 
37 |         self.fields = {}
38 |         self.field_names = []
39 |         self.add_fields(fields)
40 | 
41 |         self._pointer = 0
42 |         self._size = 0
43 | 
44 |     @property
45 |     def size(self):
46 |         return self._size
47 | 
48 |     def add_fields(self, fields):
49 |         self.fields.update(fields)
50 |         self.field_names += list(fields.keys())
51 | 
52 |         for field_name, field_attrs in fields.items():
53 |             field_shape = [self._max_size] + list(field_attrs['shape'])
54 |             initializer = field_attrs.get('initializer', np.zeros)
55 |             setattr(self, field_name, initializer(field_shape))
56 | 
57 |     def _advance(self, count=1):
58 |         self._pointer = (self._pointer + count) % self._max_size
59 |         self._size = min(self._size + count, self._max_size)
60 | 
61 |     def add_sample(self, **kwargs):
62 |         self.add_samples(1, **kwargs)
63 | 
64 |     def add_samples(self, num_samples=1, **kwargs):
65 |         for field_name in self.field_names:
66 |             idx = np.arange(self._pointer,
67 |                             self._pointer + num_samples) % self._max_size
68 |             getattr(self, field_name)[idx] = kwargs.pop(field_name)
69 | 
70 |         self._advance(num_samples)
71 | 
72 |     def random_indices(self, batch_size):
73 |         if self._size == 0: return []
74 |         return np.random.randint(0, self._size, batch_size)
75 | 
76 |     def random_batch(self, batch_size, field_name_filter=None):
77 |         random_indices = self.random_indices(batch_size)
78 |         return self.batch_by_indices(random_indices, field_name_filter)
79 | 
80 |     def batch_by_indices(self, indices, field_name_filter=None):
81 |         field_names = self.field_names
82 |         if field_name_filter is not None:
83 |             field_names = [
84 |                 field_name for field_name in field_names
85 |                 if field_name_filter(field_name)
86 |             ]
87 | 
88 |         return {
89 |             field_name: getattr(self, field_name)[indices]
90 |             for field_name in field_names
91 |         }
92 | 
93 |     def get_statistics(self):
94 |         return {
95 |             'PoolSize': self._size,
96 |         }
97 | 
98 | 
99 | class SimpleReplayPool(ReplayPool):
100 |     def __init__(self, observation_shape, action_shape, *args, **kwargs):
101 |         self._observation_shape = observation_shape
102 |         self._action_shape = action_shape
103 | 
104 |         fields = {
105 |             'observations': {
106 |                 'shape': self._observation_shape,
107 |                 'dtype': 'float32'
108 |             },
109 |             # It's a bit memory inefficient to save the observations twice,
110 |             # but it makes the code *much* easier since you no longer have
111 |             # to worry about termination conditions.
112 |             'next_observations': {
113 |                 'shape': self._observation_shape,
114 |                 'dtype': 'float32'
115 |             },
116 |             'actions': {
117 |                 'shape': self._action_shape,
118 |                 'dtype': 'float32'
119 |             },
120 |             'rewards': {
121 |                 'shape': [],
122 |                 'dtype': 'float32'
123 |             },
124 |             # self.terminals[i] = a terminal was received at time i
125 |             'terminals': {
126 |                 'shape': [],
127 |                 'dtype': 'bool'
128 |             },
129 |         }
130 | 
131 |         super(SimpleReplayPool, self).__init__(*args, fields=fields, **kwargs)
132 | 
133 | 
134 | class Sampler(object):
135 |     def __init__(self, max_episode_length, prefill_steps):
136 |         self._max_episode_length = max_episode_length
137 |         self._prefill_steps = prefill_steps
138 | 
139 |         self.env = None
140 |         self.policy = None
141 |         self.pool = None
142 | 
143 |     def initialize(self, env, policy, pool):
144 |         self.env = env
145 |         self.policy = policy
146 |         self.pool = pool
147 | 
148 |         class UniformPolicy:
149 |             def __init__(self, action_dim):
150 |                 self._action_dim = action_dim
151 | 
152 |             def eval(self, _):
153 |                 return np.random.uniform(-1, 1, self._action_dim)
154 | 
155 |         uniform_exploration_policy = UniformPolicy(env.action_space.shape[0])
156 |         for _ in range(self._prefill_steps):
157 |             self.sample(uniform_exploration_policy)
158 | 
159 |     def set_policy(self, policy):
160 |         self.policy = policy
161 | 
162 |     def sample(self):
163 |         raise NotImplementedError
164 | 
165 |     def random_batch(self, batch_size):
166 |         return self.pool.random_batch(batch_size)
167 | 
168 |     def terminate(self):
169 |         self.env.terminate()
170 | 
171 | 
172 | class SimpleSampler(Sampler):
173 |     def __init__(self, **kwargs):
174 |         super(SimpleSampler, self).__init__(**kwargs)
175 | 
176 |         self._episode_length = 0
177 |         self._episode_return = 0
178 |         self._last_episode_return = 0
179 |         self._max_episode_return = -np.inf
180 |         self._n_episodes = 0
181 |         self._current_observation = None
182 |         self._total_samples = 0
183 | 
184 |     def sample(self, policy=None):
185 |         policy = self.policy if policy is None else policy
186 |         if self._current_observation is None:
187 |             self._current_observation = self.env.reset()
188 | 
189 |         action = policy.eval(self._current_observation)
190 |         next_observation, reward, terminal, info = self.env.step(action)
191 |         self._episode_length += 1
192 |         self._episode_return += reward
193 |         self._total_samples += 1
194 | 
195 |         self.pool.add_sample(
196 |             observations=self._current_observation,
197 |             actions=action,
198 |             rewards=reward,
199 |             terminals=terminal,
200 |             next_observations=next_observation)
201 | 
202 |         if terminal or self._episode_length >= self._max_episode_length:
203 |             self._current_observation = self.env.reset()
204 |             self._episode_length = 0
205 |             self._max_episode_return = max(self._max_episode_return,
206 |                                            self._episode_return)
207 |             self._last_episode_return = self._episode_return
208 | 
209 |             self._episode_return = 0
210 |             self._n_episodes += 1
211 | 
212 |         else:
213 |             self._current_observation = next_observation
214 | 
215 |     def get_statistics(self):
216 |         statistics = {
217 |             'MaxEpReturn': self._max_episode_return,
218 |             'LastEpReturn': self._last_episode_return,
219 |             'Episodes': self._n_episodes,
220 |             'TimestepsSoFar': self._total_samples,
221 |         }
222 | 
223 |         return statistics
224 | 
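A tiny self-contained illustration of the replay-pool API defined above, assuming this file is importable as utils and using arbitrary example shapes and values:

    import numpy as np
    from utils import SimpleReplayPool  # the class defined above

    pool = SimpleReplayPool(observation_shape=(3,), action_shape=(2,), max_size=5)

    # Add more samples than max_size, so the oldest entries get overwritten in place.
    for t in range(7):
        pool.add_sample(
            observations=np.ones(3) * t,
            actions=np.zeros(2),
            rewards=float(t),
            terminals=False,
            next_observations=np.ones(3) * (t + 1))

    print(pool.size)                    # -> 5, capped at max_size by the ring-buffer pointer
    batch = pool.random_batch(batch_size=4)
    print(sorted(batch.keys()))         # -> ['actions', 'next_observations', 'observations', 'rewards', 'terminals']
    print(batch['observations'].shape)  # -> (4, 3)

SimpleSampler then glues the environment, policy, and pool together: every call to sample() takes one environment step, stores the transition in the pool, and tracks episode returns for the statistics that train_mujoco.py logs.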
--------------------------------------------------------------------------------
/project/project_assignment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/project/project_assignment.pdf
--------------------------------------------------------------------------------