├── .gitignore ├── LICENSE ├── hw1 ├── README.md ├── demo.bash ├── experts │ ├── Ant-v2.pkl │ ├── HalfCheetah-v2.pkl │ ├── Hopper-v2.pkl │ ├── Humanoid-v2.pkl │ ├── Reacher-v2.pkl │ └── Walker2d-v2.pkl ├── load_policy.py ├── requirements.txt ├── run_expert.py └── tf_util.py ├── hw2 ├── README.md ├── hw2_instructions.pdf ├── hw2_instructions.tex ├── logz.py ├── lunar_lander.py ├── plot.py ├── requirements.txt └── train_pg_f18.py ├── hw3 ├── README.md ├── atari_wrappers.py ├── dqn.py ├── dqn_utils.py ├── logz.py ├── lunar_lander.py ├── plot.py ├── requirements.txt ├── run_dqn_atari.py ├── run_dqn_lander.py ├── run_dqn_ram.py └── train_ac_f18.py ├── hw4 ├── .gitignore ├── half_cheetah_env.py ├── logger.py ├── main.py ├── model_based_policy.py ├── model_based_rl.py ├── plot.py ├── requirements.txt ├── run_all.sh ├── tabulate.py ├── timer.py └── utils.py ├── hw5 ├── exp │ ├── README.md │ ├── density_model.py │ ├── ex_utils.py │ ├── exploration.py │ ├── hw5a.pdf │ ├── logz.py │ ├── plot.py │ ├── pointmass.py │ ├── replay.py │ ├── requirements.txt │ ├── run_all.sh │ ├── sparse_half_cheetah.py │ └── train_ac_exploration_f18.py ├── meta │ ├── README.md │ ├── logz.py │ ├── plot.py │ ├── point_mass.py │ ├── point_mass_observed.py │ ├── replay_buffer.py │ ├── requirements.txt │ └── train_policy.py └── sac │ ├── README.md │ ├── environment.yml │ ├── logz.py │ ├── nn.py │ ├── plot.py │ ├── sac.py │ ├── train_mujoco.py │ └── utils.py └── project └── project_assignment.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 berkeleydeeprlcourse 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.5** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * OpenAI Gym version **0.10.5** 9 | 10 | Once Python **3.5** is installed, you can install the remaining dependencies using `pip install -r requirements.txt`. 11 | 12 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 13 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 14 | 15 | **Note**: Students enrolled in the course will receive an email with their MuJoCo activation key. Please do **not** share this key. 16 | 17 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 
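For example, after running the expert you can load the saved roll-out data for behavioral cloning. A minimal sketch (assumes `run_expert.py` has already been run for `Hopper-v2`, so that `expert_data/Hopper-v2.pkl` exists):

```python
import pickle

# Load the roll-out data saved by run_expert.py.
with open('expert_data/Hopper-v2.pkl', 'rb') as f:
    expert_data = pickle.load(f)

obs = expert_data['observations']   # stacked observations from all roll-outs
acts = expert_data['actions']       # corresponding expert actions (note the extra batch dimension)
print(obs.shape, acts.shape)
```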
18 | 19 | In `experts/`, the provided expert policies are: 20 | * Ant-v2.pkl 21 | * HalfCheetah-v2.pkl 22 | * Hopper-v2.pkl 23 | * Humanoid-v2.pkl 24 | * Reacher-v2.pkl 25 | * Walker2d-v2.pkl 26 | 27 | The name of the pickle file corresponds to the name of the gym environment. 28 | -------------------------------------------------------------------------------- /hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v2 Ant-v2 HalfCheetah-v2 Humanoid-v2 Reacher-v2 Walker2d-v2 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /hw1/experts/Ant-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Ant-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/HalfCheetah-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Hopper-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Humanoid-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Reacher-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw1/experts/Walker2d-v2.pkl -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. 
observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | mujoco-py==1.50.1.56 3 | tensorflow 4 | numpy 5 | seaborn 6 | -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 
5 | Example usage: 6 | python run_expert.py experts/Humanoid-v2.pkl Humanoid-v2 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import os 13 | import pickle 14 | import tensorflow as tf 15 | import numpy as np 16 | import tf_util 17 | import gym 18 | import load_policy 19 | 20 | def main(): 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('expert_policy_file', type=str) 24 | parser.add_argument('envname', type=str) 25 | parser.add_argument('--render', action='store_true') 26 | parser.add_argument("--max_timesteps", type=int) 27 | parser.add_argument('--num_rollouts', type=int, default=20, 28 | help='Number of expert roll outs') 29 | args = parser.parse_args() 30 | 31 | print('loading and building expert policy') 32 | policy_fn = load_policy.load_policy(args.expert_policy_file) 33 | print('loaded and built') 34 | 35 | with tf.Session(): 36 | tf_util.initialize() 37 | 38 | import gym 39 | env = gym.make(args.envname) 40 | max_steps = args.max_timesteps or env.spec.timestep_limit 41 | 42 | returns = [] 43 | observations = [] 44 | actions = [] 45 | for i in range(args.num_rollouts): 46 | print('iter', i) 47 | obs = env.reset() 48 | done = False 49 | totalr = 0. 50 | steps = 0 51 | while not done: 52 | action = policy_fn(obs[None,:]) 53 | observations.append(obs) 54 | actions.append(action) 55 | obs, r, done, _ = env.step(action) 56 | totalr += r 57 | steps += 1 58 | if args.render: 59 | env.render() 60 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 61 | if steps >= max_steps: 62 | break 63 | returns.append(totalr) 64 | 65 | print('returns', returns) 66 | print('mean return', np.mean(returns)) 67 | print('std of return', np.std(returns)) 68 | 69 | expert_data = {'observations': np.array(observations), 70 | 'actions': np.array(actions)} 71 | os.makedirs('expert_data', exist_ok=True) # make sure the output directory exists before saving 72 | with open(os.path.join('expert_data', args.envname + '.pkl'), 'wb') as f: 73 | pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL) 74 | 75 | if __name__ == '__main__': 76 | main() 77 | -------------------------------------------------------------------------------- /hw2/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 2: Policy Gradient 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.0** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * OpenAI Gym version **0.10.5** 9 | * seaborn 10 | * Box2D==**2.3.2** 11 | 12 | Before doing anything, first replace `gym/envs/box2d/lunar_lander.py` with the provided `lunar_lander.py` file. 13 | 14 | The only file that you need to look at is `train_pg_f18.py`, which you will implement. 15 | 16 | See the [HW2 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw2.pdf) for further instructions. 17 | -------------------------------------------------------------------------------- /hw2/hw2_instructions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw2/hw2_instructions.pdf -------------------------------------------------------------------------------- /hw2/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging.
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw2/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
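A concrete invocation combining both features might look like this (the directory and
legend names below are illustrative; each --value entry must match a column header in log.txt):

    python plot.py data/pg_small_HalfCheetah data/pg_large_HalfCheetah --legend small_batch large_batch --value AverageReturn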
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw2/requirements.txt: -------------------------------------------------------------------------------- 1 | mujoco-py==1.50.1.56 2 | gym==0.10.5 3 | tensorflow==1.10.0 4 | numpy==1.14.5 5 | seaborn 6 | Box2D==2.3.2 7 | -------------------------------------------------------------------------------- /hw3/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 3: Q-Learning 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.5** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * OpenAI Gym version **0.10.5** 9 | * seaborn 10 | * Box2D==**2.3.2** 11 | * OpenCV 12 | * ffmpeg 13 | 14 | Before doing anything, first replace `gym/envs/box2d/lunar_lander.py` with the provided `lunar_lander.py` file. 15 | 16 | The only files that you need to look at are `dqn.py` and `train_ac_f18.py`, which you will implement. 17 | 18 | See the [HW3 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw3.pdf) for further instructions. 19 | 20 | The starter code was based on an implementation of Q-learning for Atari generously provided by Szymon Sidor from OpenAI. 
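To locate the installed copy of `lunar_lander.py` that you need to replace, one option is a short Python snippet (a sketch; assumes `gym` and `Box2D` are already installed in the active environment):

```python
# Print the path of gym's bundled lunar_lander.py so it can be overwritten with the provided file.
import os
import gym.envs.box2d

print(os.path.join(os.path.dirname(gym.envs.box2d.__file__), 'lunar_lander.py'))
```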
21 | -------------------------------------------------------------------------------- /hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | #import sys 2 | #sys.path.remove('/opt/ros/kinetic/lib/python2.7/dist-packages') 3 | 4 | import cv2 5 | import numpy as np 6 | from collections import deque 7 | import gym 8 | from gym import spaces 9 | 10 | 11 | class NoopResetEnv(gym.Wrapper): 12 | def __init__(self, env=None, noop_max=30): 13 | """Sample initial states by taking random number of no-ops on reset. 14 | No-op is assumed to be action 0. 15 | """ 16 | super(NoopResetEnv, self).__init__(env) 17 | self.noop_max = noop_max 18 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 19 | 20 | def _reset(self): 21 | """ Do no-op action for a number of steps in [1, noop_max].""" 22 | self.env.reset() 23 | noops = np.random.randint(1, self.noop_max + 1) 24 | for _ in range(noops): 25 | obs, _, _, _ = self.env.step(0) 26 | return obs 27 | 28 | class FireResetEnv(gym.Wrapper): 29 | def __init__(self, env=None): 30 | """Take action on reset for environments that are fixed until firing.""" 31 | super(FireResetEnv, self).__init__(env) 32 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 33 | assert len(env.unwrapped.get_action_meanings()) >= 3 34 | 35 | def _reset(self): 36 | self.env.reset() 37 | obs, _, _, _ = self.env.step(1) 38 | obs, _, _, _ = self.env.step(2) 39 | return obs 40 | 41 | class EpisodicLifeEnv(gym.Wrapper): 42 | def __init__(self, env=None): 43 | """Make end-of-life == end-of-episode, but only reset on true game over. 44 | Done by DeepMind for the DQN and co. since it helps value estimation. 45 | """ 46 | super(EpisodicLifeEnv, self).__init__(env) 47 | self.lives = 0 48 | self.was_real_done = True 49 | self.was_real_reset = False 50 | 51 | def _step(self, action): 52 | obs, reward, done, info = self.env.step(action) 53 | self.was_real_done = done 54 | # check current lives, make loss of life terminal, 55 | # then update lives to handle bonus lives 56 | lives = self.env.unwrapped.ale.lives() 57 | if lives < self.lives and lives > 0: 58 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 59 | # so its important to keep lives > 0, so that we only reset once 60 | # the environment advertises done. 61 | done = True 62 | self.lives = lives 63 | return obs, reward, done, info 64 | 65 | def _reset(self): 66 | """Reset only when lives are exhausted. 67 | This way all states are still reachable even though lives are episodic, 68 | and the learner need not know about any of this behind-the-scenes. 
69 | """ 70 | if self.was_real_done: 71 | obs = self.env.reset() 72 | self.was_real_reset = True 73 | else: 74 | # no-op step to advance from terminal/lost life state 75 | obs, _, _, _ = self.env.step(0) 76 | self.was_real_reset = False 77 | self.lives = self.env.unwrapped.ale.lives() 78 | return obs 79 | 80 | class MaxAndSkipEnv(gym.Wrapper): 81 | def __init__(self, env=None, skip=4): 82 | """Return only every `skip`-th frame""" 83 | super(MaxAndSkipEnv, self).__init__(env) 84 | # most recent raw observations (for max pooling across time steps) 85 | self._obs_buffer = deque(maxlen=2) 86 | self._skip = skip 87 | 88 | def _step(self, action): 89 | total_reward = 0.0 90 | done = None 91 | for _ in range(self._skip): 92 | obs, reward, done, info = self.env.step(action) 93 | self._obs_buffer.append(obs) 94 | total_reward += reward 95 | if done: 96 | break 97 | 98 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 99 | 100 | return max_frame, total_reward, done, info 101 | 102 | def _reset(self): 103 | """Clear past frame buffer and init. to first obs. from inner env.""" 104 | self._obs_buffer.clear() 105 | obs = self.env.reset() 106 | self._obs_buffer.append(obs) 107 | return obs 108 | 109 | def _process_frame84(frame): 110 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 111 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 112 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 113 | x_t = resized_screen[18:102, :] 114 | x_t = np.reshape(x_t, [84, 84, 1]) 115 | return x_t.astype(np.uint8) 116 | 117 | class ProcessFrame84(gym.Wrapper): 118 | def __init__(self, env=None): 119 | super(ProcessFrame84, self).__init__(env) 120 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 121 | 122 | def _step(self, action): 123 | obs, reward, done, info = self.env.step(action) 124 | return _process_frame84(obs), reward, done, info 125 | 126 | def _reset(self): 127 | return _process_frame84(self.env.reset()) 128 | 129 | class ClippedRewardsWrapper(gym.Wrapper): 130 | def _step(self, action): 131 | obs, reward, done, info = self.env.step(action) 132 | return obs, np.sign(reward), done, info 133 | 134 | def wrap_deepmind_ram(env): 135 | env = EpisodicLifeEnv(env) 136 | env = NoopResetEnv(env, noop_max=30) 137 | env = MaxAndSkipEnv(env, skip=4) 138 | if 'FIRE' in env.unwrapped.get_action_meanings(): 139 | env = FireResetEnv(env) 140 | env = ClippedRewardsWrapper(env) 141 | return env 142 | 143 | def wrap_deepmind(env): 144 | assert 'NoFrameskip' in env.spec.id 145 | env = EpisodicLifeEnv(env) 146 | env = NoopResetEnv(env, noop_max=30) 147 | env = MaxAndSkipEnv(env, skip=4) 148 | if 'FIRE' in env.unwrapped.get_action_meanings(): 149 | env = FireResetEnv(env) 150 | env = ProcessFrame84(env) 151 | env = ClippedRewardsWrapper(env) 152 | return env 153 | -------------------------------------------------------------------------------- /hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import time 3 | import pickle 4 | import sys 5 | import gym.spaces 6 | import itertools 7 | import numpy as np 8 | import random 9 | import tensorflow as tf 10 | import tensorflow.contrib.layers as layers 11 | from collections import namedtuple 12 | from dqn_utils import * 13 | 14 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 15 | 16 | class QLearner(object): 17 | 18 | def __init__( 19 | self, 20 | env, 21 | q_func, 22 | optimizer_spec, 23 | 
session, 24 | exploration=LinearSchedule(1000000, 0.1), 25 | stopping_criterion=None, 26 | replay_buffer_size=1000000, 27 | batch_size=32, 28 | gamma=0.99, 29 | learning_starts=50000, 30 | learning_freq=4, 31 | frame_history_len=4, 32 | target_update_freq=10000, 33 | grad_norm_clipping=10, 34 | rew_file=None, 35 | double_q=True, 36 | lander=False): 37 | """Run Deep Q-learning algorithm. 38 | 39 | You can specify your own convnet using q_func. 40 | 41 | All schedules are w.r.t. total number of steps taken in the environment. 42 | 43 | Parameters 44 | ---------- 45 | env: gym.Env 46 | gym environment to train on. 47 | q_func: function 48 | Model to use for computing the q function. It should accept the 49 | following named arguments: 50 | img_in: tf.Tensor 51 | tensorflow tensor representing the input image 52 | num_actions: int 53 | number of actions 54 | scope: str 55 | scope in which all the model related variables 56 | should be created 57 | reuse: bool 58 | whether previously created variables should be reused. 59 | optimizer_spec: OptimizerSpec 60 | Specifying the constructor and kwargs, as well as learning rate schedule 61 | for the optimizer 62 | session: tf.Session 63 | tensorflow session to use. 64 | exploration: rl_algs.deepq.utils.schedules.Schedule 65 | schedule for probability of chosing random action. 66 | stopping_criterion: (env, t) -> bool 67 | should return true when it's ok for the RL algorithm to stop. 68 | takes in env and the number of steps executed so far. 69 | replay_buffer_size: int 70 | How many memories to store in the replay buffer. 71 | batch_size: int 72 | How many transitions to sample each time experience is replayed. 73 | gamma: float 74 | Discount Factor 75 | learning_starts: int 76 | After how many environment steps to start replaying experiences 77 | learning_freq: int 78 | How many steps of environment to take between every experience replay 79 | frame_history_len: int 80 | How many past frames to include as input to the model. 81 | target_update_freq: int 82 | How many experience replay rounds (not steps!) to perform between 83 | each update to the target Q network 84 | grad_norm_clipping: float or None 85 | If not None gradients' norms are clipped to this value. 86 | double_q: bool 87 | If True, then use double Q-learning to compute target values. Otherwise, use vanilla DQN. 88 | https://papers.nips.cc/paper/3964-double-q-learning.pdf 89 | """ 90 | assert type(env.observation_space) == gym.spaces.Box 91 | assert type(env.action_space) == gym.spaces.Discrete 92 | 93 | self.target_update_freq = target_update_freq 94 | self.optimizer_spec = optimizer_spec 95 | self.batch_size = batch_size 96 | self.learning_freq = learning_freq 97 | self.learning_starts = learning_starts 98 | self.stopping_criterion = stopping_criterion 99 | self.env = env 100 | self.session = session 101 | self.exploration = exploration 102 | self.rew_file = str(uuid.uuid4()) + '.pkl' if rew_file is None else rew_file 103 | 104 | ############### 105 | # BUILD MODEL # 106 | ############### 107 | 108 | if len(self.env.observation_space.shape) == 1: 109 | # This means we are running on low-dimensional observations (e.g. 
RAM) 110 | input_shape = self.env.observation_space.shape 111 | else: 112 | img_h, img_w, img_c = self.env.observation_space.shape 113 | input_shape = (img_h, img_w, frame_history_len * img_c) 114 | self.num_actions = self.env.action_space.n 115 | 116 | # set up placeholders 117 | # placeholder for current observation (or state) 118 | self.obs_t_ph = tf.placeholder( 119 | tf.float32 if lander else tf.uint8, [None] + list(input_shape)) 120 | # placeholder for current action 121 | self.act_t_ph = tf.placeholder(tf.int32, [None]) 122 | # placeholder for current reward 123 | self.rew_t_ph = tf.placeholder(tf.float32, [None]) 124 | # placeholder for next observation (or state) 125 | self.obs_tp1_ph = tf.placeholder( 126 | tf.float32 if lander else tf.uint8, [None] + list(input_shape)) 127 | # placeholder for end of episode mask 128 | # this value is 1 if the next state corresponds to the end of an episode, 129 | # in which case there is no Q-value at the next state; at the end of an 130 | # episode, only the current state reward contributes to the target, not the 131 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 132 | self.done_mask_ph = tf.placeholder(tf.float32, [None]) 133 | 134 | # casting to float on GPU ensures lower data transfer times. 135 | if lander: 136 | obs_t_float = self.obs_t_ph 137 | obs_tp1_float = self.obs_tp1_ph 138 | else: 139 | obs_t_float = tf.cast(self.obs_t_ph, tf.float32) / 255.0 140 | obs_tp1_float = tf.cast(self.obs_tp1_ph, tf.float32) / 255.0 141 | 142 | # Here, you should fill in your own code to compute the Bellman error. This requires 143 | # evaluating the current and next Q-values and constructing the corresponding error. 144 | # TensorFlow will differentiate this error for you, you just need to pass it to the 145 | # optimizer. See assignment text for details. 146 | # Your code should produce one scalar-valued tensor: total_error 147 | # This will be passed to the optimizer in the provided code below. 148 | # Your code should also produce two collections of variables: 149 | # q_func_vars 150 | # target_q_func_vars 151 | # These should hold all of the variables of the Q-function network and target network, 152 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 
153 | # For example, you can create your Q-function network with the scope "q_func" like this: 154 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 155 | # And then you can obtain the variables like this: 156 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 157 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 158 | # Tip: use huber_loss (from dqn_utils) instead of squared error when defining self.total_error 159 | ###### 160 | 161 | # YOUR CODE HERE 162 | 163 | ###### 164 | 165 | # construct optimization op (with gradient clipping) 166 | self.learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 167 | optimizer = self.optimizer_spec.constructor(learning_rate=self.learning_rate, **self.optimizer_spec.kwargs) 168 | self.train_fn = minimize_and_clip(optimizer, self.total_error, 169 | var_list=q_func_vars, clip_val=grad_norm_clipping) 170 | 171 | # update_target_fn will be called periodically to copy Q network to target Q network 172 | update_target_fn = [] 173 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 174 | sorted(target_q_func_vars, key=lambda v: v.name)): 175 | update_target_fn.append(var_target.assign(var)) 176 | self.update_target_fn = tf.group(*update_target_fn) 177 | 178 | # construct the replay buffer 179 | self.replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len, lander=lander) 180 | self.replay_buffer_idx = None 181 | 182 | ############### 183 | # RUN ENV # 184 | ############### 185 | self.model_initialized = False 186 | self.num_param_updates = 0 187 | self.mean_episode_reward = -float('nan') 188 | self.best_mean_episode_reward = -float('inf') 189 | self.last_obs = self.env.reset() 190 | self.log_every_n_steps = 10000 191 | 192 | self.start_time = None 193 | self.t = 0 194 | 195 | def stopping_criterion_met(self): 196 | return self.stopping_criterion is not None and self.stopping_criterion(self.env, self.t) 197 | 198 | def step_env(self): 199 | ### 2. Step the env and store the transition 200 | # At this point, "self.last_obs" contains the latest observation that was 201 | # recorded from the simulator. Here, your code needs to store this 202 | # observation and its outcome (reward, next observation, etc.) into 203 | # the replay buffer while stepping the simulator forward one step. 204 | # At the end of this block of code, the simulator should have been 205 | # advanced one step, and the replay buffer should contain one more 206 | # transition. 207 | # Specifically, self.last_obs must point to the new latest observation. 208 | # Useful functions you'll need to call: 209 | # obs, reward, done, info = env.step(action) 210 | # this steps the environment forward one step 211 | # obs = env.reset() 212 | # this resets the environment if you reached an episode boundary. 213 | # Don't forget to call env.reset() to get a new observation if done 214 | # is true!! 215 | # Note that you cannot use "self.last_obs" directly as input 216 | # into your network, since it needs to be processed to include context 217 | # from previous frames. You should check out the replay buffer 218 | # implementation in dqn_utils.py to see what functionality the replay 219 | # buffer exposes. 
The replay buffer has a function called 220 | # encode_recent_observation that will take the latest observation 221 | # that you pushed into the buffer and compute the corresponding 222 | # input that should be given to a Q network by appending some 223 | # previous frames. 224 | # Don't forget to include epsilon greedy exploration! 225 | # And remember that the first time you enter this loop, the model 226 | # may not yet have been initialized (but of course, the first step 227 | # might as well be random, since you haven't trained your net...) 228 | 229 | ##### 230 | 231 | # YOUR CODE HERE 232 | 233 | def update_model(self): 234 | ### 3. Perform experience replay and train the network. 235 | # note that this is only done if the replay buffer contains enough samples 236 | # for us to learn something useful -- until then, the model will not be 237 | # initialized and random actions should be taken 238 | if (self.t > self.learning_starts and \ 239 | self.t % self.learning_freq == 0 and \ 240 | self.replay_buffer.can_sample(self.batch_size)): 241 | # Here, you should perform training. Training consists of four steps: 242 | # 3.a: use the replay buffer to sample a batch of transitions (see the 243 | # replay buffer code for function definition, each batch that you sample 244 | # should consist of current observations, current actions, rewards, 245 | # next observations, and done indicator). 246 | # 3.b: initialize the model if it has not been initialized yet; to do 247 | # that, call 248 | # initialize_interdependent_variables(self.session, tf.global_variables(), { 249 | # self.obs_t_ph: obs_t_batch, 250 | # self.obs_tp1_ph: obs_tp1_batch, 251 | # }) 252 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 253 | # the current and next time step. The boolean variable model_initialized 254 | # indicates whether or not the model has been initialized. 255 | # Remember that you have to update the target network too (see 3.d)! 256 | # 3.c: train the model. To do this, you'll need to use the self.train_fn and 257 | # self.total_error ops that were created earlier: self.total_error is what you 258 | # created to compute the total Bellman error in a batch, and self.train_fn 259 | # will actually perform a gradient step and update the network parameters 260 | # to reduce total_error. 
When calling self.session.run on these you'll need to 261 | # populate the following placeholders: 262 | # self.obs_t_ph 263 | # self.act_t_ph 264 | # self.rew_t_ph 265 | # self.obs_tp1_ph 266 | # self.done_mask_ph 267 | # (this is needed for computing self.total_error) 268 | # self.learning_rate -- you can get this from self.optimizer_spec.lr_schedule.value(t) 269 | # (this is needed by the optimizer to choose the learning rate) 270 | # 3.d: periodically update the target network by calling 271 | # self.session.run(self.update_target_fn) 272 | # you should update every target_update_freq steps, and you may find the 273 | # variable self.num_param_updates useful for this (it was initialized to 0) 274 | ##### 275 | 276 | # YOUR CODE HERE 277 | 278 | self.num_param_updates += 1 279 | 280 | self.t += 1 281 | 282 | def log_progress(self): 283 | episode_rewards = get_wrapper_by_name(self.env, "Monitor").get_episode_rewards() 284 | 285 | if len(episode_rewards) > 0: 286 | self.mean_episode_reward = np.mean(episode_rewards[-100:]) 287 | 288 | if len(episode_rewards) > 100: 289 | self.best_mean_episode_reward = max(self.best_mean_episode_reward, self.mean_episode_reward) 290 | 291 | if self.t % self.log_every_n_steps == 0 and self.model_initialized: 292 | print("Timestep %d" % (self.t,)) 293 | print("mean reward (100 episodes) %f" % self.mean_episode_reward) 294 | print("best mean reward %f" % self.best_mean_episode_reward) 295 | print("episodes %d" % len(episode_rewards)) 296 | print("exploration %f" % self.exploration.value(self.t)) 297 | print("learning_rate %f" % self.optimizer_spec.lr_schedule.value(self.t)) 298 | if self.start_time is not None: 299 | print("running time %f" % ((time.time() - self.start_time) / 60.)) 300 | 301 | self.start_time = time.time() 302 | 303 | sys.stdout.flush() 304 | 305 | with open(self.rew_file, 'wb') as f: 306 | pickle.dump(episode_rewards, f, pickle.HIGHEST_PROTOCOL) 307 | 308 | def learn(*args, **kwargs): 309 | alg = QLearner(*args, **kwargs) 310 | while not alg.stopping_criterion_met(): 311 | alg.step_env() 312 | # at this point, the environment should have been advanced one step (and 313 | # reset if done was true), and self.last_obs should point to the new latest 314 | # observation 315 | alg.update_model() 316 | alg.log_progress() 317 | 318 | -------------------------------------------------------------------------------- /hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.where( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 
35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 
125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimize `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensuring the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happen if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or external precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(env, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len, lander=False): 176 | """This is a memory-efficient implementation of the replay buffer. 177 | 178 | The specific memory optimizations used here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (actually it is most time-efficient 182 | to cast them back to float32 on the GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the typical Atari Deep RL use case of a buffer with 1M frames, the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning a frame of zeros at the beginning 190 | of the episode, when there are fewer frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of memories to be retrieved for each observation.
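Example
-------
A minimal usage sketch (illustrative only; `frame` is an (84, 84, 1) uint8
array and `action`, `reward`, `done` come from stepping the wrapped env):

    buf = ReplayBuffer(size=100000, frame_history_len=4)
    idx = buf.store_frame(frame)
    q_input = buf.encode_recent_observation()  # (84, 84, 4); zero-padded early in an episode
    buf.store_effect(idx, action, reward, done)
    if buf.can_sample(32):
        obs_b, act_b, rew_b, next_obs_b, done_mask = buf.sample(32)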
200 | """ 201 | self.lander = lander 202 | 203 | self.size = size 204 | self.frame_history_len = frame_history_len 205 | 206 | self.next_idx = 0 207 | self.num_in_buffer = 0 208 | 209 | self.obs = None 210 | self.action = None 211 | self.reward = None 212 | self.done = None 213 | 214 | def can_sample(self, batch_size): 215 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 216 | return batch_size + 1 <= self.num_in_buffer 217 | 218 | def _encode_sample(self, idxes): 219 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 220 | act_batch = self.action[idxes] 221 | rew_batch = self.reward[idxes] 222 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 223 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 224 | 225 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 226 | 227 | 228 | def sample(self, batch_size): 229 | """Sample `batch_size` different transitions. 230 | 231 | i-th sample transition is the following: 232 | 233 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 234 | after which reward `rew_batch[i]` was received and subsequent 235 | observation next_obs_batch[i] was observed, unless the epsiode 236 | was done which is represented by `done_mask[i]` which is equal 237 | to 1 if episode has ended as a result of that action. 238 | 239 | Parameters 240 | ---------- 241 | batch_size: int 242 | How many transitions to sample. 243 | 244 | Returns 245 | ------- 246 | obs_batch: np.array 247 | Array of shape 248 | (batch_size, img_h, img_w, img_c * frame_history_len) 249 | and dtype np.uint8 250 | act_batch: np.array 251 | Array of shape (batch_size,) and dtype np.int32 252 | rew_batch: np.array 253 | Array of shape (batch_size,) and dtype np.float32 254 | next_obs_batch: np.array 255 | Array of shape 256 | (batch_size, img_h, img_w, img_c * frame_history_len) 257 | and dtype np.uint8 258 | done_mask: np.array 259 | Array of shape (batch_size,) and dtype np.float32 260 | """ 261 | assert self.can_sample(batch_size) 262 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 263 | return self._encode_sample(idxes) 264 | 265 | def encode_recent_observation(self): 266 | """Return the most recent `frame_history_len` frames. 267 | 268 | Returns 269 | ------- 270 | observation: np.array 271 | Array of shape (img_h, img_w, img_c * frame_history_len) 272 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 273 | encodes frame at time `t - frame_history_len + i` 274 | """ 275 | assert self.num_in_buffer > 0 276 | return self._encode_observation((self.next_idx - 1) % self.size) 277 | 278 | def _encode_observation(self, idx): 279 | end_idx = idx + 1 # make noninclusive 280 | start_idx = end_idx - self.frame_history_len 281 | # this checks if we are using low-dimensional observations, such as RAM 282 | # state, in which case we just directly return the latest RAM. 
283 | if len(self.obs.shape) == 2: 284 | return self.obs[end_idx-1] 285 | # if there weren't enough frames ever in the buffer for context 286 | if start_idx < 0 and self.num_in_buffer != self.size: 287 | start_idx = 0 288 | for idx in range(start_idx, end_idx - 1): 289 | if self.done[idx % self.size]: 290 | start_idx = idx + 1 291 | missing_context = self.frame_history_len - (end_idx - start_idx) 292 | # if zero padding is needed for missing context 293 | # or we are on the boundry of the buffer 294 | if start_idx < 0 or missing_context > 0: 295 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 296 | for idx in range(start_idx, end_idx): 297 | frames.append(self.obs[idx % self.size]) 298 | return np.concatenate(frames, 2) 299 | else: 300 | # this optimization has potential to saves about 30% compute time \o/ 301 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 302 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 303 | 304 | def store_frame(self, frame): 305 | """Store a single frame in the buffer at the next available index, overwriting 306 | old frames if necessary. 307 | 308 | Parameters 309 | ---------- 310 | frame: np.array 311 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 312 | the frame to be stored 313 | 314 | Returns 315 | ------- 316 | idx: int 317 | Index at which the frame is stored. To be used for `store_effect` later. 318 | """ 319 | if self.obs is None: 320 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.float32 if self.lander else np.uint8) 321 | self.action = np.empty([self.size], dtype=np.int32) 322 | self.reward = np.empty([self.size], dtype=np.float32) 323 | self.done = np.empty([self.size], dtype=np.bool) 324 | self.obs[self.next_idx] = frame 325 | 326 | ret = self.next_idx 327 | self.next_idx = (self.next_idx + 1) % self.size 328 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 329 | 330 | return ret 331 | 332 | def store_effect(self, idx, action, reward, done): 333 | """Store effects of action taken after obeserving frame stored 334 | at index idx. The reason `store_frame` and `store_effect` is broken 335 | up into two functions is so that once can call `encode_recent_observation` 336 | in between. 337 | 338 | Paramters 339 | --------- 340 | idx: int 341 | Index in buffer of recently observed frame (returned by `store_frame`). 342 | action: int 343 | Action that was performed upon observing this frame. 344 | reward: float 345 | Reward that was received when the actions was performed. 346 | done: bool 347 | True if episode was finished after performing that action. 348 | """ 349 | self.action[idx] = action 350 | self.reward[idx] = reward 351 | self.done[idx] = done 352 | 353 | -------------------------------------------------------------------------------- /hw3/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key
70 | G.log_current_row[key] = val
71 |
72 | def save_params(params):
73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out:
74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True))
75 |
76 | def pickle_tf_vars():
77 | """
78 | Saves TensorFlow variables
79 | Requires them to be initialized first; a default session must also exist
80 | """
81 | _dict = {v.name : v.eval() for v in tf.global_variables()}
82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f:
83 | pickle.dump(_dict, f)
84 |
85 |
86 | def dump_tabular():
87 | """
88 | Write all of the diagnostics from the current iteration
89 | """
90 | vals = []
91 | key_lens = [len(key) for key in G.log_headers]
92 | max_key_len = max(15,max(key_lens))
93 | keystr = '%'+'%d'%max_key_len
94 | fmt = "| " + keystr + "s | %15s |"
95 | n_slashes = 22 + max_key_len
96 | print("-"*n_slashes)
97 | for key in G.log_headers:
98 | val = G.log_current_row.get(key, "")
99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val
100 | else: valstr = val
101 | print(fmt%(key, valstr))
102 | vals.append(val)
103 | print("-"*n_slashes)
104 | if G.output_file is not None:
105 | if G.first_row:
106 | G.output_file.write("\t".join(G.log_headers))
107 | G.output_file.write("\n")
108 | G.output_file.write("\t".join(map(str,vals)))
109 | G.output_file.write("\n")
110 | G.output_file.flush()
111 | G.log_current_row.clear()
112 | G.first_row=False
113 |
-------------------------------------------------------------------------------- /hw3/plot.py: --------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import json
5 | import os
6 |
7 | """
8 | Using the plotter:
9 |
10 | Call it from the command line, and supply it with logdirs to experiments.
11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10
12 | random seeds. The runner code stored it in the directory structure
13 |
14 | data
15 | L test_EnvName_DateTime
16 | L 0
17 | L log.txt
18 | L params.json
19 | L 1
20 | L log.txt
21 | L params.json
22 | .
23 | .
24 | .
25 | L 9
26 | L log.txt
27 | L params.json
28 |
29 | To plot learning curves from the experiment, averaged over all random
30 | seeds, call
31 |
32 | python plot.py data/test_EnvName_DateTime --value AverageReturn
33 |
34 | and voila. To see different statistics, change what you put in for
35 | the keyword --value. You can also enter /multiple/ values, and it will
36 | plot all of them in order.
37 |
38 |
39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
40 | a different set of hyperparameters from 'test1', and now you would like
41 | to compare them -- see their learning curves side-by-side. Just call
42 |
43 | python plot.py data/test1 data/test2
44 |
45 | and it will plot them both! They will be given titles in the legend according
46 | to their exp_name parameters. If you want to use custom legend titles, use
47 | the --legend flag and then provide a title for each logdir.
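For example (the directory names and legend titles below are placeholders --
substitute whatever your own runs are called):

    python plot.py data/test1_EnvName_DateTime data/test2_EnvName_DateTime \
        --legend run1 run2 --value AverageReturn

plots the AverageReturn curves of both experiments in one figure, with the
curves labeled 'run1' and 'run2'. When passing --legend, provide exactly one
title per logdir.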
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw3/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | gym[atari] 3 | box2d 4 | mujoco-py==1.50.1.56 5 | tensorflow 6 | numpy 7 | seaborn 8 | opencv-python 9 | -------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, 
activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env=env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10, 78 | double_q=True 79 | ) 80 | env.close() 81 | 82 | def get_available_gpus(): 83 | from tensorflow.python.client import device_lib 84 | local_device_protos = device_lib.list_local_devices() 85 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 86 | 87 | def set_global_seeds(i): 88 | try: 89 | import tensorflow as tf 90 | except ImportError: 91 | pass 92 | else: 93 | tf.set_random_seed(i) 94 | np.random.seed(i) 95 | random.seed(i) 96 | 97 | def get_session(): 98 | tf.reset_default_graph() 99 | tf_config = tf.ConfigProto( 100 | inter_op_parallelism_threads=1, 101 | intra_op_parallelism_threads=1) 102 | session = tf.Session(config=tf_config) 103 | print("AVAILABLE GPUS: ", get_available_gpus()) 104 | return session 105 | 106 | def get_env(task, seed): 107 | env = gym.make('PongNoFrameskip-v4') 108 | 109 | set_global_seeds(seed) 110 | env.seed(seed) 111 | 112 | expt_dir = '/tmp/hw3_vid_dir2/' 113 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 114 | env = wrap_deepmind(env) 115 | 116 | return env 117 | 118 | def main(): 119 | # Get Atari games. 
120 | task = gym.make('PongNoFrameskip-v4') 121 | 122 | # Run training 123 | seed = random.randint(0, 9999) 124 | print('random seed = %d' % seed) 125 | env = get_env(task, seed) 126 | session = get_session() 127 | atari_learn(env, session, num_timesteps=2e8) 128 | 129 | if __name__ == "__main__": 130 | main() 131 | -------------------------------------------------------------------------------- /hw3/run_dqn_lander.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | 13 | def lander_model(obs, num_actions, scope, reuse=False): 14 | with tf.variable_scope(scope, reuse=reuse): 15 | out = obs 16 | with tf.variable_scope("action_value"): 17 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 18 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 19 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 20 | 21 | return out 22 | 23 | def lander_optimizer(): 24 | return dqn.OptimizerSpec( 25 | constructor=tf.train.AdamOptimizer, 26 | lr_schedule=ConstantSchedule(1e-3), 27 | kwargs={} 28 | ) 29 | 30 | def lander_stopping_criterion(num_timesteps): 31 | def stopping_criterion(env, t): 32 | # notice that here t is the number of steps of the wrapped env, 33 | # which is different from the number of steps in the underlying env 34 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 35 | return stopping_criterion 36 | 37 | def lander_exploration_schedule(num_timesteps): 38 | return PiecewiseSchedule( 39 | [ 40 | (0, 1), 41 | (num_timesteps * 0.1, 0.02), 42 | ], outside_value=0.02 43 | ) 44 | 45 | def lander_kwargs(): 46 | return { 47 | 'optimizer_spec': lander_optimizer(), 48 | 'q_func': lander_model, 49 | 'replay_buffer_size': 50000, 50 | 'batch_size': 32, 51 | 'gamma': 1.00, 52 | 'learning_starts': 1000, 53 | 'learning_freq': 1, 54 | 'frame_history_len': 1, 55 | 'target_update_freq': 3000, 56 | 'grad_norm_clipping': 10, 57 | 'lander': True 58 | } 59 | 60 | def lander_learn(env, 61 | session, 62 | num_timesteps, 63 | seed): 64 | 65 | optimizer = lander_optimizer() 66 | stopping_criterion = lander_stopping_criterion(num_timesteps) 67 | exploration_schedule = lander_exploration_schedule(num_timesteps) 68 | 69 | dqn.learn( 70 | env=env, 71 | session=session, 72 | exploration=lander_exploration_schedule(num_timesteps), 73 | stopping_criterion=lander_stopping_criterion(num_timesteps), 74 | double_q=True, 75 | **lander_kwargs() 76 | ) 77 | env.close() 78 | 79 | def set_global_seeds(i): 80 | tf.set_random_seed(i) 81 | np.random.seed(i) 82 | random.seed(i) 83 | 84 | def get_session(): 85 | tf.reset_default_graph() 86 | tf_config = tf.ConfigProto( 87 | inter_op_parallelism_threads=1, 88 | intra_op_parallelism_threads=1, 89 | device_count={'GPU': 0}) 90 | # GPUs don't significantly speed up deep Q-learning for lunar lander, 91 | # since the observations are low-dimensional 92 | session = tf.Session(config=tf_config) 93 | return session 94 | 95 | def get_env(seed): 96 | env = gym.make('LunarLander-v2') 97 | 98 | set_global_seeds(seed) 99 | env.seed(seed) 100 | 101 | expt_dir = '/tmp/hw3_vid_dir/' 102 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 103 | 104 | return env 105 | 106 | def main(): 107 | # 
Run training 108 | seed = 4565 # you may want to randomize this 109 | print('random seed = %d' % seed) 110 | env = get_env(seed) 111 | session = get_session() 112 | set_global_seeds(seed) 113 | lander_learn(env, session, num_timesteps=500000, seed=seed) 114 | 115 | if __name__ == "__main__": 116 | main() 117 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return 
session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw4/.gitignore: -------------------------------------------------------------------------------- 1 | plots/ 2 | data/ 3 | -------------------------------------------------------------------------------- /hw4/half_cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, action): 12 | xposbefore = self.sim.data.qpos[0] 13 | self.do_simulation(action, self.frame_skip) 14 | xposafter = self.sim.data.qpos[0] 15 | ob = self._get_obs() 16 | reward_ctrl = - 0.1 * np.square(action).sum() 17 | reward_run = (xposafter - xposbefore)/self.dt 18 | reward = reward_ctrl + reward_run 19 | done = False 20 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 21 | 22 | def _get_obs(self): 23 | return np.concatenate([ 24 | self.sim.data.qpos.flat[1:], 25 | self.sim.data.qvel.flat, 26 | self.get_body_com("torso").flat, 27 | # self.get_body_comvel("torso").flat, 28 | ]) 29 | 30 | def reset_model(self): 31 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 32 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 33 | self.set_state(qpos, qvel) 34 | return self._get_obs() 35 | 36 | def viewer_setup(self): 37 | self.viewer.cam.distance = self.model.stat.extent * 0.5 38 | 39 | @staticmethod 40 | def cost_fn(states, actions, next_states): 41 | is_tf = tf.contrib.framework.is_tensor(states) 42 | is_single_state = (len(states.get_shape()) == 1) if is_tf else (len(states.shape) == 1) 43 | 44 | if is_single_state: 45 | states = states[None, ...] 46 | actions = actions[None, ...] 47 | next_states = next_states[None, ...] 
48 | 49 | scores = tf.zeros(actions.get_shape()[0].value) if is_tf else np.zeros(actions.shape[0]) 50 | 51 | heading_penalty_factor = 10 52 | 53 | # dont move front shin back so far that you tilt forward 54 | front_leg = states[:, 5] 55 | my_range = 0.2 56 | if is_tf: 57 | scores += tf.cast(front_leg >= my_range, tf.float32) * heading_penalty_factor 58 | else: 59 | scores += (front_leg >= my_range) * heading_penalty_factor 60 | 61 | front_shin = states[:, 6] 62 | my_range = 0 63 | if is_tf: 64 | scores += tf.cast(front_shin >= my_range, tf.float32) * heading_penalty_factor 65 | else: 66 | scores += (front_shin >= my_range) * heading_penalty_factor 67 | 68 | front_foot = states[:, 7] 69 | my_range = 0 70 | if is_tf: 71 | scores += tf.cast(front_foot >= my_range, tf.float32) * heading_penalty_factor 72 | else: 73 | scores += (front_foot >= my_range) * heading_penalty_factor 74 | 75 | scores -= (next_states[:, 17] - states[:, 17]) / 0.01 76 | 77 | if is_single_state: 78 | scores = scores[0] 79 | 80 | return scores 81 | -------------------------------------------------------------------------------- /hw4/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | import logging 4 | from colorlog import ColoredFormatter 5 | 6 | import pandas 7 | import numpy as np 8 | 9 | from tabulate import tabulate 10 | 11 | 12 | class LoggerClass(object): 13 | GLOBAL_LOGGER_NAME = '_global_logger' 14 | 15 | _color_formatter = ColoredFormatter( 16 | "%(asctime)s %(log_color)s%(name)-10s %(levelname)-8s%(reset)s %(white)s%(message)s", 17 | datefmt='%m-%d %H:%M:%S', 18 | reset=True, 19 | log_colors={ 20 | 'DEBUG': 'cyan', 21 | 'INFO': 'green', 22 | 'WARNING': 'yellow', 23 | 'ERROR': 'red', 24 | 'CRITICAL': 'red,bg_white', 25 | }, 26 | secondary_log_colors={}, 27 | style='%' 28 | ) 29 | 30 | _normal_formatter = logging.Formatter( 31 | '%(asctime)s %(name)-10s %(levelname)-8s %(message)s', 32 | datefmt='%m-%d %H:%M:%S', 33 | style='%' 34 | ) 35 | 36 | def __init__(self): 37 | self._dir = None 38 | self._logger = None 39 | self._log_path = None 40 | self._csv_path = None 41 | self._tabular = defaultdict(list) 42 | self._curr_recorded = list() 43 | self._num_dump_tabular_calls = 0 44 | 45 | @property 46 | def dir(self): 47 | return self._dir 48 | 49 | ############# 50 | ### Setup ### 51 | ############# 52 | 53 | def setup(self, display_name, log_path, lvl): 54 | self._dir = os.path.dirname(log_path) 55 | self._logger = self._get_logger(LoggerClass.GLOBAL_LOGGER_NAME, 56 | log_path, 57 | lvl=lvl, 58 | display_name=display_name) 59 | self._csv_path = os.path.splitext(log_path)[0] + '.csv' 60 | 61 | ### load csv if exists 62 | if os.path.exists(self._csv_path): 63 | self._tabular = {k: list(v) for k, v in pandas.read_csv(self._csv_path).items()} 64 | self._num_dump_tabular_calls = len(tuple(self._tabular.values())[0]) 65 | 66 | def _get_logger(self, name, log_path, lvl=logging.INFO, display_name=None): 67 | if isinstance(lvl, str): 68 | lvl = lvl.lower().strip() 69 | if lvl == 'debug': 70 | lvl = logging.DEBUG 71 | elif lvl == 'info': 72 | lvl = logging.INFO 73 | elif lvl == 'warn' or lvl == 'warning': 74 | lvl = logging.WARN 75 | elif lvl == 'error': 76 | lvl = logging.ERROR 77 | elif lvl == 'fatal' or lvl == 'critical': 78 | lvl = logging.CRITICAL 79 | else: 80 | raise ValueError('unknown logging level') 81 | 82 | file_handler = logging.FileHandler(log_path) 83 | file_handler.setLevel(logging.DEBUG) 84 | 
file_handler.setFormatter(LoggerClass._normal_formatter) 85 | console_handler = logging.StreamHandler() 86 | console_handler.setLevel(lvl) 87 | console_handler.setFormatter(LoggerClass._color_formatter) 88 | if display_name is None: 89 | display_name = name 90 | logger = logging.getLogger(display_name) 91 | logger.setLevel(logging.DEBUG) 92 | logger.addHandler(console_handler) 93 | logger.addHandler(file_handler) 94 | 95 | return logger 96 | 97 | ############### 98 | ### Logging ### 99 | ############### 100 | 101 | def debug(self, s): 102 | assert (self._logger is not None) 103 | self._logger.debug(s) 104 | 105 | def info(self, s): 106 | assert (self._logger is not None) 107 | self._logger.info(s) 108 | 109 | def warn(self, s): 110 | assert (self._logger is not None) 111 | self._logger.warn(s) 112 | 113 | def error(self, s): 114 | assert (self._logger is not None) 115 | self._logger.error(s) 116 | 117 | def critical(self, s): 118 | assert (self._logger is not None) 119 | self._logger.critical(s) 120 | 121 | #################### 122 | ### Data logging ### 123 | #################### 124 | 125 | def record_tabular(self, key, val): 126 | assert (str(key) not in self._curr_recorded) 127 | self._curr_recorded.append(str(key)) 128 | 129 | if key in self._tabular: 130 | self._tabular[key].append(val) 131 | else: 132 | self._tabular[key] = [np.nan] * self._num_dump_tabular_calls + [val] 133 | 134 | def dump_tabular(self, print_func=None): 135 | if len(self._curr_recorded) == 0: 136 | return '' 137 | 138 | ### reset 139 | self._curr_recorded = list() 140 | self._num_dump_tabular_calls += 1 141 | 142 | ### make sure all same length 143 | for k, v in self._tabular.items(): 144 | if len(v) == self._num_dump_tabular_calls: 145 | pass 146 | elif len(v) == self._num_dump_tabular_calls - 1: 147 | self._tabular[k].append(np.nan) 148 | else: 149 | raise ValueError('key {0} should not have {1} items when {2} calls have been made'.format( 150 | k, len(v), self._num_dump_tabular_calls)) 151 | 152 | ### print 153 | if print_func is not None: 154 | log_str = tabulate(sorted([(k, v[-1]) for k, v in self._tabular.items()], key=lambda kv: kv[0])) 155 | for line in log_str.split('\n'): 156 | print_func(line) 157 | 158 | ### write to file 159 | tabular_pandas = pandas.DataFrame({k: pandas.Series(v) for k, v in self._tabular.items()}) 160 | tabular_pandas.to_csv(self._csv_path) 161 | 162 | 163 | logger = LoggerClass() 164 | -------------------------------------------------------------------------------- /hw4/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import time 4 | 5 | from half_cheetah_env import HalfCheetahEnv 6 | from logger import logger 7 | from model_based_rl import ModelBasedRL 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('question', type=str, choices=('q1, q2, q3')) 11 | parser.add_argument('--exp_name', type=str, default=None) 12 | parser.add_argument('--env', type=str, default='HalfCheetah', choices=('HalfCheetah',)) 13 | parser.add_argument('--render', action='store_true') 14 | parser.add_argument('--mpc_horizon', type=int, default=15) 15 | parser.add_argument('--num_random_action_selection', type=int, default=4096) 16 | parser.add_argument('--nn_layers', type=int, default=1) 17 | args = parser.parse_args() 18 | 19 | data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data') 20 | exp_name = '{0}_{1}_{2}'.format(args.env, 21 | args.question, 22 | args.exp_name if args.exp_name else 
time.strftime("%d-%m-%Y_%H-%M-%S")) 23 | exp_dir = os.path.join(data_dir, exp_name) 24 | assert not os.path.exists(exp_dir),\ 25 | 'Experiment directory {0} already exists. Either delete the directory, or run the experiment with a different name'.format(exp_dir) 26 | os.makedirs(exp_dir, exist_ok=True) 27 | logger.setup(exp_name, os.path.join(exp_dir, 'log.txt'), 'debug') 28 | 29 | env = { 30 | 'HalfCheetah': HalfCheetahEnv() 31 | }[args.env] 32 | 33 | mbrl = ModelBasedRL(env=env, 34 | render=args.render, 35 | mpc_horizon=args.mpc_horizon, 36 | num_random_action_selection=args.num_random_action_selection, 37 | nn_layers=args.nn_layers) 38 | 39 | run_func = { 40 | 'q1': mbrl.run_q1, 41 | 'q2': mbrl.run_q2, 42 | 'q3': mbrl.run_q3 43 | }[args.question] 44 | run_func() 45 | -------------------------------------------------------------------------------- /hw4/model_based_policy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | import utils 5 | 6 | 7 | class ModelBasedPolicy(object): 8 | 9 | def __init__(self, 10 | env, 11 | init_dataset, 12 | horizon=15, 13 | num_random_action_selection=4096, 14 | nn_layers=1): 15 | self._cost_fn = env.cost_fn 16 | self._state_dim = env.observation_space.shape[0] 17 | self._action_dim = env.action_space.shape[0] 18 | self._action_space_low = env.action_space.low 19 | self._action_space_high = env.action_space.high 20 | self._init_dataset = init_dataset 21 | self._horizon = horizon 22 | self._num_random_action_selection = num_random_action_selection 23 | self._nn_layers = nn_layers 24 | self._learning_rate = 1e-3 25 | 26 | self._sess, self._state_ph, self._action_ph, self._next_state_ph,\ 27 | self._next_state_pred, self._loss, self._optimizer, self._best_action = self._setup_graph() 28 | 29 | def _setup_placeholders(self): 30 | """ 31 | Creates the placeholders used for training, prediction, and action selection 32 | 33 | returns: 34 | state_ph: current state 35 | action_ph: current_action 36 | next_state_ph: next state 37 | 38 | implementation details: 39 | (a) the placeholders should have 2 dimensions, 40 | in which the 1st dimension is variable length (i.e., None) 41 | """ 42 | ### PROBLEM 1 43 | ### YOUR CODE HERE 44 | raise NotImplementedError 45 | 46 | return state_ph, action_ph, next_state_ph 47 | 48 | def _dynamics_func(self, state, action, reuse): 49 | """ 50 | Takes as input a state and action, and predicts the next state 51 | 52 | returns: 53 | next_state_pred: predicted next state 54 | 55 | implementation details (in order): 56 | (a) Normalize both the state and action by using the statistics of self._init_dataset and 57 | the utils.normalize function 58 | (b) Concatenate the normalized state and action 59 | (c) Pass the concatenated, normalized state-action tensor through a neural network with 60 | self._nn_layers number of layers using the function utils.build_mlp. 
The resulting output 61 | is the normalized predicted difference between the next state and the current state 62 | (d) Unnormalize the delta state prediction, and add it to the current state in order to produce 63 | the predicted next state 64 | 65 | """ 66 | ### PROBLEM 1 67 | ### YOUR CODE HERE 68 | raise NotImplementedError 69 | 70 | return next_state_pred 71 | 72 | def _setup_training(self, state_ph, next_state_ph, next_state_pred): 73 | """ 74 | Takes as input the current state, next state, and predicted next state, and returns 75 | the loss and optimizer for training the dynamics model 76 | 77 | returns: 78 | loss: Scalar loss tensor 79 | optimizer: Operation used to perform gradient descent 80 | 81 | implementation details (in order): 82 | (a) Compute both the actual state difference and the predicted state difference 83 | (b) Normalize both of these state differences by using the statistics of self._init_dataset and 84 | the utils.normalize function 85 | (c) The loss function is the mean-squared-error between the normalized state difference and 86 | normalized predicted state difference 87 | (d) Create the optimizer by minimizing the loss using the Adam optimizer with self._learning_rate 88 | 89 | """ 90 | ### PROBLEM 1 91 | ### YOUR CODE HERE 92 | raise NotImplementedError 93 | 94 | return loss, optimizer 95 | 96 | def _setup_action_selection(self, state_ph): 97 | """ 98 | Computes the best action from the current state by using randomly sampled action sequences 99 | to predict future states, evaluating these predictions according to a cost function, 100 | selecting the action sequence with the lowest cost, and returning the first action in that sequence 101 | 102 | returns: 103 | best_action: the action that minimizes the cost function (tensor with shape [self._action_dim]) 104 | 105 | implementation details (in order): 106 | (a) We will assume state_ph has a batch size of 1 whenever action selection is performed 107 | (b) Randomly sample uniformly self._num_random_action_selection number of action sequences, 108 | each of length self._horizon 109 | (c) Starting from the input state, unroll each action sequence using your neural network 110 | dynamics model 111 | (d) While unrolling the action sequences, keep track of the cost of each action sequence 112 | using self._cost_fn 113 | (e) Find the action sequence with the lowest cost, and return the first action in that sequence 114 | 115 | Hints: 116 | (i) self._cost_fn takes three arguments: states, actions, and next states. These arguments are 117 | 2-dimensional tensors, where the 1st dimension is the batch size and the 2nd dimension is the 118 | state or action size 119 | (ii) You should call self._dynamics_func and self._cost_fn a total of self._horizon times 120 | (iii) Use tf.random_uniform(...) 
to generate the random action sequences 121 | 122 | """ 123 | ### PROBLEM 2 124 | ### YOUR CODE HERE 125 | raise NotImplementedError 126 | 127 | return best_action 128 | 129 | def _setup_graph(self): 130 | """ 131 | Sets up the tensorflow computation graph for training, prediction, and action selection 132 | 133 | The variables returned will be set as class attributes (see __init__) 134 | """ 135 | sess = tf.Session() 136 | 137 | ### PROBLEM 1 138 | ### YOUR CODE HERE 139 | raise NotImplementedError 140 | ### PROBLEM 2 141 | ### YOUR CODE HERE 142 | best_action = None 143 | 144 | sess.run(tf.global_variables_initializer()) 145 | 146 | return sess, state_ph, action_ph, next_state_ph, \ 147 | next_state_pred, loss, optimizer, best_action 148 | 149 | def train_step(self, states, actions, next_states): 150 | """ 151 | Performs one step of gradient descent 152 | 153 | returns: 154 | loss: the loss from performing gradient descent 155 | """ 156 | ### PROBLEM 1 157 | ### YOUR CODE HERE 158 | raise NotImplementedError 159 | 160 | return loss 161 | 162 | def predict(self, state, action): 163 | """ 164 | Predicts the next state given the current state and action 165 | 166 | returns: 167 | next_state_pred: predicted next state 168 | 169 | implementation detils: 170 | (i) The state and action arguments are 1-dimensional vectors (NO batch dimension) 171 | """ 172 | assert np.shape(state) == (self._state_dim,) 173 | assert np.shape(action) == (self._action_dim,) 174 | 175 | ### PROBLEM 1 176 | ### YOUR CODE HERE 177 | raise NotImplementedError 178 | 179 | assert np.shape(next_state_pred) == (self._state_dim,) 180 | return next_state_pred 181 | 182 | def get_action(self, state): 183 | """ 184 | Computes the action that minimizes the cost function given the current state 185 | 186 | returns: 187 | best_action: the best action 188 | """ 189 | assert np.shape(state) == (self._state_dim,) 190 | 191 | ### PROBLEM 2 192 | ### YOUR CODE HERE 193 | raise NotImplementedError 194 | 195 | assert np.shape(best_action) == (self._action_dim,) 196 | return best_action 197 | -------------------------------------------------------------------------------- /hw4/model_based_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | from model_based_policy import ModelBasedPolicy 7 | import utils 8 | from logger import logger 9 | from timer import timeit 10 | 11 | 12 | class ModelBasedRL(object): 13 | 14 | def __init__(self, 15 | env, 16 | num_init_random_rollouts=10, 17 | max_rollout_length=500, 18 | num_onplicy_iters=10, 19 | num_onpolicy_rollouts=10, 20 | training_epochs=60, 21 | training_batch_size=512, 22 | render=False, 23 | mpc_horizon=15, 24 | num_random_action_selection=4096, 25 | nn_layers=1): 26 | self._env = env 27 | self._max_rollout_length = max_rollout_length 28 | self._num_onpolicy_iters = num_onplicy_iters 29 | self._num_onpolicy_rollouts = num_onpolicy_rollouts 30 | self._training_epochs = training_epochs 31 | self._training_batch_size = training_batch_size 32 | self._render = render 33 | 34 | logger.info('Gathering random dataset') 35 | self._random_dataset = self._gather_rollouts(utils.RandomPolicy(env), 36 | num_init_random_rollouts) 37 | 38 | logger.info('Creating policy') 39 | self._policy = ModelBasedPolicy(env, 40 | self._random_dataset, 41 | horizon=mpc_horizon, 42 | num_random_action_selection=num_random_action_selection) 43 | 44 | timeit.reset() 45 | timeit.start('total') 46 | 47 | 
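# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original assignment scaffold, and not
# the reference solution): the ModelBasedPolicy._dynamics_func docstring above
# describes a normalize -> concatenate -> MLP -> unnormalize-delta pipeline.
# The standalone function below shows one way that recipe could look, assuming
# the helpers in hw4/utils.py (build_mlp, normalize, unnormalize) and a Dataset
# exposing the state/action/delta statistics defined there.
# ---------------------------------------------------------------------------
import tensorflow as tf
import utils

def dynamics_func_sketch(state, action, init_dataset, n_layers, reuse=False):
    # (a) normalize the state and action with the initial dataset's statistics
    state_norm = utils.normalize(state, init_dataset.state_mean, init_dataset.state_std)
    action_norm = utils.normalize(action, init_dataset.action_mean, init_dataset.action_std)
    # (b) concatenate the normalized state and action along the feature axis
    inputs = tf.concat([state_norm, action_norm], axis=1)
    # (c) predict the *normalized* state difference with an MLP
    delta_norm = utils.build_mlp(inputs,
                                 output_dim=state.get_shape()[1].value,
                                 scope='dynamics',
                                 n_layers=n_layers,
                                 reuse=reuse)
    # (d) unnormalize the predicted difference and add it to the current state
    delta = utils.unnormalize(delta_norm,
                              init_dataset.delta_state_mean,
                              init_dataset.delta_state_std)
    return state + delta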
def _gather_rollouts(self, policy, num_rollouts): 48 | dataset = utils.Dataset() 49 | 50 | for _ in range(num_rollouts): 51 | state = self._env.reset() 52 | done = False 53 | t = 0 54 | while not done: 55 | if self._render: 56 | timeit.start('render') 57 | self._env.render() 58 | timeit.stop('render') 59 | timeit.start('get action') 60 | action = policy.get_action(state) 61 | timeit.stop('get action') 62 | timeit.start('env step') 63 | next_state, reward, done, _ = self._env.step(action) 64 | timeit.stop('env step') 65 | done = done or (t >= self._max_rollout_length) 66 | dataset.add(state, action, next_state, reward, done) 67 | 68 | state = next_state 69 | t += 1 70 | 71 | return dataset 72 | 73 | def _train_policy(self, dataset): 74 | """ 75 | Train the model-based policy 76 | 77 | implementation details: 78 | (a) Train for self._training_epochs number of epochs 79 | (b) The dataset.random_iterator(...) method will iterate through the dataset once in a random order 80 | (c) Use self._training_batch_size for iterating through the dataset 81 | (d) Keep track of the loss values by appending them to the losses array 82 | """ 83 | timeit.start('train policy') 84 | 85 | losses = [] 86 | ### PROBLEM 1 87 | ### YOUR CODE HERE 88 | raise NotImplementedError 89 | 90 | logger.record_tabular('TrainingLossStart', losses[0]) 91 | logger.record_tabular('TrainingLossFinal', losses[-1]) 92 | 93 | timeit.stop('train policy') 94 | 95 | def _log(self, dataset): 96 | timeit.stop('total') 97 | dataset.log() 98 | logger.dump_tabular(print_func=logger.info) 99 | logger.debug('') 100 | for line in str(timeit).split('\n'): 101 | logger.debug(line) 102 | timeit.reset() 103 | timeit.start('total') 104 | 105 | def run_q1(self): 106 | """ 107 | Train on a dataset, and see how good the learned dynamics model's predictions are. 108 | 109 | implementation details: 110 | (i) Train using the self._random_dataset 111 | (ii) For each rollout, use the initial state and all actions to predict the future states. 112 | Store these predicted states in the pred_states list. 113 | NOTE: you should *not* be using any of the states in states[1:]. Only use states[0] 114 | (iii) After predicting the future states, we have provided plotting code that plots the actual vs 115 | predicted states and saves these to the experiment's folder. You do not need to modify this code. 
116 | """ 117 | logger.info('Training policy....') 118 | ### PROBLEM 1 119 | ### YOUR CODE HERE 120 | raise NotImplementedError 121 | 122 | logger.info('Evaluating predictions...') 123 | for r_num, (states, actions, _, _, _) in enumerate(self._random_dataset.rollout_iterator()): 124 | pred_states = [] 125 | 126 | ### PROBLEM 1 127 | ### YOUR CODE HERE 128 | raise NotImplementedError 129 | 130 | states = np.asarray(states) 131 | pred_states = np.asarray(pred_states) 132 | 133 | state_dim = states.shape[1] 134 | rows = int(np.sqrt(state_dim)) 135 | cols = state_dim // rows 136 | f, axes = plt.subplots(rows, cols, figsize=(3*cols, 3*rows)) 137 | f.suptitle('Model predictions (red) versus ground truth (black) for open-loop predictions') 138 | for i, (ax, state_i, pred_state_i) in enumerate(zip(axes.ravel(), states.T, pred_states.T)): 139 | ax.set_title('state {0}'.format(i)) 140 | ax.plot(state_i, color='k') 141 | ax.plot(pred_state_i, color='r') 142 | plt.tight_layout() 143 | plt.subplots_adjust(top=0.90) 144 | f.savefig(os.path.join(logger.dir, 'prediction_{0:03d}.jpg'.format(r_num)), bbox_inches='tight') 145 | 146 | logger.info('All plots saved to folder') 147 | 148 | def run_q2(self): 149 | """ 150 | Train the model-based policy on a random dataset, and evaluate the performance of the resulting policy 151 | """ 152 | logger.info('Random policy') 153 | self._log(self._random_dataset) 154 | 155 | logger.info('Training policy....') 156 | ### PROBLEM 2 157 | ### YOUR CODE HERE 158 | raise NotImplementedError 159 | 160 | logger.info('Evaluating policy...') 161 | ### PROBLEM 2 162 | ### YOUR CODE HERE 163 | raise NotImplementedError 164 | 165 | logger.info('Trained policy') 166 | self._log(eval_dataset) 167 | 168 | def run_q3(self): 169 | """ 170 | Starting with the random dataset, train the policy on the dataset, gather rollouts with the policy, 171 | append the new rollouts to the existing dataset, and repeat 172 | """ 173 | dataset = self._random_dataset 174 | 175 | itr = -1 176 | logger.info('Iteration {0}'.format(itr)) 177 | logger.record_tabular('Itr', itr) 178 | self._log(dataset) 179 | 180 | for itr in range(self._num_onpolicy_iters + 1): 181 | logger.info('Iteration {0}'.format(itr)) 182 | logger.record_tabular('Itr', itr) 183 | 184 | ### PROBLEM 3 185 | ### YOUR CODE HERE 186 | logger.info('Training policy...') 187 | raise NotImplementedError 188 | 189 | ### PROBLEM 3 190 | ### YOUR CODE HERE 191 | logger.info('Gathering rollouts...') 192 | raise NotImplementedError 193 | 194 | ### PROBLEM 3 195 | ### YOUR CODE HERE 196 | logger.info('Appending dataset...') 197 | raise NotImplementedError 198 | 199 | self._log(new_dataset) 200 | -------------------------------------------------------------------------------- /hw4/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | import pandas 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--exps', nargs='+', type=str) 11 | parser.add_argument('--save', type=str, default=None) 12 | args = parser.parse_args() 13 | 14 | f, ax = plt.subplots(1, 1) 15 | for i, exp in enumerate(args.exps): 16 | log_fname = os.path.join('data', exp, 'log.csv') 17 | csv = pandas.read_csv(log_fname) 18 | 19 | color = cm.viridis(i / float(len(args.exps))) 20 | ax.plot(csv['Itr'], csv['ReturnAvg'], color=color, label=exp) 21 | ax.fill_between(csv['Itr'], csv['ReturnAvg'] - csv['ReturnStd'], csv['ReturnAvg'] 
+ csv['ReturnStd'], 22 | color=color, alpha=0.2) 23 | 24 | ax.legend() 25 | ax.set_xlabel('Iteration') 26 | ax.set_ylabel('Return') 27 | 28 | if args.save: 29 | os.makedirs('plots', exist_ok=True) 30 | f.savefig(os.path.join('plots', args.save + '.jpg')) 31 | else: 32 | plt.show() 33 | -------------------------------------------------------------------------------- /hw4/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | matplotlib 3 | colorlog -------------------------------------------------------------------------------- /hw4/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ########## 4 | ### Q1 ### 5 | ########## 6 | 7 | python main.py q1 --exp_name exp 8 | 9 | ########## 10 | ### Q2 ### 11 | ########## 12 | 13 | python main.py q2 --exp_name exp 14 | 15 | ########### 16 | ### Q3a ### 17 | ########### 18 | 19 | python main.py q3 --exp_name default 20 | python plot.py --exps HalfCheetah_q3_default --save HalfCheetah_q3_default 21 | 22 | ########### 23 | ### Q3b ### 24 | ########### 25 | 26 | python main.py q3 --exp_name action128 --num_random_action_selection 128 27 | python main.py q3 --exp_name action4096 --num_random_action_selection 4096 28 | python main.py q3 --exp_name action16384 --num_random_action_selection 16384 29 | python plot.py --exps HalfCheetah_q3_action128 HalfCheetah_q3_action4096 HalfCheetah_q3_action16384 --save HalfCheetah_q3_actions 30 | 31 | python main.py q3 --exp_name horizon10 --mpc_horizon 10 32 | python main.py q3 --exp_name horizon15 --mpc_horizon 15 33 | python main.py q3 --exp_name horizon20 --mpc_horizon 20 34 | python plot.py --exps HalfCheetah_q3_horizon10 HalfCheetah_q3_horizon15 HalfCheetah_q3_horizon20 --save HalfCheetah_q3_mpc_horizon 35 | 36 | python main.py q3 --exp_name layers1 --nn_layers 1 37 | python main.py q3 --exp_name layers2 --nn_layers 2 38 | python main.py q3 --exp_name layers3 --nn_layers 3 39 | python plot.py --exps HalfCheetah_q3_layers1 HalfCheetah_q3_layers2 HalfCheetah_q3_layers3 --save HalfCheetah_q3_nn_layers 40 | -------------------------------------------------------------------------------- /hw4/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import defaultdict 3 | 4 | class TimeIt(object): 5 | def __init__(self, prefix=''): 6 | self.prefix = prefix 7 | self.start_times = dict() 8 | self.elapsed_times = defaultdict(int) 9 | 10 | def start(self, name): 11 | assert(name not in self.start_times) 12 | self.start_times[name] = time.time() 13 | 14 | def stop(self, name): 15 | assert(name in self.start_times) 16 | self.elapsed_times[name] += time.time() - self.start_times[name] 17 | self.start_times.pop(name) 18 | 19 | def elapsed(self, name): 20 | return self.elapsed_times[name] 21 | 22 | def reset(self): 23 | self.start_times = dict() 24 | self.elapsed_times = defaultdict(int) 25 | 26 | def __str__(self): 27 | s = '' 28 | names_elapsed = sorted(self.elapsed_times.items(), key=lambda x: x[1], reverse=True) 29 | for name, elapsed in names_elapsed: 30 | if 'total' not in self.elapsed_times: 31 | s += '{0}: {1: <10} {2:.1f}\n'.format(self.prefix, name, elapsed) 32 | else: 33 | assert(self.elapsed_times['total'] >= max(self.elapsed_times.values())) 34 | pct = 100. 
* elapsed / self.elapsed_times['total'] 35 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, name, elapsed, pct) 36 | if 'total' in self.elapsed_times: 37 | times_summed = sum([t for k, t in self.elapsed_times.items() if k != 'total']) 38 | other_time = self.elapsed_times['total'] - times_summed 39 | assert(other_time >= 0) 40 | pct = 100. * other_time / self.elapsed_times['total'] 41 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, 'other', other_time, pct) 42 | return s 43 | 44 | timeit = TimeIt() 45 | -------------------------------------------------------------------------------- /hw4/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from logger import logger 5 | 6 | 7 | ############ 8 | ### Data ### 9 | ############ 10 | 11 | class Dataset(object): 12 | 13 | def __init__(self): 14 | self._states = [] 15 | self._actions = [] 16 | self._next_states = [] 17 | self._rewards = [] 18 | self._dones = [] 19 | 20 | @property 21 | def is_empty(self): 22 | return len(self) == 0 23 | 24 | def __len__(self): 25 | return len(self._states) 26 | 27 | ################## 28 | ### Statistics ### 29 | ################## 30 | 31 | @property 32 | def state_mean(self): 33 | return np.mean(self._states, axis=0) 34 | 35 | @property 36 | def state_std(self): 37 | return np.std(self._states, axis=0) 38 | 39 | @property 40 | def action_mean(self): 41 | return np.mean(self._actions, axis=0) 42 | 43 | @property 44 | def action_std(self): 45 | return np.std(self._actions, axis=0) 46 | 47 | @property 48 | def delta_state_mean(self): 49 | return np.mean(np.array(self._next_states) - np.array(self._states), axis=0) 50 | 51 | @property 52 | def delta_state_std(self): 53 | return np.std(np.array(self._next_states) - np.array(self._states), axis=0) 54 | 55 | ################### 56 | ### Adding data ### 57 | ################### 58 | 59 | def add(self, state, action, next_state, reward, done): 60 | """ 61 | Add (s, a, r, s') to this dataset 62 | """ 63 | if not self.is_empty: 64 | # ensure the state, action, next_state are of the same dimension 65 | assert len(self._states[-1]) == len(np.ravel(state)) 66 | assert len(self._actions[-1]) == len(np.ravel(action)) 67 | assert len(self._next_states[-1]) == len(np.ravel(next_state)) 68 | 69 | self._states.append(np.ravel(state)) 70 | self._actions.append(np.ravel(action)) 71 | self._next_states.append(np.ravel(next_state)) 72 | self._rewards.append(reward) 73 | self._dones.append(done) 74 | 75 | def append(self, other_dataset): 76 | """ 77 | Append other_dataset to this dataset 78 | """ 79 | if not self.is_empty and not other_dataset.is_empty: 80 | # ensure the state, action, next_state are of the same dimension 81 | assert len(self._states[-1]) == len(other_dataset._states[-1]) 82 | assert len(self._actions[-1]) == len(other_dataset._actions[-1]) 83 | assert len(self._next_states[-1]) == len(other_dataset._next_states[-1]) 84 | 85 | self._states += other_dataset._states 86 | self._actions += other_dataset._actions 87 | self._next_states += other_dataset._next_states 88 | self._rewards += other_dataset._rewards 89 | self._dones += other_dataset._dones 90 | 91 | ############################ 92 | ### Iterate through data ### 93 | ############################ 94 | 95 | def rollout_iterator(self): 96 | """ 97 | Iterate through all the rollouts in the dataset sequentially 98 | """ 99 | end_indices = np.nonzero(self._dones)[0] + 1 100 | 101 | states = 
np.asarray(self._states) 102 | actions = np.asarray(self._actions) 103 | next_states = np.asarray(self._next_states) 104 | rewards = np.asarray(self._rewards) 105 | dones = np.asarray(self._dones) 106 | 107 | start_idx = 0 108 | for end_idx in end_indices: 109 | indices = np.arange(start_idx, end_idx) 110 | yield states[indices], actions[indices], next_states[indices], rewards[indices], dones[indices] 111 | start_idx = end_idx 112 | 113 | def random_iterator(self, batch_size): 114 | """ 115 | Iterate once through all (s, a, r, s') in batches in a random order 116 | """ 117 | all_indices = np.nonzero(np.logical_not(self._dones))[0] 118 | np.random.shuffle(all_indices) 119 | 120 | states = np.asarray(self._states) 121 | actions = np.asarray(self._actions) 122 | next_states = np.asarray(self._next_states) 123 | rewards = np.asarray(self._rewards) 124 | dones = np.asarray(self._dones) 125 | 126 | i = 0 127 | while i < len(all_indices): 128 | indices = all_indices[i:i+batch_size] 129 | 130 | yield states[indices], actions[indices], next_states[indices], rewards[indices], dones[indices] 131 | 132 | i += batch_size 133 | 134 | ############### 135 | ### Logging ### 136 | ############### 137 | 138 | def log(self): 139 | end_idxs = np.nonzero(self._dones)[0] + 1 140 | 141 | returns = [] 142 | 143 | start_idx = 0 144 | for end_idx in end_idxs: 145 | rewards = self._rewards[start_idx:end_idx] 146 | returns.append(np.sum(rewards)) 147 | 148 | start_idx = end_idx 149 | 150 | logger.record_tabular('ReturnAvg', np.mean(returns)) 151 | logger.record_tabular('ReturnStd', np.std(returns)) 152 | logger.record_tabular('ReturnMin', np.min(returns)) 153 | logger.record_tabular('ReturnMax', np.max(returns)) 154 | 155 | ################## 156 | ### Tensorflow ### 157 | ################## 158 | 159 | def build_mlp(input_layer, 160 | output_dim, 161 | scope, 162 | n_layers=1, 163 | hidden_dim=500, 164 | activation=tf.nn.relu, 165 | output_activation=None, 166 | reuse=False): 167 | layer = input_layer 168 | with tf.variable_scope(scope, reuse=reuse): 169 | for _ in range(n_layers): 170 | layer = tf.layers.dense(layer, hidden_dim, activation=activation) 171 | layer = tf.layers.dense(layer, output_dim, activation=output_activation) 172 | return layer 173 | 174 | def normalize(x, mean, std, eps=1e-8): 175 | return (x - mean) / (std + eps) 176 | 177 | def unnormalize(x, mean, std): 178 | return x * std + mean 179 | 180 | ################ 181 | ### Policies ### 182 | ################ 183 | 184 | class RandomPolicy(object): 185 | 186 | def __init__(self, env): 187 | self._action_space_low = env.action_space.low 188 | self._action_space_high = env.action_space.high 189 | 190 | def get_action(self, state): 191 | return np.random.uniform(self._action_space_low, self._action_space_high) 192 | 193 | -------------------------------------------------------------------------------- /hw5/exp/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5a: Exploration 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.5** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * seaborn 9 | * tqdm==**4.26.0** 10 | 11 | Before doing anything, first replace `gym/envs/mujoco/half_cheetah.py` with the provided `sparse_half_cheetah.py` file. It is always a good idea to keep a copy of the original `gym/envs/mujoco/half_cheetah.py` just in case you need it for something else. 
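If you are not sure where your `gym` installation keeps that file, one way to locate it (assuming `gym` is importable from the Python environment you use for this homework) is:

    python -c "import os, gym; print(os.path.join(os.path.dirname(gym.__file__), 'envs', 'mujoco', 'half_cheetah.py'))"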
12 | 13 | You will implement `density_model.py`, `exploration.py`, and `train_ac_exploration_f18.py`. 14 | 15 | See the hw5a.pdf in this folder for further instructions. 16 | . 17 | -------------------------------------------------------------------------------- /hw5/exp/density_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow_probability as tfp 4 | from ex_utils import build_mlp 5 | 6 | class Density_Model(object): 7 | def __init__(self): 8 | super(Density_Model, self).__init__() 9 | 10 | def receive_tf_sess(self, sess): 11 | self.sess = sess 12 | 13 | def get_prob(self, state): 14 | raise NotImplementedError 15 | 16 | class Histogram(Density_Model): 17 | def __init__(self, nbins, preprocessor): 18 | super(Histogram, self).__init__() 19 | self.nbins = nbins 20 | self.total = 0. 21 | self.hist = {} 22 | for i in range(int(self.nbins)): 23 | self.hist[i] = 0 24 | self.preprocessor = preprocessor 25 | 26 | def update_count(self, state, increment): 27 | """ 28 | ### PROBLEM 1 29 | ### YOUR CODE HERE 30 | 31 | args: 32 | state: numpy array 33 | increment: int 34 | 35 | TODO: 36 | 1. increment the entry "bin_name" in self.hist by "increment" 37 | 2. increment self.total by "increment" 38 | """ 39 | bin_name = self.preprocessor(state) 40 | raise NotImplementedError 41 | 42 | def get_count(self, states): 43 | """ 44 | ### PROBLEM 1 45 | ### YOUR CODE HERE 46 | 47 | args: 48 | states: numpy array (bsize, ob_dim) 49 | 50 | returns: 51 | counts: numpy_array (bsize) 52 | 53 | TODO: 54 | For each state in states: 55 | 1. get the bin_name using self.preprocessor 56 | 2. get the value of self.hist with key bin_name 57 | """ 58 | raise NotImplementedError 59 | return counts 60 | 61 | def get_prob(self, states): 62 | """ 63 | ### PROBLEM 1 64 | ### YOUR CODE HERE 65 | 66 | args: 67 | states: numpy array (bsize, ob_dim) 68 | 69 | returns: 70 | return the probabilities of the state (bsize) 71 | 72 | NOTE: 73 | remember to normalize by float(self.total) 74 | """ 75 | raise NotImplementedError 76 | return probs 77 | 78 | class RBF(Density_Model): 79 | """ 80 | https://en.wikipedia.org/wiki/Radial_basis_function_kernel 81 | https://en.wikipedia.org/wiki/Kernel_density_estimation 82 | """ 83 | def __init__(self, sigma): 84 | super(RBF, self).__init__() 85 | self.sigma = sigma 86 | self.means = None 87 | 88 | def fit_data(self, data): 89 | """ 90 | ### PROBLEM 2 91 | ### YOUR CODE HERE 92 | 93 | args: 94 | data: list of states of shape (ob_dim) 95 | 96 | TODO: 97 | We simply assign self.means to be equal to the data points. 98 | Let the length of the data be B 99 | self.means: np array (B, ob_dim) 100 | """ 101 | B, ob_dim = len(data), len(data[0]) 102 | raise NotImplementedError 103 | self.means = None 104 | assert self.means.shape == (B, ob_dim) 105 | 106 | def get_prob(self, states): 107 | """ 108 | ### PROBLEM 2 109 | ### YOUR CODE HERE 110 | 111 | given: 112 | states: (b, ob_dim) 113 | where b is the number of states we wish to get the 114 | probability of 115 | 116 | self.means: (B, ob_dim) 117 | where B is the number of states in the replay buffer 118 | we will plop a Gaussian distribution on top of each 119 | of self.means with a std of self.sigma 120 | 121 | TODO: 122 | 1. Compute deltas: for each state in states, compute the 123 | difference between that state and every mean in self.means. 124 | 2. Euclidean distance: sum the squared deltas 125 | 3. 
Gaussian: evaluate the probability of the state under the 126 | gaussian centered around each mean. The hyperparameters 127 | for the reference solution assume that you do not normalize 128 | the gaussian. This is fine since the rewards will be 129 | normalized later when we compute advantages anyways. 130 | 4. Average: average the probabilities from each gaussian 131 | """ 132 | b, ob_dim = states.shape 133 | if self.means is None: 134 | # Return a uniform distribution if we don't have samples in the 135 | # replay buffer yet. 136 | return (1.0/len(states))*np.ones(len(states)) 137 | else: 138 | B, replay_dim = self.means.shape 139 | assert states.ndim == self.means.ndim and ob_dim == replay_dim 140 | 141 | # 1. Compute deltas 142 | deltas = raise NotImplementedError 143 | assert deltas.shape == (b, B, ob_dim) 144 | 145 | # 2. Euclidean distance 146 | euc_dists = raise NotImplementedError 147 | assert euc_dists.shape == (b, B) 148 | 149 | # Gaussian 150 | gaussians = raise NotImplementedError 151 | assert gaussians.shape == (b, B) 152 | 153 | # 4. Average 154 | densities = raise NotImplementedError 155 | assert densities.shape == (b,) 156 | 157 | return densities 158 | 159 | class Exemplar(Density_Model): 160 | def __init__(self, ob_dim, hid_dim, learning_rate, kl_weight): 161 | super(Exemplar, self).__init__() 162 | self.ob_dim = ob_dim 163 | self.hid_dim = hid_dim 164 | self.learning_rate = learning_rate 165 | self.kl_weight = kl_weight 166 | 167 | def build_computation_graph(self): 168 | """ 169 | ### PROBLEM 3 170 | ### YOUR CODE HERE 171 | 172 | TODO: 173 | 1. self.log_likelihood. shape: (batch_size) 174 | - use tf.squeeze 175 | - use the discriminator to get the log prob of the discrim_target 176 | 2. self.likelihood. shape: (batch_size) 177 | - use tf.squeeze 178 | - use the discriminator to get the prob of the discrim_target 179 | 3. self.kl. shape: (batch_size) 180 | - simply add the kl divergence between self.encoder1 and 181 | the prior and the kl divergence between self.encoder2 182 | and the prior. Do not average. 183 | 4. self.elbo: 184 | - subtract the kl (weighted by self.kl_weight) from the 185 | log_likelihood, and average over the batch 186 | 5. self.update_op: use the AdamOptimizer with self.learning_rate 187 | to minimize the -self.elbo (Note the negative sign!) 
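            One way to assemble these pieces with tensorflow_probability is
            sketched below (an illustration under the assumption that
            self.discriminator, self.encoder1, self.encoder2, and self.prior
            are built exactly as described above -- not necessarily the
            reference solution):

                log_likelihood = tf.squeeze(self.discriminator.log_prob(self.discrim_target), axis=1)
                likelihood = tf.squeeze(self.discriminator.prob(self.discrim_target), axis=1)
                kl = self.encoder1.kl_divergence(self.prior) + self.encoder2.kl_divergence(self.prior)
                elbo = tf.reduce_mean(log_likelihood - self.kl_weight * kl)
                update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(-elbo)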
188 | 189 | Hint: 190 | https://www.tensorflow.org/probability/api_docs/python/tfp/distributions 191 | """ 192 | self.state1, self.state2 = self.define_placeholders() 193 | self.encoder1, self.encoder2, self.prior, self.discriminator = self.forward_pass(self.state1, self.state2) 194 | self.discrim_target = tf.placeholder(shape=[None, 1], name="discrim_target", dtype=tf.float32) 195 | 196 | raise NotImplementedError 197 | self.log_likelihood = None 198 | self.likelihood = None 199 | self.kl = None 200 | assert len(self.log_likelihood.shape) == len(self.likelihood.shape) == len(self.kl.shape) == 1 201 | 202 | raise NotImplementedError 203 | self.elbo = None 204 | self.update_op = None 205 | 206 | def define_placeholders(self): 207 | state1 = tf.placeholder(shape=[None, self.ob_dim], name="s1", dtype=tf.float32) 208 | state2 = tf.placeholder(shape=[None, self.ob_dim], name="s2", dtype=tf.float32) 209 | return state1, state2 210 | 211 | def make_encoder(self, state, z_size, scope, n_layers, hid_size): 212 | """ 213 | ### PROBLEM 3 214 | ### YOUR CODE HERE 215 | 216 | args: 217 | state: tf variable 218 | z_size: output dimension of the encoder network 219 | scope: scope name 220 | n_layers: number of layers of the encoder network 221 | hid_size: hidden dimension of encoder network 222 | 223 | TODO: 224 | 1. z_mean: the output of a neural network that takes the state as input, 225 | has output dimension z_size, n_layers layers, and hidden 226 | dimension hid_size 227 | 2. z_logstd: a trainable variable, initialized to 0 228 | shape (z_size,) 229 | 230 | Hint: use build_mlp 231 | """ 232 | z_mean = raise NotImplementedError 233 | z_logstd = raise NotImplementedError 234 | return tfp.distributions.MultivariateNormalDiag(loc=z_mean, scale_diag=tf.exp(z_logstd)) 235 | 236 | def make_prior(self, z_size): 237 | """ 238 | ### PROBLEM 3 239 | ### YOUR CODE HERE 240 | 241 | args: 242 | z_size: output dimension of the encoder network 243 | 244 | TODO: 245 | prior_mean and prior_logstd are for a standard normal distribution 246 | both have dimension z_size 247 | """ 248 | prior_mean = raise NotImplementedError 249 | prior_logstd = raise NotImplementedError 250 | return tfp.distributions.MultivariateNormalDiag(loc=prior_mean, scale_diag=tf.exp(prior_logstd)) 251 | 252 | def make_discriminator(self, z, output_size, scope, n_layers, hid_size): 253 | """ 254 | ### PROBLEM 3 255 | ### YOUR CODE HERE 256 | 257 | args: 258 | z: input to to discriminator network 259 | output_size: output dimension of discriminator network 260 | scope: scope name 261 | n_layers: number of layers of discriminator network 262 | hid_size: hidden dimension of discriminator network 263 | 264 | TODO: 265 | 1. logit: the output of a neural network that takes z as input, 266 | has output size output_size, n_layers layers, and hidden 267 | dimension hid_size 268 | 269 | Hint: use build_mlp 270 | """ 271 | logit = raise NotImplementedError 272 | return tfp.distributions.Bernoulli(logit) 273 | 274 | def forward_pass(self, state1, state2): 275 | """ 276 | ### PROBLEM 3 277 | ### YOUR CODE HERE 278 | 279 | args: 280 | state1: tf variable 281 | state2: tf variable 282 | 283 | encoder1: tfp.distributions.MultivariateNormalDiag distribution 284 | encoder2: tfp.distributions.MultivariateNormalDiag distribution 285 | prior: tfp.distributions.MultivariateNormalDiag distribution 286 | discriminator: tfp.distributions.Bernoulli distribution 287 | 288 | TODO: 289 | 1. z1: sample from encoder1 290 | 2. z2: sample from encoder2 291 | 3. 
z: concatenate z1 and z2 292 | 293 | Hint: 294 | https://www.tensorflow.org/probability/api_docs/python/tfp/distributions 295 | """ 296 | # Reuse 297 | make_encoder1 = tf.make_template('encoder1', self.make_encoder) 298 | make_encoder2 = tf.make_template('encoder2', self.make_encoder) 299 | make_discriminator = tf.make_template('decoder', self.make_discriminator) 300 | 301 | # Encoder 302 | encoder1 = make_encoder1(state1, self.hid_dim/2, 'z1', n_layers=2, hid_size=self.hid_dim) 303 | encoder2 = make_encoder2(state2, self.hid_dim/2, 'z2', n_layers=2, hid_size=self.hid_dim) 304 | 305 | # Prior 306 | prior = self.make_prior(self.hid_dim/2) 307 | 308 | # Sampled Latent 309 | z1 = raise NotImplementedError 310 | z2 = raise NotImplementedError 311 | z = raise NotImplementedError 312 | 313 | # Discriminator 314 | discriminator = make_discriminator(z, 1, 'discriminator', n_layers=2, hid_size=self.hid_dim) 315 | return encoder1, encoder2, prior, discriminator 316 | 317 | def update(self, state1, state2, target): 318 | """ 319 | ### PROBLEM 3 320 | ### YOUR CODE HERE 321 | 322 | args: 323 | state1: np array (batch_size, ob_dim) 324 | state2: np array (batch_size, ob_dim) 325 | target: np array (batch_size, 1) 326 | 327 | TODO: 328 | train the density model and return 329 | ll: log_likelihood 330 | kl: kl divergence 331 | elbo: elbo 332 | """ 333 | assert state1.ndim == state2.ndim == target.ndim 334 | assert state1.shape[1] == state2.shape[1] == self.ob_dim 335 | assert state1.shape[0] == state2.shape[0] == target.shape[0] 336 | raise NotImplementedError 337 | return ll, kl, elbo 338 | 339 | def get_likelihood(self, state1, state2): 340 | """ 341 | ### PROBLEM 3 342 | ### YOUR CODE HERE 343 | 344 | args: 345 | state1: np array (batch_size, ob_dim) 346 | state2: np array (batch_size, ob_dim) 347 | 348 | TODO: 349 | likelihood of state1 == state2 350 | 351 | Hint: 352 | what should be the value of self.discrim_target? 
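        One way to read this hint (illustrative, not the reference solution):
        the discriminator was trained with a target of 1 for "positive" pairs
        whose two inputs come from the same state, so the likelihood that
        state1 == state2 corresponds to evaluating self.likelihood with a
        target of all ones, e.g.

            feed = {self.state1: state1,
                    self.state2: state2,
                    self.discrim_target: np.ones((state1.shape[0], 1))}
            likelihood = self.sess.run(self.likelihood, feed_dict=feed)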
353 | """ 354 | assert state1.ndim == state2.ndim 355 | assert state1.shape[1] == state2.shape[1] == self.ob_dim 356 | assert state1.shape[0] == state2.shape[0] 357 | raise NotImplementedError 358 | return likelihood 359 | 360 | def get_prob(self, state): 361 | """ 362 | ### PROBLEM 3 363 | ### YOUR CODE HERE 364 | 365 | args: 366 | state: np array (batch_size, ob_dim) 367 | 368 | TODO: 369 | likelihood: 370 | evaluate the discriminator D(x,x) on the same input 371 | prob: 372 | compute the probability density of x from the discriminator 373 | likelihood (see homework doc) 374 | """ 375 | likelihood = raise NotImplementedError 376 | # avoid divide by 0 and log(0) 377 | likelihood = np.clip(np.squeeze(likelihood), 1e-5, 1-1e-5) 378 | prob = raise NotImplementedError 379 | return prob 380 | -------------------------------------------------------------------------------- /hw5/exp/ex_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=tf.tanh, output_activation=None): 4 | """ 5 | Builds a feedforward neural network 6 | 7 | arguments: 8 | input_placeholder: placeholder variable for the state (batch_size, input_size) 9 | output_size: size of the output layer 10 | scope: variable scope of the network 11 | n_layers: number of hidden layers 12 | size: dimension of the hidden layer 13 | activation: activation of the hidden layers 14 | output_activation: activation of the ouput layers 15 | 16 | returns: 17 | output placeholder of the network (the result of a forward pass) 18 | 19 | Hint: use tf.layers.dense 20 | """ 21 | output_placeholder = input_placeholder 22 | with tf.variable_scope(scope): 23 | for _ in range(n_layers): 24 | output_placeholder = tf.layers.dense(output_placeholder, size, activation=activation) 25 | output_placeholder = tf.layers.dense(output_placeholder, output_size, activation=output_activation) 26 | return output_placeholder -------------------------------------------------------------------------------- /hw5/exp/exploration.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import tensorflow as tf 4 | 5 | from density_model import Density_Model 6 | from replay import Replay_Buffer 7 | 8 | class Exploration(object): 9 | def __init__(self, density_model, bonus_coeff): 10 | super(Exploration, self).__init__() 11 | self.density_model = density_model 12 | self.bonus_coeff = bonus_coeff 13 | 14 | def receive_tf_sess(self, sess): 15 | self.density_model.receive_tf_sess(sess) 16 | self.sess = sess 17 | 18 | def bonus_function(self, x): 19 | # You do not need to do anything here 20 | raise NotImplementedError 21 | 22 | def fit_density_model(self, states): 23 | # You do not need to do anything here 24 | raise NotImplementedError 25 | 26 | def compute_reward_bonus(self, states): 27 | # You do not need to do anything here 28 | raise NotImplementedError 29 | 30 | def modify_reward(self, rewards, states): 31 | """ 32 | ### PROBLEM 1 33 | ### YOUR CODE HERE 34 | 35 | args: 36 | states: (bsize, ob_dim) 37 | 38 | TODO: 39 | Use self.compute_reward_bonus to compute the reward 40 | bonus and then modify the rewards with the bonus 41 | and store that in new_rewards, which you will return 42 | """ 43 | raise NotImplementedError 44 | bonus = None 45 | new_rewards = None 46 | return new_rewards 47 | 48 | class DiscreteExploration(Exploration): 49 | def __init__(self, 
density_model, bonus_coeff): 50 | super(DiscreteExploration, self).__init__(density_model, bonus_coeff) 51 | 52 | def fit_density_model(self, states): 53 | """ 54 | ### PROBLEM 1 55 | ### YOUR CODE HERE 56 | 57 | args: 58 | states: (bsize, ob_dim) 59 | """ 60 | raise NotImplementedError 61 | 62 | def bonus_function(self, count): 63 | """ 64 | ### PROBLEM 1 65 | ### YOUR CODE HERE 66 | 67 | args: 68 | count: np array (bsize) 69 | """ 70 | raise NotImplementedError 71 | 72 | def compute_reward_bonus(self, states): 73 | """ 74 | ### PROBLEM 1 75 | ### YOUR CODE HERE 76 | 77 | args: 78 | states: (bsize, ob_dim) 79 | """ 80 | count = raise NotImplementedError 81 | bonus = raise NotImplementedError 82 | return bonus 83 | 84 | 85 | class ContinuousExploration(Exploration): 86 | def __init__(self, density_model, bonus_coeff, replay_size): 87 | super(ContinuousExploration, self).__init__(density_model, bonus_coeff) 88 | self.replay_buffer = Replay_Buffer(max_size=replay_size) 89 | 90 | def fit_density_model(self, states): 91 | # You do not need to do anything here 92 | raise NotImplementedError 93 | 94 | def bonus_function(self, prob): 95 | """ 96 | ### PROBLEM 2 97 | ### YOUR CODE HERE 98 | 99 | args: 100 | prob: np array (bsize,) 101 | """ 102 | raise NotImplementedError 103 | 104 | def compute_reward_bonus(self, states): 105 | """ 106 | ### PROBLEM 2 107 | ### YOUR CODE HERE 108 | 109 | args: 110 | states: (bsize, ob_dim) 111 | """ 112 | raise NotImplementedError 113 | prob = None 114 | bonus = None 115 | return bonus 116 | 117 | 118 | class RBFExploration(ContinuousExploration): 119 | def __init__(self, density_model, bonus_coeff, replay_size): 120 | super(RBFExploration, self).__init__(density_model, bonus_coeff, replay_size) 121 | 122 | def fit_density_model(self, states): 123 | """ 124 | args: 125 | states: (bsize, ob_dim) 126 | """ 127 | self.replay_buffer.prepend(states) 128 | self.density_model.fit_data(self.replay_buffer.get_memory()) 129 | 130 | 131 | class ExemplarExploration(ContinuousExploration): 132 | def __init__(self, density_model, bonus_coeff, train_iters, bsize, replay_size): 133 | super(ExemplarExploration, self).__init__(density_model, bonus_coeff, replay_size) 134 | self.train_iters = train_iters 135 | self.bsize = bsize 136 | 137 | def sample_idxs(self, states, batch_size): 138 | states = copy.deepcopy(states) 139 | data_size = len(states) 140 | pos_idxs = np.random.randint(data_size, size=batch_size) 141 | continue_sampling = True 142 | while continue_sampling: 143 | neg_idxs = np.random.randint(data_size, size=batch_size) 144 | if np.all(pos_idxs != neg_idxs): 145 | continue_sampling = False 146 | positives = np.concatenate([states[pos_idxs], states[pos_idxs]], axis=0) 147 | negatives = np.concatenate([states[pos_idxs], states[neg_idxs]], axis=0) 148 | return positives, negatives 149 | 150 | def sample_idxs_replay(self, states, batch_size): 151 | states = copy.deepcopy(states) 152 | data_size = len(states) 153 | pos_idxs = np.random.randint(data_size, size=batch_size) 154 | neg_idxs = np.random.randint(data_size, len(self.replay_buffer), size=batch_size) 155 | positives = np.concatenate([states[pos_idxs], states[pos_idxs]], axis=0) 156 | negatives = np.concatenate([states[pos_idxs], self.replay_buffer[neg_idxs]], axis=0) 157 | return positives, negatives 158 | 159 | def fit_density_model(self, states): 160 | """ 161 | args: 162 | states: (bsize, ob_dim) 163 | """ 164 | self.replay_buffer.prepend(states) 165 | for i in range(self.train_iters): 166 | if 
len(self.replay_buffer) >= 2*len(states): 167 | positives, negatives = self.sample_idxs_replay(states, self.bsize) 168 | else: 169 | positives, negatives = self.sample_idxs(states, self.bsize) 170 | labels = np.concatenate([np.ones((self.bsize, 1)), np.zeros((self.bsize, 1))], axis=0) 171 | ll, kl, elbo = self.density_model.update(positives, negatives, labels) 172 | if i % (self.train_iters/10) == 0: 173 | print('log likelihood\t{}\tkl divergence\t{}\t-elbo\t{}'.format(np.mean(ll), np.mean(kl), -elbo)) 174 | return ll, kl, elbo 175 | -------------------------------------------------------------------------------- /hw5/exp/hw5a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/hw5/exp/hw5a.pdf -------------------------------------------------------------------------------- /hw5/exp/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/exp/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
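For example, two logdirs with custom legend titles (one title per logdir,
in the same order):

    python plot.py data/test1 data/test2 --legend run1 run2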
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | # plt.legend(loc='best', bbox_to_anchor=(1, 1), fontsize=8).draggable() 59 | plt.show() 60 | 61 | 62 | def get_datasets(fpath, condition=None): 63 | unit = 0 64 | datasets = [] 65 | for root, dir, files in os.walk(fpath): 66 | if 'log.txt' in files: 67 | param_path = open(os.path.join(root,'params.json')) 68 | params = json.load(param_path) 69 | exp_name = params['exp_name'] 70 | 71 | log_path = os.path.join(root,'log.txt') 72 | experiment_data = pd.read_table(log_path) 73 | 74 | experiment_data.insert( 75 | len(experiment_data.columns), 76 | 'Unit', 77 | unit 78 | ) 79 | experiment_data.insert( 80 | len(experiment_data.columns), 81 | 'Condition', 82 | condition or exp_name 83 | ) 84 | 85 | datasets.append(experiment_data) 86 | unit += 1 87 | 88 | return datasets 89 | 90 | 91 | def main(): 92 | import argparse 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('logdir', nargs='*') 95 | parser.add_argument('--legend', nargs='*') 96 | parser.add_argument('--value', default='AverageReturn', nargs='*') 97 | args = parser.parse_args() 98 | 99 | use_legend = False 100 | if args.legend is not None: 101 | assert len(args.legend) == len(args.logdir), \ 102 | "Must give a legend title for each set of experiments." 103 | use_legend = True 104 | 105 | data = [] 106 | if use_legend: 107 | for logdir, legend_title in zip(args.logdir, args.legend): 108 | data += get_datasets(logdir, legend_title) 109 | else: 110 | for logdir in args.logdir: 111 | data += get_datasets(logdir) 112 | 113 | if isinstance(args.value, list): 114 | values = args.value 115 | else: 116 | values = [args.value] 117 | for value in values: 118 | plot_data(data, value=value) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw5/exp/pointmass.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.envs.registration import EnvSpec 3 | import imageio 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import os 7 | import seaborn as sns 8 | from tqdm import tqdm 9 | 10 | class Env(object): 11 | def __init__(self): 12 | super(Env, self).__init__() 13 | 14 | def reset(self): 15 | raise NotImplementedError 16 | 17 | def step(self, action): 18 | raise NotImplementedError 19 | 20 | def seed(self, seed): 21 | raise NotImplementedError 22 | 23 | class PointMass(Env): 24 | def __init__(self, max_episode_steps_coeff=1, scale=20, goal_padding=2.0): 25 | super(PointMass, self).__init__() 26 | # define scale such that the each square in the grid is 1 x 1 27 | self.scale = int(scale) 28 | self.grid_size = self.scale * self.scale 29 | self.observation_space = gym.spaces.Box( 30 | low=np.array([0.0, 0.0]), 31 | high=np.array([1.0, 1.0])) 32 | self.action_space = gym.spaces.Box( 33 | low=np.array([-np.inf, -np.inf]), 34 | high=np.array([np.inf, np.inf])) 35 | self.goal_padding = goal_padding 36 | self.spec = EnvSpec(id='PointMass-v0', max_episode_steps=int(max_episode_steps_coeff*self.scale)) 37 | 38 | def reset(self): 39 | plt.close() 40 | self.state = np.array([self.goal_padding, self.goal_padding]) 41 | state = self.state/self.scale 42 | return state 
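    # Note on the dynamics and reward implemented in step() below: actions
    # translate the point directly in (x, y), positions are clipped to
    # [0, scale], and observations are rescaled to [0, 1]^2. The reward is a
    # small action-magnitude penalty (-0.01 * ||action||^2) everywhere, plus
    # +10 whenever both coordinates exceed scale - goal_padding (the top-right
    # corner of the grid), so the agent has to explore to ever see the bonus.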
43 | 44 | def step(self, action): 45 | x, y = action 46 | 47 | # next state 48 | new_x = self.state[0]+x 49 | new_y = self.state[1]+y 50 | if new_x < 0: 51 | new_x = 0 52 | if new_x > self.scale: 53 | new_x = self.scale 54 | if new_y < 0: 55 | new_y = 0 56 | if new_y > self.scale: 57 | new_y = self.scale 58 | self.state = np.array([new_x, new_y]) 59 | state = self.state/self.scale 60 | 61 | # reward 62 | reg_term = -0.01*np.sum(action**2) 63 | 64 | threshold = self.scale - self.goal_padding 65 | if new_x > threshold and new_y > threshold: 66 | reward = 10 + reg_term 67 | else: 68 | reward = 0 + reg_term 69 | 70 | # done 71 | done = False 72 | 73 | return state, reward, done, None 74 | 75 | def preprocess(self, state): 76 | scaled_state = self.scale * state 77 | x_floor, y_floor = np.floor(scaled_state) 78 | assert x_floor <= self.scale 79 | assert y_floor <= self.scale 80 | if x_floor == self.scale: 81 | x_floor -= 1 82 | if y_floor == self.scale: 83 | y_floor -= 1 84 | index = self.scale*x_floor + y_floor 85 | return index 86 | 87 | def unprocess(self, index): 88 | x_floor = index // self.scale 89 | y_floor = index % self.scale 90 | unscaled_state = np.array([x_floor, y_floor])/self.scale 91 | return unscaled_state 92 | 93 | def seed(self, seed): 94 | pass 95 | 96 | def render(self): 97 | # create a grid 98 | states = [self.state/self.scale] 99 | indices = np.array([int(self.preprocess(s)) for s in states]) 100 | a = np.zeros(self.grid_size) 101 | for i in indices: 102 | a[i] += 1 103 | max_freq = np.max(a) 104 | a/=float(max_freq) # normalize 105 | a = np.reshape(a, (self.scale, self.scale)) 106 | ax = sns.heatmap(a) 107 | plt.draw() 108 | plt.pause(0.001) 109 | plt.clf() 110 | 111 | def visualize(self, states, itr, dirname): 112 | if states is None: 113 | states = np.load(os.path.join(dirname, '{}.npy'.format(itr))) 114 | indices = np.array([int(self.preprocess(s)) for s in states]) 115 | a = np.zeros(int(self.grid_size)) 116 | for i in indices: 117 | a[i] += 1 118 | max_freq = np.max(a) 119 | a/=float(max_freq) # normalize 120 | a = np.reshape(a, (self.scale, self.scale)) 121 | ax = sns.heatmap(a) 122 | plt.savefig(os.path.join(dirname, '{}.png'.format(itr))) 123 | plt.close() 124 | 125 | def create_gif(self, dirname, density=False): 126 | images = [] 127 | if density: 128 | filenames = [x for x in os.listdir(dirname) if '_density.png' in x] 129 | sorted_fnames = sorted(filenames, key=lambda x: int(x.split('_density.png')[0])) 130 | else: 131 | filenames = [x for x in os.listdir(dirname) if ('.png' in x and 'density' not in x)] 132 | sorted_fnames = sorted(filenames, key=lambda x: int(x.split('.png')[0])) 133 | for f in sorted_fnames: 134 | images.append(imageio.imread(os.path.join(dirname, f))) 135 | imageio.mimsave(os.path.join(dirname, 'exploration.gif'), images) 136 | 137 | def create_visualization(self, dirname, density=False): 138 | for s in os.listdir(dirname): 139 | for i in tqdm(range(100)): 140 | self.visualize(None, i, os.path.join(dirname, s)) 141 | self.create_gif(os.path.join(dirname, str(s))) 142 | 143 | def debug(): 144 | logdir = 'pm_debug' 145 | os.mkdir(logdir) 146 | num_episodes = 10 147 | num_steps_per_epoch = 20 148 | 149 | env = PointMass() 150 | for epoch in range(num_episodes): 151 | states = [] 152 | state = env.reset() 153 | for i in range(num_steps_per_epoch): 154 | action = np.random.rand(2) 155 | state, reward, done, _ = env.step(action) 156 | states.append(state) 157 | env.visualize(np.array(states), epoch, logdir) 158 | env.create_gif(logdir) 159 | 160 | 
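# Illustrative sketch of how this environment plugs into the count-based
# exploration bonus. The exact wiring lives in train_ac_exploration_f18.py,
# which is not shown here, so the snippet below is an assumption; Histogram
# and DiscreteExploration are the skeleton classes from density_model.py and
# exploration.py. Kept as comments so importing this module stays side-effect
# free.
#
#   from density_model import Histogram
#   from exploration import DiscreteExploration
#
#   env = PointMass(scale=20)
#   density = Histogram(nbins=env.grid_size, preprocessor=env.preprocess)
#   exploration = DiscreteExploration(density, bonus_coeff=0.01)
#   # each iteration, after collecting a batch of (normalized) states:
#   exploration.fit_density_model(states)
#   new_rewards = exploration.modify_reward(rewards, states)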
161 | if __name__ == "__main__": 162 | # debug() # run this if you want to get a feel for how the PointMass environment works (make sure to comment out the code below) 163 | import argparse 164 | parser = argparse.ArgumentParser() 165 | parser.add_argument('dirname', type=str) 166 | args = parser.parse_args() 167 | env = PointMass() 168 | env.create_visualization(args.dirname) 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /hw5/exp/replay.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import copy 4 | 5 | class Replay_Buffer(object): 6 | def __init__(self, max_size=np.inf): 7 | self.memory = [] 8 | self.max_size = int(max_size) 9 | 10 | def adjust_size(self): 11 | if len(self.memory) > self.max_size: 12 | diff = int(len(self.memory) - self.max_size) 13 | self.memory = self.memory[:-diff] # FIFO 14 | print('Adjusted replay size') 15 | 16 | def prepend(self, x): 17 | # assume x is a list of states 18 | self.memory = list(x) + self.memory 19 | self.adjust_size() 20 | 21 | def sample(self, batch_size): 22 | random_batch = random.sample(self.memory, batch_size) 23 | return random_batch 24 | 25 | def __len__(self): 26 | return len(self.memory) 27 | 28 | def __getitem__(self, indices): 29 | return copy.deepcopy(np.array([self.memory[i] for i in indices])) 30 | 31 | def get_memory(self): 32 | return copy.deepcopy(self.memory) 33 | 34 | def clear_buffer(self): 35 | del self.memory[:] -------------------------------------------------------------------------------- /hw5/exp/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | mujoco-py==1.50.1.56 3 | tensorflow 4 | numpy 5 | seaborn 6 | tqdm -------------------------------------------------------------------------------- /hw5/exp/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ########################## 4 | ### P1 Hist PointMass ### 5 | ########################## 6 | 7 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model none -s 8 --exp_name PM_bc0_s8 8 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model hist -bc 0.01 -s 8 --exp_name PM_hist_bc0.01_s8 9 | 10 | ########################## 11 | ### P2 RBF PointMass ### 12 | ########################## 13 | 14 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model rbf -bc 0.01 -s 8 -sig 0.2 --exp_name PM_rbf_bc0.01_s8_sig0.2 15 | 16 | ########################## 17 | ### P3 EX2 PointMass ### 18 | ########################## 19 | 20 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model ex2 -s 8 -bc 0.05 -kl 0.1 -dlr 0.001 -dh 8 -dti 1000 --exp_name PM_ex2_s8_bc0.05_kl0.1_dlr0.001_dh8_dti1000 21 | 22 | ########################### 23 | ### P4 HalfCheetah ### 24 | ########################### 25 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model none --exp_name HC_bc0 26 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.001 -kl 0.1 -dlr 0.005 -dti 1000 --exp_name HC_bc0.001_kl0.1_dlr0.005_dti1000 27 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model 
ex2 -bc 0.0001 -kl 0.1 -dlr 0.005 -dti 10000 --exp_name HC_bc0.0001_kl0.1_dlr0.005_dti10000 28 | -------------------------------------------------------------------------------- /hw5/exp/sparse_half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 8 | utils.EzPickle.__init__(self) 9 | 10 | def step(self, action): 11 | ################################################# 12 | ctrl = False 13 | relu = False 14 | threshold = 10.0 15 | ################################################# 16 | xposbefore = self.sim.data.qpos[0] 17 | self.do_simulation(action, self.frame_skip) 18 | xposafter = self.sim.data.qpos[0] 19 | ob = self._get_obs() 20 | # reward_ctrl = - 0.1 * np.square(action).sum() 21 | # reward_run = (xposafter - xposbefore)/self.dt 22 | ################################################# 23 | if ctrl: 24 | reward_ctrl = - 0.1 * np.square(action).sum() 25 | else: 26 | reward_ctrl = 0 27 | if abs(xposafter) <= threshold: 28 | reward_run = 0.0 29 | else: 30 | if relu: 31 | reward_run = np.sign(xposafter)*(xposafter - xposbefore)/self.dt 32 | else: 33 | reward_run = 1.0 34 | ################################################# 35 | reward = reward_ctrl + reward_run 36 | done = False 37 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 38 | 39 | def _get_obs(self): 40 | return np.concatenate([ 41 | self.sim.data.qpos.flat[1:], 42 | self.sim.data.qvel.flat, 43 | ]) 44 | 45 | def reset_model(self): 46 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 47 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 48 | self.set_state(qpos, qvel) 49 | return self._get_obs() 50 | 51 | def viewer_setup(self): 52 | self.viewer.cam.distance = self.model.stat.extent * 0.5 53 | -------------------------------------------------------------------------------- /hw5/meta/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5c: Meta-Learning 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version 1.14.5 6 | * TensorFlow version 1.10.5 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * OpenAI Gym version **0.10.5** 9 | * seaborn 10 | * Box2D==2.3.2 11 | 12 | See the [HW5c PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5c.pdf) for further instructions. 13 | -------------------------------------------------------------------------------- /hw5/meta/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/meta/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw5/meta/point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from gym import Env 4 | 5 | 6 | class PointEnv(Env): 7 | """ 8 | point mass on a 2-D plane 9 | goals are sampled randomly from a square 10 | """ 11 | 12 | def __init__(self, num_tasks=1): 13 | self.reset_task() 14 | self.reset() 15 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,)) 16 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,)) 17 | 18 | def reset_task(self, is_evaluation=False): 19 | ''' 20 | sample a new task randomly 21 | 22 | Problem 3: make training and evaluation goals disjoint sets 23 | if `is_evaluation` is true, sample from the evaluation set, 24 | otherwise sample from the training set 25 | ''' 26 | #====================================================================================# 27 | # ----------PROBLEM 3---------- 28 | #====================================================================================# 29 | # YOUR CODE HERE 30 | x = np.random.uniform(-10, 10) 31 | y = np.random.uniform(-10, 10) 32 | self._goal = np.array([x, y]) 33 | 34 | def reset(self): 35 | self._state = np.array([0, 0], dtype=np.float32) 36 | return self._get_obs() 37 | 38 | def _get_obs(self): 39 | return np.copy(self._state) 40 | 41 | def reward_function(self, x, y): 42 | return - (x ** 2 + y ** 2) ** 0.5 43 | 44 | def step(self, action): 45 | x, y = self._state 46 
| # compute reward, add penalty for large actions instead of clipping them 47 | x -= self._goal[0] 48 | y -= self._goal[1] 49 | # check if task is complete 50 | done = abs(x) < .01 and abs(y) < .01 51 | reward = self.reward_function(x, y) 52 | # move to next state 53 | self._state = self._state + action 54 | ob = self._get_obs() 55 | return ob, reward, done, dict() 56 | 57 | def viewer_setup(self): 58 | print('no viewer') 59 | pass 60 | 61 | def render(self): 62 | print('current state:', self._state) 63 | 64 | def seed(self, seed): 65 | np.random.seed = seed 66 | -------------------------------------------------------------------------------- /hw5/meta/point_mass_observed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from gym import Env 4 | 5 | 6 | class ObservedPointEnv(Env): 7 | """ 8 | point mass on a 2-D plane 9 | four tasks: move to (-10, -10), (-10, 10), (10, -10), (10, 10) 10 | 11 | Problem 1: augment the observation with a one-hot vector encoding the task ID 12 | - change the dimension of the observation space 13 | - augment the observation with a one-hot vector that encodes the task ID 14 | """ 15 | #====================================================================================# 16 | # ----------PROBLEM 1---------- 17 | #====================================================================================# 18 | # YOUR CODE SOMEWHERE HERE 19 | def __init__(self, num_tasks=1): 20 | self.tasks = [0, 1, 2, 3][:num_tasks] 21 | self.task_idx = -1 22 | self.reset_task() 23 | self.reset() 24 | 25 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,)) 26 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,)) 27 | 28 | def reset_task(self, is_evaluation=False): 29 | # for evaluation, cycle deterministically through all tasks 30 | if is_evaluation: 31 | self.task_idx = (self.task_idx + 1) % len(self.tasks) 32 | # during training, sample tasks randomly 33 | else: 34 | self.task_idx = np.random.randint(len(self.tasks)) 35 | self._task = self.tasks[self.task_idx] 36 | goals = [[-1, -1], [-1, 1], [1, -1], [1, 1]] 37 | self._goal = np.array(goals[self.task_idx])*10 38 | 39 | def reset(self): 40 | self._state = np.array([0, 0], dtype=np.float32) 41 | return self._get_obs() 42 | 43 | def _get_obs(self): 44 | return np.copy(self._state) 45 | 46 | def step(self, action): 47 | x, y = self._state 48 | # compute reward, add penalty for large actions instead of clipping them 49 | x -= self._goal[0] 50 | y -= self._goal[1] 51 | reward = - (x ** 2 + y ** 2) ** 0.5 52 | # check if task is complete 53 | done = abs(x) < 0.01 and abs(y) < 0.01 54 | # move to next state 55 | self._state = self._state + action 56 | ob = self._get_obs() 57 | return ob, reward, done, dict() 58 | 59 | def viewer_setup(self): 60 | print('no viewer') 61 | pass 62 | 63 | def render(self): 64 | print('current state:', self._state) 65 | 66 | def seed(self, seed): 67 | np.random.seed = seed 68 | -------------------------------------------------------------------------------- /hw5/meta/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | ''' 5 | minimalistic replay buffer 6 | 7 | a sample consists of 8 | - observation 9 | - action 10 | - reward 11 | - terminal 12 | - hidden state for recurrent policy 13 | 14 | it is memory inefficient to store windowed observations this way 15 | so do not run on tasks 
with large observations (e.g. from vision) 16 | ''' 17 | 18 | def __init__(self, max_size, ob_dim, ac_dim, hidden_dim, task_dim): 19 | self.max_size = max_size 20 | self.ob_dim = ob_dim 21 | self.ac_dim = ac_dim 22 | self.hidden_dim = hidden_dim 23 | self.task_dim = task_dim 24 | self.flush() 25 | 26 | def flush(self): 27 | ''' 28 | set buffer to empty 29 | ''' 30 | self._observations = np.zeros((self.max_size, *self.ob_dim)) 31 | self._actions = np.zeros((self.max_size, *self.ac_dim)) 32 | self._rewards = np.zeros((self.max_size, 1)) 33 | self._terminals = np.zeros((self.max_size, 1)) 34 | self._hiddens = np.zeros((self.max_size, self.hidden_dim)) 35 | self._tasks = np.zeros((self.max_size, self.task_dim)) 36 | self._top = 0 37 | self._size = 0 38 | 39 | def _advance(self): 40 | ''' 41 | move pointer to top of buffer 42 | if end of buffer is reached, overwrite oldest data 43 | ''' 44 | self._top = (self._top + 1) % self.max_size 45 | if self._size < self.max_size: 46 | self._size += 1 47 | 48 | def add_sample(self, ob, ac, re, te, hi, task): 49 | ''' 50 | add sample to buffer 51 | ''' 52 | self._observations[self._top] = ob 53 | self._actions[self._top] = ac 54 | self._rewards[self._top] = re 55 | self._terminals[self._top] = te 56 | self._hiddens[self._top] = hi 57 | self._tasks[self._top] = task 58 | 59 | self._advance() 60 | 61 | def get_samples(self, indices): 62 | ''' 63 | return buffer data indexed by `indices` 64 | ''' 65 | return dict( 66 | observations=self._observations[indices], 67 | actions=self._actions[indices], 68 | rewards=self._rewards[indices], 69 | terminals=self._terminals[indices], 70 | hiddens=self._hiddens[indices], 71 | tasks=self._tasks[indices], 72 | ) 73 | 74 | def random_batch(self, batch_size): 75 | ''' 76 | return random sample of `batch_size` transitions 77 | ''' 78 | indices = np.random.randint(0, self._size, batch_size) 79 | return self.get_samples(indices) 80 | 81 | def all_batch(self): 82 | ''' 83 | return all data in the buffer 84 | ''' 85 | indices = list(range(self._size)) 86 | return self.get_samples(indices) 87 | 88 | def num_steps_can_sample(self): 89 | return self._size 90 | 91 | 92 | 93 | class PPOReplayBuffer(object): 94 | ''' 95 | replay buffer for PPO algorithm 96 | store fixed log probs, advantages, and returns for use in multiple updates 97 | 98 | n.b. 
samples must be added as a batch, and we assume that the 99 | batch is the same size as that of the simple buffer 100 | ''' 101 | 102 | def __init__(self, simple_buffer): 103 | self.simple_buffer = simple_buffer 104 | self.max_size = self.simple_buffer.max_size 105 | self.flush() 106 | 107 | def flush(self): 108 | self.simple_buffer.flush() 109 | self._log_probs = np.zeros((self.max_size, 1)) 110 | self._advantages = np.zeros((self.max_size, 1)) 111 | self._returns = np.zeros((self.max_size, 1)) 112 | 113 | def add_samples(self, lp, adv, ret): 114 | self._log_probs = lp 115 | self._advantages = adv 116 | self._returns = ret 117 | 118 | def get_samples(self, indices): 119 | return dict( 120 | log_probs = self._log_probs[indices], 121 | advantages = self._advantages[indices], 122 | returns = self._returns[indices], 123 | ) 124 | 125 | def random_batch(self, batch_size): 126 | indices = np.random.randint(0, self.simple_buffer._size, batch_size) 127 | simple = self.simple_buffer.get_samples(indices) 128 | ppo = self.get_samples(indices) 129 | return {**simple, **ppo} 130 | -------------------------------------------------------------------------------- /hw5/meta/requirements.txt: -------------------------------------------------------------------------------- 1 | mujoco-py==1.50.1.56 2 | gym==0.10.5 3 | tensorflow==1.10.0 4 | numpy==1.14.5 5 | scipy==1.1.0 6 | tensorflow-probability==0.3.0 7 | seaborn 8 | Box2D==2.3.2 9 | -------------------------------------------------------------------------------- /hw5/sac/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5b: Soft Actor Critic 2 | Original code from Tuomas Haarnoja, Soroush Nasiriany, and Aurick Zhou for CS294-112 Fall 2018 3 | 4 | Dependencies: 5 | * Python **3.4.5** 6 | * Numpy version **1.15.2** 7 | * TensorFlow version **1.10.0** 8 | * tensorflow-probability version **0.4.0** 9 | * OpenAI Gym version **0.10.8** 10 | * MuJoCo version **1.50** and mujoco-py **1.50.1.59** 11 | * seaborn version **0.9.0** 12 | 13 | You will implement `sac.py`, and `nn.py`. 14 | 15 | See the [HW5 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5b.pdf) for further instructions. 16 | -------------------------------------------------------------------------------- /hw5/sac/environment.yml: -------------------------------------------------------------------------------- 1 | name: hw5-sac 2 | dependencies: 3 | - python==3.4.5 4 | - pip: 5 | - gym==0.10.8 6 | - numpy==1.15.2 7 | - tensorflow==1.10.0 8 | - tensorflow-probability==0.4.0 9 | - mujoco-py==1.50.1.59 10 | - seaborn==0.9.0 11 | -------------------------------------------------------------------------------- /hw5/sac/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, indent=2, separators=(',', ': '), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/sac/nn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras import layers 4 | from tensorflow_probability import distributions 5 | from tensorflow.python import keras 6 | from tensorflow.python.keras.engine.network import Network 7 | 8 | 9 | class QFunction(Network): 10 | def __init__(self, hidden_layer_sizes, **kwargs): 11 | super(QFunction, self).__init__(**kwargs) 12 | self._hidden_layer_sizes = hidden_layer_sizes 13 | 14 | def build(self, input_shape): 15 | inputs = [ 16 | layers.Input(batch_shape=input_shape[0], name='observations'), 17 | layers.Input(batch_shape=input_shape[1], name='actions') 18 | ] 19 | 20 | x = layers.Concatenate(axis=1)(inputs) 21 | for hidden_units in self._hidden_layer_sizes: 22 | x = layers.Dense(hidden_units, activation='relu')(x) 23 | q_values = layers.Dense(1, activation=None)(x) 24 | 25 | self._init_graph_network(inputs, q_values) 26 | super(QFunction, self).build(input_shape) 27 | 28 | 29 | class ValueFunction(Network): 30 | def __init__(self, hidden_layer_sizes, **kwargs): 31 | super(ValueFunction, self).__init__(**kwargs) 32 | self._hidden_layer_sizes = hidden_layer_sizes 33 | 34 | def build(self, input_shape): 35 | inputs = layers.Input(batch_shape=input_shape, name='observations') 36 | 37 | x = inputs 38 | for hidden_units in self._hidden_layer_sizes: 39 | x = layers.Dense(hidden_units, activation='relu')(x) 40 | values = layers.Dense(1, activation=None)(x) 41 | 42 | self._init_graph_network(inputs, values) 43 | super(ValueFunction, self).build(input_shape) 44 | 45 | 46 | class GaussianPolicy(Network): 47 | def __init__(self, action_dim, hidden_layer_sizes, reparameterize, **kwargs): 48 | super(GaussianPolicy, self).__init__(**kwargs) 49 | self._action_dim = action_dim 50 | self._f = None 51 | self._hidden_layer_sizes = hidden_layer_sizes 52 | self._reparameterize = reparameterize 53 | 54 | def 
build(self, input_shape): 55 | inputs = layers.Input(batch_shape=input_shape, name='observations') 56 | 57 | x = inputs 58 | for hidden_units in self._hidden_layer_sizes: 59 | x = layers.Dense(hidden_units, activation='relu')(x) 60 | 61 | mean_and_log_std = layers.Dense( 62 | self._action_dim * 2, activation=None)(x) 63 | 64 | def create_distribution_layer(mean_and_log_std): 65 | mean, log_std = tf.split( 66 | mean_and_log_std, num_or_size_splits=2, axis=1) 67 | log_std = tf.clip_by_value(log_std, -20., 2.) 68 | 69 | distribution = distributions.MultivariateNormalDiag( 70 | loc=mean, 71 | scale_diag=tf.exp(log_std)) 72 | 73 | raw_actions = distribution.sample() 74 | if not self._reparameterize: 75 | ### Problem 1.3.A 76 | ### YOUR CODE HERE 77 | raise NotImplementedError 78 | log_probs = distribution.log_prob(raw_actions) 79 | log_probs -= self._squash_correction(raw_actions) 80 | 81 | actions = None 82 | ### Problem 2.A 83 | ### YOUR CODE HERE 84 | raise NotImplementedError 85 | 86 | return actions, log_probs 87 | 88 | samples, log_probs = layers.Lambda(create_distribution_layer)( 89 | mean_and_log_std) 90 | 91 | self._init_graph_network(inputs=inputs, outputs=[samples, log_probs]) 92 | super(GaussianPolicy, self).build(input_shape) 93 | 94 | def _squash_correction(self, raw_actions): 95 | ### Problem 2.B 96 | ### YOUR CODE HERE 97 | raise NotImplementedError 98 | 99 | def eval(self, observation): 100 | assert self.built and observation.ndim == 1 101 | 102 | if self._f is None: 103 | self._f = keras.backend.function(self.inputs, [self.outputs[0]]) 104 | 105 | action, = self._f([observation[None]]) 106 | return action.flatten() 107 | -------------------------------------------------------------------------------- /hw5/sac/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
--------------------------------------------------------------------------------
/hw5/sac/plot.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import json
5 | import os
6 | 
7 | """
8 | Using the plotter:
9 | 
10 | Call it from the command line, and supply it with logdirs to experiments.
11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10
12 | random seeds. The runner code stored it in the directory structure
13 | 
14 |     data
15 |     L test_EnvName_DateTime
16 |         L  0
17 |             L log.txt
18 |             L params.json
19 |         L  1
20 |             L log.txt
21 |             L params.json
22 |         .
23 |         .
24 |         .
25 |         L  9
26 |             L log.txt
27 |             L params.json
28 | 
29 | To plot learning curves from the experiment, averaged over all random
30 | seeds, call
31 | 
32 |     python plot.py data/test_EnvName_DateTime --value AverageReturn
33 | 
34 | and voila. To see a different statistic, change what you put in for
35 | the keyword --value. You can also enter /multiple/ values, and it will
36 | plot all of them in order.
37 | 
38 | 
39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
40 | a different set of hyperparameters from 'test1', and now you would like
41 | to compare them -- see their learning curves side-by-side. Just call
42 | 
43 |     python plot.py data/test1 data/test2
44 | 
45 | and it will plot them both! They will be given titles in the legend according
46 | to their exp_name parameters. If you want to use custom legend titles, use
47 | the --legend flag and then provide a title for each logdir.
48 | 
49 | """
50 | 
51 | def plot_data(data, value="AverageReturn"):
52 |     if isinstance(data, list):
53 |         data = pd.concat(data, ignore_index=True)
54 | 
55 |     sns.set(style="darkgrid", font_scale=1.5)
56 |     sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
57 |     plt.legend(loc='best').draggable()
58 |     plt.show()
59 | 
60 | 
61 | def get_datasets(fpath, condition=None):
62 |     unit = 0
63 |     datasets = []
64 |     for root, dir, files in os.walk(fpath):
65 |         if 'log.txt' in files:
66 |             param_path = open(os.path.join(root,'params.json'))
67 |             params = json.load(param_path)
68 |             exp_name = params['exp_name']
69 | 
70 |             log_path = os.path.join(root,'log.txt')
71 |             experiment_data = pd.read_table(log_path)
72 | 
73 |             experiment_data.insert(
74 |                 len(experiment_data.columns),
75 |                 'Unit',
76 |                 unit
77 |                 )
78 |             experiment_data.insert(
79 |                 len(experiment_data.columns),
80 |                 'Condition',
81 |                 condition or exp_name
82 |                 )
83 | 
84 |             datasets.append(experiment_data)
85 |             unit += 1
86 | 
87 |     return datasets
88 | 
89 | 
90 | def main():
91 |     import argparse
92 |     parser = argparse.ArgumentParser()
93 |     parser.add_argument('logdir', nargs='*')
94 |     parser.add_argument('--legend', nargs='*')
95 |     parser.add_argument('--value', default='LastEpReturn', nargs='*')
96 |     args = parser.parse_args()
97 | 
98 |     use_legend = False
99 |     if args.legend is not None:
100 |         assert len(args.legend) == len(args.logdir), \
101 |             "Must give a legend title for each set of experiments."
102 |         use_legend = True
103 | 
104 |     data = []
105 |     if use_legend:
106 |         for logdir, legend_title in zip(args.logdir, args.legend):
107 |             data += get_datasets(logdir, legend_title)
108 |     else:
109 |         for logdir in args.logdir:
110 |             data += get_datasets(logdir)
111 | 
112 |     if isinstance(args.value, list):
113 |         values = args.value
114 |     else:
115 |         values = [args.value]
116 |     for value in values:
117 |         plot_data(data, value=value)
118 | 
119 | if __name__ == "__main__":
120 |     main()
121 | 
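The plotter can also be driven programmatically with the same two functions. A minimal sketch, where the log directory names are placeholders for folders produced by train_mujoco.py rather than real paths:

    from plot import get_datasets, plot_data

    data = []
    data += get_datasets('data/sac_HalfCheetah-v2_run1_01-01-2019_00-00-00', condition='run1')
    data += get_datasets('data/sac_HalfCheetah-v2_run2_01-01-2019_00-00-00', condition='run2')

    # Plot two of the statistics that the SAC training loop logs each iteration.
    for value in ['LastEpReturn', 'MaxEpReturn']:
        plot_data(data, value=value)

Each directory is walked recursively, so passing the parent experiment directory picks up every seed's log.txt and averages over them in the plot.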
--------------------------------------------------------------------------------
/hw5/sac/sac.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import time
3 | 
4 | class SAC:
5 |     """Soft Actor-Critic (SAC)
6 |     Original code from Tuomas Haarnoja, Soroush Nasiriany, and Aurick Zhou for CS294-112 Fall 2018
7 | 
8 |     References
9 |     ----------
10 |     [1] Tuomas Haarnoja, Aurick Zhou, Pieter Abbeel, and Sergey Levine, "Soft
11 |         Actor-Critic: Off-Policy Maximum Entropy Deep Reinforcement Learning
12 |         with a Stochastic Actor," ICML 2018.
13 |     """
14 | 
15 |     def __init__(self,
16 |                  alpha=1.0,
17 |                  batch_size=256,
18 |                  discount=0.99,
19 |                  epoch_length=1000,
20 |                  learning_rate=3e-3,
21 |                  reparameterize=False,
22 |                  tau=0.01,
23 |                  **kwargs):
24 |         """
25 |         Args:
26 |         """
27 | 
28 |         self._alpha = alpha
29 |         self._batch_size = batch_size
30 |         self._discount = discount
31 |         self._epoch_length = epoch_length
32 |         self._learning_rate = learning_rate
33 |         self._reparameterize = reparameterize
34 |         self._tau = tau
35 | 
36 |         self._training_ops = []
37 | 
38 |     def build(self, env, policy, q_function, q_function2, value_function,
39 |               target_value_function):
40 | 
41 |         self._create_placeholders(env)
42 | 
43 |         policy_loss = self._policy_loss_for(policy, q_function, q_function2, value_function)
44 |         value_function_loss = self._value_function_loss_for(
45 |             policy, q_function, q_function2, value_function)
46 |         q_function_loss = self._q_function_loss_for(q_function,
47 |                                                     target_value_function)
48 |         if q_function2 is not None:
49 |             q_function2_loss = self._q_function_loss_for(q_function2,
50 |                                                          target_value_function)
51 | 
52 |         optimizer = tf.train.AdamOptimizer(
53 |             self._learning_rate, name='optimizer')
54 |         policy_training_op = optimizer.minimize(
55 |             loss=policy_loss, var_list=policy.trainable_variables)
56 |         value_training_op = optimizer.minimize(
57 |             loss=value_function_loss,
58 |             var_list=value_function.trainable_variables)
59 |         q_function_training_op = optimizer.minimize(
60 |             loss=q_function_loss, var_list=q_function.trainable_variables)
61 |         if q_function2 is not None:
62 |             q_function2_training_op = optimizer.minimize(
63 |                 loss=q_function2_loss, var_list=q_function2.trainable_variables)
64 | 
65 |         self._training_ops = [
66 |             policy_training_op, value_training_op, q_function_training_op
67 |         ]
68 |         if q_function2 is not None:
69 |             self._training_ops += [q_function2_training_op]
70 |         self._target_update_ops = self._create_target_update(
71 |             source=value_function, target=target_value_function)
72 | 
73 |         tf.get_default_session().run(tf.global_variables_initializer())
74 | 
75 |     def _create_placeholders(self, env):
76 |         observation_dim = env.observation_space.shape[0]
77 |         action_dim = env.action_space.shape[0]
78 | 
79 |         self._observations_ph = tf.placeholder(
80 |             tf.float32,
81 |             shape=(None, observation_dim),
82 |             name='observation',
83 |         )
84 |         self._next_observations_ph = tf.placeholder(
85 |             tf.float32,
86 |             shape=(None, observation_dim),
87 |             name='next_observation',
88 |         )
89 |         self._actions_ph = tf.placeholder(
90 |             tf.float32,
91 |             shape=(None, action_dim),
92 |             name='actions',
93 |         )
94 |         self._rewards_ph = tf.placeholder(
95 |             tf.float32,
96 |             shape=(None, ),
97 |             name='rewards',
98 |         )
99 |         self._terminals_ph = tf.placeholder(
100 |             tf.float32,
101 |             shape=(None, ),
102 |             name='terminals',
103 |         )
104 | 
105 |     def _policy_loss_for(self, policy, q_function, q_function2, value_function):
106 |         if not self._reparameterize:
107 |             ### Problem 1.3.A
108 |             ### YOUR CODE HERE
109 |             raise NotImplementedError
110 |         else:
111 |             ### Problem 1.3.B
112 |             ### YOUR CODE HERE
113 |             raise NotImplementedError
114 | 
115 |     def _value_function_loss_for(self, policy, q_function, q_function2, value_function):
116 |         ### Problem 1.2.A
117 |         ### YOUR CODE HERE
118 |         raise NotImplementedError
119 | 
120 |     def _q_function_loss_for(self, q_function, target_value_function):
121 |         ### Problem 1.1.A
122 |         ### YOUR CODE HERE
123 |         raise NotImplementedError
124 | 
125 |     def _create_target_update(self, source, target):
126 |         """Create tensorflow operations for updating target value function."""
127 | 
128 |         return [
129 |             tf.assign(target, (1 - self._tau) * target + self._tau * source)
130 |             for target, source in zip(target.trainable_variables, source.
131 |                                       trainable_variables)
132 |         ]
133 | 
134 |     def train(self, sampler, n_epochs=1000):
135 |         """Return a generator that performs RL training.
136 | 
137 |         Args:
138 |             sampler (`utils.Sampler`): Sampler used to step the environment
139 |                 with the current policy and to draw training batches from
140 |                 the replay pool it was initialized with.
141 |             n_epochs (`int`): Number of training epochs; each epoch runs
142 |                 `epoch_length` environment steps with one gradient update
143 |                 per step, then yields the epoch index for logging.
144 |         """
145 |         self._start = time.time()
146 |         for epoch in range(n_epochs):
147 |             for t in range(self._epoch_length):
148 |                 sampler.sample()
149 | 
150 |                 batch = sampler.random_batch(self._batch_size)
151 |                 feed_dict = {
152 |                     self._observations_ph: batch['observations'],
153 |                     self._actions_ph: batch['actions'],
154 |                     self._next_observations_ph: batch['next_observations'],
155 |                     self._rewards_ph: batch['rewards'],
156 |                     self._terminals_ph: batch['terminals'],
157 |                 }
158 |                 tf.get_default_session().run(self._training_ops, feed_dict)
159 |                 tf.get_default_session().run(self._target_update_ops)
160 | 
161 |             yield epoch
162 | 
163 |     def get_statistics(self):
164 |         statistics = {
165 |             'Time': time.time() - self._start,
166 |             'TimestepsThisBatch': self._epoch_length,
167 |         }
168 | 
169 |         return statistics
170 | 
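For orientation, the stubs in _q_function_loss_for, _value_function_loss_for, and _policy_loss_for correspond to the standard SAC objectives from Haarnoja et al. (2018). The sketch below states them over generic callables (q, v, v_target for the networks, policy returning sampled actions and log-probabilities); it is an assumption-laden outline of the single-Q, reparameterized case, not the assignment's reference solution. With two Q-functions, the minimum of the two Q-values is typically used in the value and policy terms, and targets are soft-updated as V_target <- (1 - tau) * V_target + tau * V, exactly as _create_target_update does above.

    import tensorflow as tf

    def sac_losses(obs, actions, next_obs, rewards, terminals,
                   policy, q, v, v_target, alpha=0.2, discount=0.99):
        # Q-function loss: fit Q(s, a) to r + gamma * (1 - done) * V_target(s').
        q_backup = rewards + discount * (1.0 - terminals) * tf.squeeze(v_target(next_obs), axis=1)
        q_loss = tf.reduce_mean(
            (tf.squeeze(q(obs, actions), axis=1) - tf.stop_gradient(q_backup)) ** 2)

        # Value-function loss: fit V(s) to E_pi[Q(s, a~) - alpha * log pi(a~|s)].
        sampled_actions, log_probs = policy(obs)
        v_backup = tf.squeeze(q(obs, sampled_actions), axis=1) - alpha * log_probs
        v_loss = tf.reduce_mean(
            (tf.squeeze(v(obs), axis=1) - tf.stop_gradient(v_backup)) ** 2)

        # Policy loss (reparameterized): minimize E[alpha * log pi(a~|s) - Q(s, a~)].
        policy_loss = tf.reduce_mean(
            alpha * log_probs - tf.squeeze(q(obs, sampled_actions), axis=1))

        return q_loss, v_loss, policy_loss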
--------------------------------------------------------------------------------
/hw5/sac/train_mujoco.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | import logz
4 | import numpy as np
5 | import os
6 | import tensorflow as tf
7 | import time
8 | 
9 | import nn
10 | from sac import SAC
11 | import utils
12 | 
13 | from multiprocessing import Process
14 | 
15 | def train_SAC(env_name, exp_name, seed, logdir):
16 |     alpha = {
17 |         'Ant-v2': 0.1,
18 |         'HalfCheetah-v2': 0.2,
19 |         'Hopper-v2': 0.2,
20 |         'Humanoid-v2': 0.05,
21 |         'Walker2d-v2': 0.2,
22 |     }.get(env_name, 0.2)
23 | 
24 |     algorithm_params = {
25 |         'alpha': alpha,
26 |         'batch_size': 256,
27 |         'discount': 0.99,
28 |         'learning_rate': 1e-3,
29 |         'reparameterize': False,
30 |         'tau': 0.01,
31 |         'epoch_length': 1000,
32 |         'n_epochs': 500,
33 |         'two_qf': False,
34 |     }
35 |     sampler_params = {
36 |         'max_episode_length': 1000,
37 |         'prefill_steps': 1000,
38 |     }
39 |     replay_pool_params = {
40 |         'max_size': 1e6,
41 |     }
42 | 
43 |     value_function_params = {
44 |         'hidden_layer_sizes': (128, 128),
45 |     }
46 | 
47 |     q_function_params = {
48 |         'hidden_layer_sizes': (128, 128),
49 |     }
50 | 
51 |     policy_params = {
52 |         'hidden_layer_sizes': (128, 128),
53 |     }
54 | 
55 |     logz.configure_output_dir(logdir)
56 |     params = {
57 |         'exp_name': exp_name,
58 |         'env_name': env_name,
59 |         'algorithm_params': algorithm_params,
60 |         'sampler_params': sampler_params,
61 |         'replay_pool_params': replay_pool_params,
62 |         'value_function_params': value_function_params,
63 |         'q_function_params': q_function_params,
64 |         'policy_params': policy_params
65 |     }
66 |     logz.save_params(params)
67 | 
68 |     env = gym.envs.make(env_name)
69 |     # Set random seeds
70 |     tf.set_random_seed(seed)
71 |     np.random.seed(seed)
72 |     env.seed(seed)
73 | 
74 |     sampler = utils.SimpleSampler(**sampler_params)
75 |     replay_pool = utils.SimpleReplayPool(
76 |         observation_shape=env.observation_space.shape,
77 |         action_shape=env.action_space.shape,
78 |         **replay_pool_params)
79 | 
80 |     q_function = nn.QFunction(name='q_function', **q_function_params)
81 |     if algorithm_params.get('two_qf', False):
82 |         q_function2 = nn.QFunction(name='q_function2', **q_function_params)
83 |     else:
84 |         q_function2 = None
85 |     value_function = nn.ValueFunction(
86 |         name='value_function', **value_function_params)
87 |     target_value_function = nn.ValueFunction(
88 |         name='target_value_function', **value_function_params)
89 |     policy = nn.GaussianPolicy(
90 |         action_dim=env.action_space.shape[0],
91 |         reparameterize=algorithm_params['reparameterize'],
92 |         **policy_params)
93 | 
94 |     sampler.initialize(env, policy, replay_pool)
95 | 
96 |     algorithm = SAC(**algorithm_params)
97 | 
98 |     tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
99 |     tf_config.gpu_options.allow_growth = True  # may need if using GPU
100 |     with tf.Session(config=tf_config):
101 |         algorithm.build(
102 |             env=env,
103 |             policy=policy,
104 |             q_function=q_function,
105 |             q_function2=q_function2,
106 |             value_function=value_function,
107 |             target_value_function=target_value_function)
108 | 
109 |         for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
110 |             logz.log_tabular('Iteration', epoch)
111 |             for k, v in algorithm.get_statistics().items():
112 |                 logz.log_tabular(k, v)
113 |             for k, v in replay_pool.get_statistics().items():
114 |                 logz.log_tabular(k, v)
115 |             for k, v in sampler.get_statistics().items():
116 |                 logz.log_tabular(k, v)
117 |             logz.dump_tabular()
118 | 
119 | def main():
120 |     parser = argparse.ArgumentParser()
121 |     parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
122 |     parser.add_argument('--exp_name', type=str, default=None)
123 |     parser.add_argument('--seed', type=int, default=1)
124 |     parser.add_argument('--n_experiments', '-e', type=int, default=1)
125 |     args = parser.parse_args()
126 | 
127 |     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
128 | 
129 |     if not (os.path.exists(data_path)):
130 |         os.makedirs(data_path)
131 |     logdir = 'sac_' + args.env_name + '_' + args.exp_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
132 |     logdir = os.path.join(data_path, logdir)
133 | 
134 |     processes = []
135 | 
136 |     for e in range(args.n_experiments):
137 |         seed = args.seed + 10*e
138 |         print('Running experiment with seed %d'%seed)
139 | 
140 |         def train_func():
141 |             train_SAC(
142 |                 env_name=args.env_name,
143 |                 exp_name=args.exp_name,
144 |                 seed=seed,
145 |                 logdir=os.path.join(logdir, '%d' % seed),
146 |                 )
147 |         # # Awkward hacky process runs, because Tensorflow does not like
148 |         # # repeatedly calling train_SAC in the same thread.
149 |         p = Process(target=train_func, args=tuple())
150 |         p.start()
151 |         processes.append(p)
152 |         # if you comment in the line below, then the loop will block
153 |         # until this process finishes
154 |         # p.join()
155 | 
156 |     for p in processes:
157 |         p.join()
158 | 
159 | if __name__ == '__main__':
160 |     main()
161 | 
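From the command line this script is typically launched along the lines of `python train_mujoco.py --env_name HalfCheetah-v2 --exp_name myrun -e 3`, which spawns one process per seed. For a quick single-seed smoke test one might also call train_SAC directly and skip the multiprocessing wrapper; the log directory below is an arbitrary example path, not one the script creates for you:

    from train_mujoco import train_SAC

    train_SAC(
        env_name='HalfCheetah-v2',
        exp_name='smoke_test',
        seed=1,
        logdir='data/sac_HalfCheetah-v2_smoke_test/1',
    )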
--------------------------------------------------------------------------------
/hw5/sac/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import tensorflow as tf
4 | 
5 | 
6 | class Logger:
7 |     def __init__(self, log_dir):
8 |         self._summary_writer = tf.summary.FileWriter(
9 |             os.path.expanduser(log_dir))
10 | 
11 |         self._rows = []
12 | 
13 |     def log_value(self, tag, value, step):
14 |         summary = tf.Summary()
15 |         summary.value.add(tag=tag, simple_value=value)
16 |         self._summary_writer.add_summary(summary, step)
17 | 
18 |         self._rows.append("{tag:.<25} {value}".format(tag=tag, value=value))
19 | 
20 |     def log_values(self, dictionary, step):
21 |         for tag, value in dictionary.items():
22 |             self.log_value(tag, value, step)
23 | 
24 |     def flush(self):
25 |         self._summary_writer.flush()
26 |         print(format("", "_<25"))
27 |         print("\n".join(self._rows))
28 | 
29 |         self._rows = []
30 | 
31 | 
32 | class ReplayPool:
33 |     def __init__(self, max_size, fields):
34 |         max_size = int(max_size)
35 |         self._max_size = max_size
36 | 
37 |         self.fields = {}
38 |         self.field_names = []
39 |         self.add_fields(fields)
40 | 
41 |         self._pointer = 0
42 |         self._size = 0
43 | 
44 |     @property
45 |     def size(self):
46 |         return self._size
47 | 
48 |     def add_fields(self, fields):
49 |         self.fields.update(fields)
50 |         self.field_names += list(fields.keys())
51 | 
52 |         for field_name, field_attrs in fields.items():
53 |             field_shape = [self._max_size] + list(field_attrs['shape'])
54 |             initializer = field_attrs.get('initializer', np.zeros)
55 |             setattr(self, field_name, initializer(field_shape))
56 | 
57 |     def _advance(self, count=1):
58 |         self._pointer = (self._pointer + count) % self._max_size
59 |         self._size = min(self._size + count, self._max_size)
60 | 
61 |     def add_sample(self, **kwargs):
62 |         self.add_samples(1, **kwargs)
63 | 
64 |     def add_samples(self, num_samples=1, **kwargs):
65 |         for field_name in self.field_names:
66 |             idx = np.arange(self._pointer,
67 |                             self._pointer + num_samples) % self._max_size
68 |             getattr(self, field_name)[idx] = kwargs.pop(field_name)
69 | 
70 |         self._advance(num_samples)
71 | 
72 |     def random_indices(self, batch_size):
73 |         if self._size == 0: return []
74 |         return np.random.randint(0, self._size, batch_size)
75 | 
76 |     def random_batch(self, batch_size, field_name_filter=None):
77 |         random_indices = self.random_indices(batch_size)
78 |         return self.batch_by_indices(random_indices, field_name_filter)
79 | 
80 |     def batch_by_indices(self, indices, field_name_filter=None):
81 |         field_names = self.field_names
82 |         if field_name_filter is not None:
83 |             field_names = [
84 |                 field_name for field_name in field_names
85 |                 if field_name_filter(field_name)
86 |             ]
87 | 
88 |         return {
89 |             field_name: getattr(self, field_name)[indices]
90 |             for field_name in field_names
91 |         }
92 | 
93 |     def get_statistics(self):
94 |         return {
95 |             'PoolSize': self._size,
96 |         }
97 | 
98 | 
99 | class SimpleReplayPool(ReplayPool):
100 |     def __init__(self, observation_shape, action_shape, *args, **kwargs):
101 |         self._observation_shape = observation_shape
102 |         self._action_shape = action_shape
103 | 
104 |         fields = {
105 |             'observations': {
106 |                 'shape': self._observation_shape,
107 |                 'dtype': 'float32'
108 |             },
109 |             # It's a bit memory inefficient to save the observations twice,
110 |             # but it makes the code *much* easier since you no longer have
111 |             # to worry about termination conditions.
112 |             'next_observations': {
113 |                 'shape': self._observation_shape,
114 |                 'dtype': 'float32'
115 |             },
116 |             'actions': {
117 |                 'shape': self._action_shape,
118 |                 'dtype': 'float32'
119 |             },
120 |             'rewards': {
121 |                 'shape': [],
122 |                 'dtype': 'float32'
123 |             },
124 |             # self.terminals[i] = a terminal was received at time i
125 |             'terminals': {
126 |                 'shape': [],
127 |                 'dtype': 'bool'
128 |             },
129 |         }
130 | 
131 |         super(SimpleReplayPool, self).__init__(*args, fields=fields, **kwargs)
132 | 
133 | 
134 | class Sampler(object):
135 |     def __init__(self, max_episode_length, prefill_steps):
136 |         self._max_episode_length = max_episode_length
137 |         self._prefill_steps = prefill_steps
138 | 
139 |         self.env = None
140 |         self.policy = None
141 |         self.pool = None
142 | 
143 |     def initialize(self, env, policy, pool):
144 |         self.env = env
145 |         self.policy = policy
146 |         self.pool = pool
147 | 
148 |         class UniformPolicy:
149 |             def __init__(self, action_dim):
150 |                 self._action_dim = action_dim
151 | 
152 |             def eval(self, _):
153 |                 return np.random.uniform(-1, 1, self._action_dim)
154 | 
155 |         uniform_exploration_policy = UniformPolicy(env.action_space.shape[0])
156 |         for _ in range(self._prefill_steps):
157 |             self.sample(uniform_exploration_policy)
158 | 
159 |     def set_policy(self, policy):
160 |         self.policy = policy
161 | 
162 |     def sample(self):
163 |         raise NotImplementedError
164 | 
165 |     def random_batch(self, batch_size):
166 |         return self.pool.random_batch(batch_size)
167 | 
168 |     def terminate(self):
169 |         self.env.terminate()
170 | 
171 | 
172 | class SimpleSampler(Sampler):
173 |     def __init__(self, **kwargs):
174 |         super(SimpleSampler, self).__init__(**kwargs)
175 | 
176 |         self._episode_length = 0
177 |         self._episode_return = 0
178 |         self._last_episode_return = 0
179 |         self._max_episode_return = -np.inf
180 |         self._n_episodes = 0
181 |         self._current_observation = None
182 |         self._total_samples = 0
183 | 
184 |     def sample(self, policy=None):
185 |         policy = self.policy if policy is None else policy
186 |         if self._current_observation is None:
187 |             self._current_observation = self.env.reset()
188 | 
189 |         action = policy.eval(self._current_observation)
190 |         next_observation, reward, terminal, info = self.env.step(action)
191 |         self._episode_length += 1
192 |         self._episode_return += reward
193 |         self._total_samples += 1
194 | 
195 |         self.pool.add_sample(
196 |             observations=self._current_observation,
197 |             actions=action,
198 |             rewards=reward,
199 |             terminals=terminal,
200 |             next_observations=next_observation)
201 | 
202 |         if terminal or self._episode_length >= self._max_episode_length:
203 |             self._current_observation = self.env.reset()
204 |             self._episode_length = 0
205 |             self._max_episode_return = max(self._max_episode_return,
206 |                                            self._episode_return)
207 |             self._last_episode_return = self._episode_return
208 | 
209 |             self._episode_return = 0
210 |             self._n_episodes += 1
211 | 
212 |         else:
213 |             self._current_observation = next_observation
214 | 
215 |     def get_statistics(self):
216 |         statistics = {
217 |             'MaxEpReturn': self._max_episode_return,
218 |             'LastEpReturn': self._last_episode_return,
219 |             'Episodes': self._n_episodes,
220 |             'TimestepsSoFar': self._total_samples,
221 |         }
222 | 
223 |         return statistics
224 | 
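A tiny self-contained illustration of the replay-pool API defined above, assuming this file is importable as utils and using arbitrary example shapes and values:

    import numpy as np
    from utils import SimpleReplayPool  # the class defined above

    pool = SimpleReplayPool(observation_shape=(3,), action_shape=(2,), max_size=5)

    # Add more samples than max_size, so the oldest entries get overwritten in place.
    for t in range(7):
        pool.add_sample(
            observations=np.ones(3) * t,
            actions=np.zeros(2),
            rewards=float(t),
            terminals=False,
            next_observations=np.ones(3) * (t + 1))

    print(pool.size)                    # -> 5, capped at max_size by the ring-buffer pointer
    batch = pool.random_batch(batch_size=4)
    print(sorted(batch.keys()))         # -> ['actions', 'next_observations', 'observations', 'rewards', 'terminals']
    print(batch['observations'].shape)  # -> (4, 3)

SimpleSampler then glues the environment, policy, and pool together: every call to sample() takes one environment step, stores the transition in the pool, and tracks episode returns for the statistics that train_mujoco.py logs.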
--------------------------------------------------------------------------------
/project/project_assignment.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework/dde95f4e126e14a343a53efe25d1c2205854ea3a/project/project_assignment.pdf
--------------------------------------------------------------------------------