├── hw4 ├── data │ ├── mb_mpc_HalfCheetah-v1_25-06-2018_14-51-34 │ │ └── log.txt │ ├── mb_mpc_HalfCheetah-v1_25-06-2018_15-01-46 │ │ └── log.txt │ ├── mb_mpc_HalfCheetah-v1_25-06-2018_15-22-28 │ │ └── log.txt │ ├── mb_mpc_HalfCheetah-v1_25-06-2018_15-23-41 │ │ └── log.txt │ └── mb_mpc_HalfCheetah-v1_25-06-2018_15-28-11 │ │ └── log.txt ├── HW4.pdf ├── cheetah_env.py ├── controllers.py ├── cost_functions.py ├── logz.py ├── plot.py ├── dynamics.py └── main.py ├── hw1 ├── HW1.pdf ├── .DS_Store ├── experts │ ├── Ant-v1.pkl │ ├── Hopper-v1.pkl │ ├── Humanoid-v1.pkl │ ├── Reacher-v1.pkl │ ├── Walker2d-v1.pkl │ └── HalfCheetah-v1.pkl ├── DAgger.bash ├── demo.bash ├── README.md ├── load_policy.py ├── model.py ├── run_expert.py ├── DAgger.py └── tf_util.py ├── hw2 ├── HW2.pdf ├── hw2_final.pdf ├── logz.py ├── plot.py └── TestNoteBook.ipynb ├── hw3 ├── HW3.pdf ├── README ├── run_dqn_ram.py ├── run_dqn_atari.py ├── atari_wrappers.py ├── Testing.ipynb ├── dqn_utils.py └── dqn.py ├── sp17_hw ├── hw1 │ ├── experts │ │ ├── Ant-v1.pkl │ │ ├── Hopper-v1.pkl │ │ ├── Reacher-v1.pkl │ │ ├── Humanoid-v1.pkl │ │ ├── Walker2d-v1.pkl │ │ └── HalfCheetah-v1.pkl │ ├── demo.bash │ ├── README.md │ ├── run_expert.py │ └── load_policy.py ├── hw3 │ ├── README │ ├── run_dqn_ram.py │ ├── run_dqn_atari.py │ ├── atari_wrappers.py │ ├── dqn.py │ └── dqn_utils.py ├── hw4 │ ├── plot_learning_curves.py │ ├── logz.py │ ├── homework.md │ └── main.py └── hw2 │ ├── discrete_env.py │ └── frozen_lake.py ├── LICENSE ├── .gitignore └── README.md /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_14-51-34/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_15-01-46/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_15-22-28/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_15-23-41/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_15-28-11/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw1/HW1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/HW1.pdf -------------------------------------------------------------------------------- /hw2/HW2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw2/HW2.pdf -------------------------------------------------------------------------------- /hw3/HW3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw3/HW3.pdf -------------------------------------------------------------------------------- /hw4/HW4.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw4/HW4.pdf -------------------------------------------------------------------------------- /hw1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/.DS_Store -------------------------------------------------------------------------------- /hw2/hw2_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw2/hw2_final.pdf -------------------------------------------------------------------------------- /hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- 
/sp17_hw/hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /hw1/DAgger.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper Ant HalfCheetah Humanoid Reacher Walker2d 4 | do 5 | python DAgger.py experts/$e-v1.pkl $e-v2 --num_rollouts=5 6 | done 7 | -------------------------------------------------------------------------------- /hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper Ant HalfCheetah Humanoid Reacher Walker2d 4 | do 5 | python run_expert.py experts/$e-v1.pkl $e-v2 --num_rollouts=5 6 | done 7 | -------------------------------------------------------------------------------- /sp17_hw/hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v1 Ant-v1 HalfCheetah-v1 Humanoid-v1 Reacher-v1 Walker2d-v1 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/f17docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /sp17_hw/hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym 4 | 5 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 6 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 7 | 8 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 9 | 10 | In `experts/`, the provided expert policies are: 11 | * Ant-v1.pkl 12 | * HalfCheetah-v1.pkl 13 | * Hopper-v1.pkl 14 | * Humanoid-v1.pkl 15 | * Reacher-v1.pkl 16 | * Walker2d-v1.pkl 17 | 18 | The name of the pickle file corresponds to the name of the gym environment. 
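As a quick illustration of how these policies are used, the sketch below loads one expert and rolls it out for a single episode, following the same pattern as `run_expert.py` (the `tf_util.initialize()` call and the `policy_fn(obs[None, :])` query are taken from that script; `Hopper-v1` is just one example environment): ~~~~ import gym import tensorflow as tf import tf_util import load_policy policy_fn = load_policy.load_policy('experts/Hopper-v1.pkl') with tf.Session(): tf_util.initialize() env = gym.make('Hopper-v1') # the env name matches the pickle file name obs = env.reset() done = False while not done: action = policy_fn(obs[None, :]) # expert action for the current observation obs, reward, done, _ = env.step(action) ~~~~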
19 | -------------------------------------------------------------------------------- /sp17_hw/hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym 4 | 5 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 6 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 7 | 8 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 9 | 10 | In `experts/`, the provided expert policies are: 11 | * Ant-v1.pkl 12 | * HalfCheetah-v1.pkl 13 | * Hopper-v1.pkl 14 | * Humanoid-v1.pkl 15 | * Reacher-v1.pkl 16 | * Walker2d-v1.pkl 17 | 18 | The name of the pickle file corresponds to the name of the gym environment. 19 | -------------------------------------------------------------------------------- /sp17_hw/hw4/plot_learning_curves.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = argparse.ArgumentParser() 3 | parser.add_argument("expdir", help="experiment dir, e.g., /tmp/experiments") 4 | args = parser.parse_args() 5 | 6 | from pylab import * 7 | import os 8 | from os.path import join 9 | 10 | dirnames = os.listdir(args.expdir) 11 | 12 | fig, axes = subplots(4) 13 | for dirname in dirnames: 14 | print(dirname) 15 | A = np.genfromtxt(join(args.expdir, dirname, 'log.txt'),delimiter='\t',dtype=None, names=True) 16 | # axes[0].plot(scipy.signal.savgol_filter(A['EpRewMean'] , 21, 3), '-x') 17 | x = A['TimestepsSoFar'] 18 | axes[0].plot(x, A['EpRewMean'], '-x') 19 | axes[1].plot(x, A['KLOldNew'], '-x') 20 | axes[2].plot(x, A['Entropy'], '-x') 21 | axes[3].plot(x, A['EVBefore'], '-x') 22 | legend(dirnames,loc='best').draggable() 23 | axes[0].set_ylabel("EpRewMean") 24 | axes[1].set_ylabel("KLOldNew") 25 | axes[2].set_ylabel("Entropy") 26 | axes[3].set_ylabel("EVBefore") 27 | axes[3].set_ylim(-1,1) 28 | axes[-1].set_xlabel("Iterations") 29 | show() 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 berkeleydeeprlcourse 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /hw4/cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnvNew(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 8 | utils.EzPickle.__init__(self) 9 | 10 | def _step(self, action): 11 | xposbefore = self.model.data.qpos[0, 0] 12 | self.do_simulation(action, self.frame_skip) 13 | xposafter = self.model.data.qpos[0, 0] 14 | ob = self._get_obs() 15 | reward_ctrl = - 0.1 * np.square(action).sum() 16 | reward_run = (xposafter - xposbefore)/self.dt 17 | reward = reward_ctrl + reward_run 18 | done = False 19 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 20 | 21 | def _get_obs(self): 22 | return np.concatenate([ 23 | self.model.data.qpos.flat[1:], 24 | self.model.data.qvel.flat, 25 | self.get_body_com("torso").flat, 26 | # self.get_body_comvel("torso").flat, 27 | ]) 28 | 29 | def reset_model(self): 30 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 31 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 32 | self.set_state(qpos, qvel) 33 | return self._get_obs() 34 | 35 | def viewer_setup(self): 36 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | hw1/behavior_cloning/ 2 | hw1/DAgger/ 3 | hw2/data/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # IPython Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | -------------------------------------------------------------------------------- /sp17_hw/hw2/discrete_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env, spaces 4 | from gym.utils import seeding 5 | 6 | def categorical_sample(prob_n, np_random): 7 | """ 8 | Sample from categorical distribution 9 | Each row specifies class probabilities 10 | """ 11 | prob_n = np.asarray(prob_n) 12 | csprob_n = np.cumsum(prob_n) 13 | return (csprob_n > np_random.rand()).argmax() 14 | 15 | 16 | class DiscreteEnv(Env): 17 | 18 | """ 19 | Has the following members 20 | - nS: number of states 21 | - nA: number of actions 22 | - P: transitions (*) 23 | - isd: initial state distribution (**) 24 | 25 | (*) dictionary dict of dicts of lists, where 26 | P[s][a] == [(probability, nextstate, reward, done), ...] 
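    For illustration, a hypothetical 2-state, 1-action MDP could be encoded as
    P = {0: {0: [(0.9, 0, 0.0, False), (0.1, 1, 1.0, True)]}, 1: {0: [(1.0, 1, 0.0, True)]}}
    with isd = [1.0, 0.0].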
27 | (**) list or array of length nS 28 | 29 | 30 | """ 31 | def __init__(self, nS, nA, P, isd): 32 | self.P = P 33 | self.isd = isd 34 | self.lastaction=None # for rendering 35 | self.nS = nS 36 | self.nA = nA 37 | 38 | self.action_space = spaces.Discrete(self.nA) 39 | self.observation_space = spaces.Discrete(self.nS) 40 | 41 | self._seed() 42 | self._reset() 43 | 44 | def _seed(self, seed=None): 45 | self.np_random, seed = seeding.np_random(seed) 46 | return [seed] 47 | 48 | def _reset(self): 49 | self.s = categorical_sample(self.isd, self.np_random) 50 | self.lastaction=None 51 | return self.s 52 | 53 | def _step(self, a): 54 | transitions = self.P[self.s][a] 55 | i = categorical_sample([t[0] for t in transitions], self.np_random) 56 | p, s, r, d= transitions[i] 57 | self.s = s 58 | self.lastaction=a 59 | return (s, r, d, {"prob" : p}) 60 | -------------------------------------------------------------------------------- /hw4/controllers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from cost_functions import trajectory_cost_fn 3 | import time 4 | 5 | class Controller(): 6 | def __init__(self): 7 | pass 8 | 9 | # Get the appropriate action(s) for this state(s) 10 | def get_action(self, state): 11 | pass 12 | 13 | 14 | class RandomController(Controller): 15 | def __init__(self, env): 16 | self.env = env 17 | 18 | def get_action(self, state): 19 | """ Your code should randomly sample an action uniformly from the action space """ 20 | return self.env.action_space.sample() 21 | 22 | 23 | class MPCcontroller(Controller): 24 | """ Controller built using the MPC method outlined in https://arxiv.org/abs/1708.02596 """ 25 | def __init__(self, 26 | env, 27 | dyn_model, 28 | horizon=5, 29 | cost_fn=None, 30 | num_simulated_paths=10, 31 | ): 32 | self.env = env 33 | self.dyn_model = dyn_model 34 | self.horizon = horizon 35 | self.cost_fn = cost_fn 36 | self.num_simulated_paths = num_simulated_paths 37 | 38 | def get_action(self, state): 39 | """ Note: be careful to batch your simulations through the model for speed """ 40 | 41 | state_batch, states, next_states, actions = [], [], [], [] 42 | 43 | #state batches have dimension (K, dim(state)) 44 | for _ in range(self.num_simulated_paths): 45 | state_batch.append(state) 46 | 47 | for _ in range(self.horizon): 48 | action = [] 49 | for _ in range(self.num_simulated_paths): 50 | action.append(self.env.action_space.sample()) 51 | actions.append(action) 52 | states.append(state_batch) 53 | #use batch for speed 54 | state_batch = self.dyn_model.predict(np.array(state_batch), np.array(action)) 55 | 56 | next_states.append(state_batch) 57 | 58 | costs = trajectory_cost_fn(self.cost_fn, np.array(states), np.array(actions), np.array(next_states)) 59 | j_star = np.argmin(np.array(costs)) 60 | return actions[0][j_star] 61 | 62 | -------------------------------------------------------------------------------- /hw4/cost_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | #======================================================== 5 | # 6 | # Environment-specific cost functions: 7 | # 8 | 9 | def cheetah_cost_fn(state, action, next_state): 10 | if len(state.shape) > 1: 11 | 12 | heading_penalty_factor=10 13 | scores=np.zeros((state.shape[0],)) 14 | 15 | #dont move front shin back so far that you tilt forward 16 | front_leg = state[:,5] 17 | my_range = 0.2 18 | scores[front_leg>=my_range] += heading_penalty_factor 19 | 20 | 
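        # The same heading penalty is applied below when the front shin or front foot
        # angle is >= 0; the final term subtracts the change in state[:, 17] (the torso
        # x-position appended by cheetah_env.py's _get_obs) divided by 0.01, so forward
        # progress lowers the cost.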
front_shin = state[:,6] 21 | my_range = 0 22 | scores[front_shin>=my_range] += heading_penalty_factor 23 | 24 | front_foot = state[:,7] 25 | my_range = 0 26 | scores[front_foot>=my_range] += heading_penalty_factor 27 | 28 | scores-= (next_state[:,17] - state[:,17]) / 0.01 #+ 0.1 * (np.sum(action**2, axis=1)) 29 | return scores 30 | 31 | heading_penalty_factor=10 32 | score = 0 33 | 34 | #dont move front shin back so far that you tilt forward 35 | front_leg = state[5] 36 | my_range = 0.2 37 | if front_leg>=my_range: 38 | score += heading_penalty_factor 39 | 40 | front_shin = state[6] 41 | my_range = 0 42 | if front_shin>=my_range: 43 | score += heading_penalty_factor 44 | 45 | front_foot = state[7] 46 | my_range = 0 47 | if front_foot>=my_range: 48 | score += heading_penalty_factor 49 | 50 | score -= (next_state[17] - state[17]) / 0.01 #+ 0.1 * (np.sum(action**2)) 51 | return score 52 | 53 | #======================================================== 54 | # 55 | # Cost function for a whole trajectory: 56 | # 57 | 58 | def trajectory_cost_fn(cost_fn, states, actions, next_states): 59 | trajectory_cost = 0 60 | for i in range(len(actions)): 61 | trajectory_cost += cost_fn(states[i], actions[i], next_states[i]) 62 | return trajectory_cost -------------------------------------------------------------------------------- /sp17_hw/hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | 19 | def main(): 20 | import argparse 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('expert_policy_file', type=str) 23 | parser.add_argument('envname', type=str) 24 | parser.add_argument('--render', action='store_true') 25 | parser.add_argument("--max_timesteps", type=int) 26 | parser.add_argument('--num_rollouts', type=int, default=20, 27 | help='Number of expert roll outs') 28 | args = parser.parse_args() 29 | 30 | print('loading and building expert policy') 31 | policy_fn = load_policy.load_policy(args.expert_policy_file) 32 | print('loaded and built') 33 | 34 | with tf.Session(): 35 | tf_util.initialize() 36 | 37 | import gym 38 | env = gym.make(args.envname) 39 | max_steps = args.max_timesteps or env.spec.timestep_limit 40 | 41 | returns = [] 42 | observations = [] 43 | actions = [] 44 | for i in range(args.num_rollouts): 45 | print('iter', i) 46 | obs = env.reset() 47 | done = False 48 | totalr = 0. 
49 | steps = 0 50 | while not done: 51 | action = policy_fn(obs[None,:]) 52 | observations.append(obs) 53 | actions.append(action) 54 | obs, r, done, _ = env.step(action) 55 | totalr += r 56 | steps += 1 57 | if args.render: 58 | env.render() 59 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 60 | if steps >= max_steps: 61 | break 62 | returns.append(totalr) 63 | 64 | print('returns', returns) 65 | print('mean return', np.mean(returns)) 66 | print('std of return', np.std(returns)) 67 | 68 | expert_data = {'observations': np.array(observations), 69 | 'actions': np.array(actions)} 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 
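        # The expert's Standardizer stores the running mean E[x] and mean-square E[x^2] of
        # observations; the stdev below is sqrt(max(0, E[x^2] - E[x]^2)), and observations are
        # whitened as (obs - mean) / (stdev + 1e-6) before being fed to the hidden layers.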
33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /sp17_hw/hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 
33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from sklearn.utils import shuffle 4 | 5 | class Model(object): 6 | def __init__(self, input_data, output_data, name, type, batch_size = 64): 7 | self.name = type + "/" + name 8 | self.batch_size = batch_size 9 | self.input_data = input_data 10 | self.output_data = np.reshape(output_data, (output_data.shape[0], output_data.shape[2])) 11 | self.sess = tf.Session() 12 | 13 | self.input_shape = [self.batch_size, self.input_data.shape[-1]] 14 | self.output_shape = [self.batch_size, self.output_data.shape[-1]] 15 | 16 | self.input_placeholder = tf.placeholder(tf.float32, shape = self.input_shape) 17 | self.output_placeholder_true = tf.placeholder(tf.float32, shape = self.output_shape) 18 | self.output_placeholder_false = self.build_model(self.input_placeholder) 19 | 20 | self.loss = tf.reduce_mean(tf.nn.l2_loss(self.output_placeholder_true - self.output_placeholder_false)) 21 | self.loss_summary = tf.summary.scalar("loss", self.loss) 22 | 23 | def build_model(self, input_placeholder): 24 | x = tf.layers.dense(input_placeholder, 64, activation = tf.nn.tanh) 25 | x = tf.layers.dense(x, 32, activation = tf.nn.tanh) 26 | x = tf.layers.dense(x, self.output_shape[-1]) 27 | return x 28 | 29 | def train(self, epochs = 20, train_data = None, test_data = None, number = None): 30 | if train_data is None and test_data is None : 31 | train_data, test_data = shuffle(self.input_data, self.output_data, random_state = 0) 32 | else: 33 | test_data = np.reshape(test_data, (test_data.shape[0], test_data.shape[2])) 34 | 35 | optimizer = tf.train.AdamOptimizer().minimize(self.loss) 36 | 37 | saver = tf.train.Saver() 38 | 39 | batch_idxs = len(train_data) // self.batch_size 40 | 41 | if number is None: 42 | writer = tf.summary.FileWriter(self.name) 43 | else: 44 | writer = tf.summary.FileWriter(self.name + str(number)) 45 | writer.add_graph(self.sess.graph) 46 | 47 | init_op = tf.global_variables_initializer() 48 | self.sess.run(init_op) 49 | 50 | for epoch in range(epochs): 51 | for idx in range(batch_idxs): 52 | batch_train = train_data[idx * self.batch_size : (idx + 1) * self.batch_size] 53 | 
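                # Despite the names, train_data holds observations and test_data holds the
                # corresponding target actions, so each batch pairs an observation slice with
                # its action slice by index.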
batch_value = test_data[idx * self.batch_size : (idx + 1) * self.batch_size] 54 | feed_train = {self.input_placeholder : batch_train, self.output_placeholder_true : batch_value} 55 | self.sess.run(optimizer, feed_dict = feed_train) 56 | 57 | if idx % 20 == 0: 58 | loss_value = self.sess.run(self.loss_summary, feed_dict = feed_train) 59 | writer.add_summary(loss_value, epoch * batch_idxs + idx) 60 | 61 | saver.save(self.sess, self.name + "/behavior_cloning_model") 62 | 63 | def sample(self, input): 64 | output = self.sess.run(self.output_placeholder_false, feed_dict = {self.input_placeholder : np.repeat(input[None, :], 64, axis = 0)}) 65 | return output[0] 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CS 294-112 homework (offered in Fall of 2017) 2 | 3 | This is my github repo for homework for [CS294](http://rail.eecs.berkeley.edu/deeprlcourse-fa17/index.html) (offered in Fall 2017). I covered this course remotely (using lecture notes and videos) and implemented the coding parts of the homework. Below are synopses for what I implemented for each homework assignment. 4 | 5 | *Disclaimer: this code is for **educational purposes** only. Students taking current iterations of this course should refrain from copying this code, as that would breach academic integrity and hamper their own education.* 6 | 7 | ## Dependencies 8 | 9 | * [Tensorflow 1.4](https://www.tensorflow.org/) 10 | * [Numpy 1.13.3](http://www.numpy.org/) 11 | * [Gym 0.10.5](https://gym.openai.com/) Gym 0.9.5 was used for homework 3. 12 | * [Mujoco 1.5](http://www.mujoco.org/) 13 | 14 | Note that some of these dependencies were not released at the time of this course. Furthermore, the starter code has been modified to reflect changes in OpenAI Gym's documentation. 15 | 16 | ## Homework 1 17 | 18 | The course, up to this point, has covered more basic supervised learning. I implemented BC (behavior cloning) and DAgger (Dataset Aggregation), which improved the results (slightly). I also experimented with various hyperparameters. 19 | 20 | ## Homework 2 21 | 22 | I implemented the policy gradient algorithm and ran some tests on various environments. I played with the hyperparameters and saw that my implementation caused the agent's reward to converge to the theoretical value. I also implemented GAE (generalized advantage estimation) and compared its results. 23 | 24 | ## Homework 3 25 | 26 | I implemented the DQN algorithm and ran it on the Atari Pong simulator. I experimented with different hyperparameters and saw that my model converged to the perfect value. 27 | 28 | ## Homework 4 29 | 30 | I implemented the MPC algorithm. 
However, I was unable to run the provided HalfCheetahEnvNew as it threw 31 | 32 | ~~~~ 33 | 'mujoco_py.cymj.PyMjModel' object has no attribute 'data' 34 | ~~~~ 35 | 36 | Furthermore, when I attempted to work with the given 'HalfCheetah-v2' environment, which (in terms of raw code) is isomorphic to HalfCheetahEnvNew, the state dimensions representing 37 | 38 | ~~~~ 39 | - rootx slider position (m) 40 | - rootz slider position (m) 41 | - rooty hinge angle (rad) 42 | - bthigh hinge angle (rad) 43 | - bshin hinge angle (rad) 44 | - bfoot hinge angle (rad) 45 | - fthigh hinge angle (rad) 46 | - fshin hinge angle (rad) 47 | - ffoot hinge angle (rad) 48 | - rootx slider velocity (m/s) 49 | - rootz slider velocity (m/s) 50 | - rooty hinge angular velocity (rad/s) 51 | - bthigh hinge angular velocity (rad/s) 52 | - bshin hinge angular velocity (rad/s) 53 | - bfoot hinge angular velocity (rad/s) 54 | - fthigh hinge angular velocity (rad/s) 55 | - fshin hinge angular velocity (rad/s) 56 | - ffoot hinge angular velocity (rad/s) 57 | ~~~~ 58 | 59 | aren't correctly reflected in the cost function (the comments about what each component represents don't match up). Moreover, for some strange reason, all HalfCheetah environments load 17 observation variables, not 18. -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | import model 19 | 20 | def main(): 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('expert_policy_file', type=str) 24 | parser.add_argument('envname', type=str) 25 | parser.add_argument('--render', action='store_true') 26 | parser.add_argument("--max_timesteps", type=int) 27 | parser.add_argument('--num_rollouts', type=int, default=20, 28 | help='Number of expert roll outs') 29 | args = parser.parse_args() 30 | 31 | print('loading and building expert policy') 32 | policy_fn = load_policy.load_policy(args.expert_policy_file) 33 | print('loaded and built') 34 | 35 | with tf.Session(): 36 | tf_util.initialize() 37 | 38 | import gym 39 | env = gym.make(args.envname) 40 | max_steps = args.max_timesteps or env.spec.timestep_limit 41 | 42 | returns = [] 43 | observations = [] 44 | actions = [] 45 | for i in range(args.num_rollouts): 46 | print('iter', i) 47 | obs = env.reset() 48 | done = False 49 | totalr = 0.
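            # Roll out the expert: query policy_fn on the current observation, step the
            # environment, and record each (observation, action) pair for behavioral cloning.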
50 | steps = 0 51 | while not done: 52 | action = policy_fn(obs[None,:]) 53 | observations.append(obs) 54 | actions.append(action) 55 | obs, r, done, _ = env.step(action) 56 | totalr += r 57 | steps += 1 58 | if args.render: 59 | env.render() 60 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 61 | if steps >= max_steps: 62 | break 63 | returns.append(totalr) 64 | 65 | print('returns', returns) 66 | print('mean return', np.mean(returns)) 67 | print('std of return', np.std(returns)) 68 | 69 | expert_data = {'observations': np.array(observations), 70 | 'actions': np.array(actions)} 71 | 72 | print('observation shape', expert_data['observations'].shape) 73 | print('action shape', expert_data['actions'].shape) 74 | 75 | our_model = model.Model(expert_data['observations'], expert_data['actions'], args.envname[:-3], "behavior_cloning") 76 | our_model.train() 77 | 78 | for i in range(5): 79 | obs = env.reset() 80 | done = False 81 | totalr = 0. 82 | steps = 0 83 | while not done: 84 | action = our_model.sample(obs) 85 | obs, r, done, _ = env.step(action) 86 | totalr += r 87 | steps += 1 88 | env.render() 89 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 90 | if steps >= max_steps: 91 | break 92 | 93 | if __name__ == '__main__': 94 | main() 95 | -------------------------------------------------------------------------------- /sp17_hw/hw4/logz.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 4 | Assumes that each diagnostic gets logged each iteration 5 | 6 | Call logz.configure_output_dir() to start logging to a 7 | tab-separated-values file (some_folder_name/log.txt) 8 | 9 | To load the learning curves, you can do, for example 10 | 11 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 12 | A['EpRewMean'] 13 | 14 | """ 15 | 16 | import os.path as osp, shutil, time, atexit, os, subprocess 17 | 18 | color2num = dict( 19 | gray=30, 20 | red=31, 21 | green=32, 22 | yellow=33, 23 | blue=34, 24 | magenta=35, 25 | cyan=36, 26 | white=37, 27 | crimson=38 28 | ) 29 | 30 | def colorize(string, color, bold=False, highlight=False): 31 | attr = [] 32 | num = color2num[color] 33 | if highlight: num += 10 34 | attr.append(str(num)) 35 | if bold: attr.append('1') 36 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 37 | 38 | class G: 39 | output_dir = None 40 | output_file = None 41 | first_row = True 42 | log_headers = [] 43 | log_current_row = {} 44 | 45 | def configure_output_dir(d=None): 46 | """ 47 | Set output directory to d, or to /tmp/somerandomnumber if d is None 48 | """ 49 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 50 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 51 | os.makedirs(G.output_dir) 52 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 53 | atexit.register(G.output_file.close) 54 | try: 55 | cmd = "cd %s && git diff > %s 2>/dev/null"%(osp.dirname(__file__), osp.join(G.output_dir, "a.diff")) 56 | subprocess.check_call(cmd, shell=True) # Save git diff to experiment directory 57 | except subprocess.CalledProcessError: 58 | print("configure_output_dir: not storing the git diff, probably because you're not in a git repo") 59 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 60 | 61 | def log_tabular(key, val): 62 | """ 63 | Log a value of some diagnostic 64 | Call this once for each diagnostic quantity, each iteration 65 | """ 66 | if G.first_row: 67 | G.log_headers.append(key) 68 | else: 69 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 70 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 71 | G.log_current_row[key] = val 72 | 73 | def dump_tabular(): 74 | """ 75 | Write all of the diagnostics from the current iteration 76 | """ 77 | vals = [] 78 | print("-"*37) 79 | for key in G.log_headers: 80 | val = G.log_current_row.get(key, "") 81 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 82 | else: valstr = val 83 | print("| %15s | %15s |"%(key, valstr)) 84 | vals.append(val) 85 | print("-"*37) 86 | if G.output_file is not None: 87 | if G.first_row: 88 | G.output_file.write("\t".join(G.log_headers)) 89 | G.output_file.write("\n") 90 | G.output_file.write("\t".join(map(str,vals))) 91 | G.output_file.write("\n") 92 | G.output_file.flush() 93 | G.log_current_row.clear() 94 | G.first_row=False -------------------------------------------------------------------------------- /hw1/DAgger.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import tensorflow as tf 3 | import numpy as np 4 | import tf_util 5 | import gym 6 | import load_policy 7 | import model 8 | 9 | def main(): 10 | import argparse 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('expert_policy_file', type=str) 13 | parser.add_argument('envname', type=str) 14 | parser.add_argument('--render', action='store_true') 15 | parser.add_argument("--max_timesteps", type=int) 16 | parser.add_argument('--num_rollouts', type=int, default=20, 17 | help='Number of expert roll outs') 18 | parser.add_argument('--DAgger_iter', type = int, default=5) 19 | args = parser.parse_args() 20 | 21 | print('loading and building expert policy') 22 | policy_fn = load_policy.load_policy(args.expert_policy_file) 23 | print('loaded and built') 24 | 25 | with tf.Session(): 26 | tf_util.initialize() 27 | import gym 28 | env = gym.make(args.envname) 29 | max_steps = args.max_timesteps or env.spec.timestep_limit 30 | 31 | returns = [] 32 | observations = [] 33 | actions = [] 34 | for i in range(args.num_rollouts): 35 | print('iter', i) 36 | obs = env.reset() 37 | done = False 38 | totalr = 0. 
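            # Initial data collection: roll out the expert and store its (observation, action)
            # pairs; the DAgger loop further down relabels states visited by the learned policy
            # with expert actions and aggregates them into the training set.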
39 | steps = 0 40 | while not done: 41 | action = policy_fn(obs[None,:]) 42 | observations.append(obs) 43 | actions.append(action) 44 | obs, r, done, _ = env.step(action) 45 | totalr += r 46 | steps += 1 47 | if args.render: 48 | env.render() 49 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 50 | if steps >= max_steps: 51 | break 52 | returns.append(totalr) 53 | 54 | print('returns', returns) 55 | print('mean return', np.mean(returns)) 56 | print('std of return', np.std(returns)) 57 | 58 | expert_data = {'observations': np.array(observations), 59 | 'actions': np.array(actions)} 60 | 61 | training_obs = expert_data['observations'] 62 | training_actions = expert_data['actions'] 63 | 64 | print('observation shape', expert_data['observations'].shape) 65 | print('action shape', expert_data['actions'].shape) 66 | 67 | our_model = model.Model(training_obs, training_actions, args.envname[:-3], 'DAgger') 68 | our_model.train() 69 | 70 | for i in range(args.DAgger_iter): 71 | new_obs = [] 72 | new_actions = [] 73 | obs = env.reset() 74 | done = False 75 | while not done: 76 | action = our_model.sample(obs) 77 | obs, _, done, _ = env.step(action) 78 | if args.render: 79 | env.render() 80 | corrected_action = policy_fn(obs[None, :]) 81 | new_obs.append(obs) 82 | new_actions.append(corrected_action) 83 | 84 | training_obs = np.concatenate((training_obs, obs[None, :]), axis = 0) 85 | training_actions = np.concatenate((training_actions, corrected_action[None, :]), axis = 0) 86 | our_model.train(train_data = np.array(new_obs), test_data = np.array(new_actions), number = i) 87 | 88 | if __name__ == '__main__': 89 | main() 90 | -------------------------------------------------------------------------------- /hw2/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw4/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | if osp.exists(G.output_dir): 55 | print("Log dir %s already exists! Delete it first or use a different dir"%G.output_dir) 56 | else: 57 | os.makedirs(G.output_dir) 58 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 59 | atexit.register(G.output_file.close) 60 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 61 | 62 | def log_tabular(key, val): 63 | """ 64 | Log a value of some diagnostic 65 | Call this once for each diagnostic quantity, each iteration 66 | """ 67 | if G.first_row: 68 | G.log_headers.append(key) 69 | else: 70 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 71 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 72 | G.log_current_row[key] = val 73 | 74 | def save_params(params): 75 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 76 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 77 | 78 | def pickle_tf_vars(): 79 | """ 80 | Saves tensorflow variables 81 | Requires them to be initialized first, also a default session must exist 82 | """ 83 | _dict = {v.name : v.eval() for v in tf.global_variables()} 84 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 85 | pickle.dump(_dict, f) 86 | 87 | 88 | def dump_tabular(): 89 | """ 90 | Write all of the diagnostics from the current iteration 91 | """ 92 | vals = [] 93 | key_lens = [len(key) for key in G.log_headers] 94 | max_key_len = max(15,max(key_lens)) 95 | keystr = '%'+'%d'%max_key_len 96 | fmt = "| " + keystr + "s | %15s |" 97 | n_slashes = 22 + max_key_len 98 | print("-"*n_slashes) 99 | for key in G.log_headers: 100 | val = G.log_current_row.get(key, "") 101 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 102 | else: valstr = val 103 | print(fmt%(key, valstr)) 104 | vals.append(val) 105 | print("-"*n_slashes) 106 | if G.output_file is not None: 107 | if G.first_row: 108 | G.output_file.write("\t".join(G.log_headers)) 109 | G.output_file.write("\n") 110 | G.output_file.write("\t".join(map(str,vals))) 111 | G.output_file.write("\n") 112 | G.output_file.flush() 113 | G.log_current_row.clear() 114 | G.first_row=False 115 | -------------------------------------------------------------------------------- /hw2/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
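    For example (with hypothetical directory names), two experiments could be plotted with
    custom legend titles via

        python plot.py data/test1 data/test2 --legend baseline tuned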
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /hw4/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. 
If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | 
(num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 
116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /sp17_hw/hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 
107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw4/dynamics.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import math 4 | 5 | # Predefined function to build a feedforward neural network 6 | def build_mlp(input_placeholder, 7 | output_size, 8 | scope, 9 | n_layers=2, 10 | size=500, 11 | activation=tf.tanh, 12 | output_activation=None 13 | ): 14 | out = input_placeholder 15 | with tf.variable_scope(scope): 16 | for _ in range(n_layers): 17 | out = tf.layers.dense(out, size, activation=activation) 18 | out = tf.layers.dense(out, output_size, activation=output_activation) 19 | return out 20 | 21 | class NNDynamicsModel(): 22 | def __init__(self, 23 | env, 24 | n_layers, 25 | size, 26 | activation, 27 | output_activation, 28 | normalization, 29 | batch_size, 30 | iterations, 31 | learning_rate, 32 | sess 33 | ): 34 | """ YOUR CODE HERE """ 35 | """ Note: Be careful about normalization """ 36 | self.mean_obs, self.std_obs, self.mean_deltas, self.std_deltas, self.mean_action, self.std_action = normalization 37 | self.sess = sess 38 | self.batch_size = batch_size 39 | self.iter = iterations 40 | 41 | self.state_dim = env.observation_space.shape[0] 42 | self.action_dim = env.action_space.shape[0] 43 | 44 | self.s_a_ph = tf.placeholder(shape = [None, self.state_dim + self.action_dim], name = 's_a_ph', dtype = tf.float32) 45 | self.delta_ph = tf.placeholder(shape = [None, self.state_dim], name = 'delta_ph', dtype = tf.float32) 46 | self.delta_pred = build_mlp(self.s_a_ph, self.state_dim, "dynamics", n_layers = n_layers, size = size, 47 | activation = activation, output_activation = output_activation) 48 | self.loss = tf.reduce_mean(tf.square(self.delta_pred - self.delta_ph)) 49 | self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 50 | 51 | def fit(self, data): 52 | """ 53 | Write a function to take in a dataset of (unnormalized)states, 54 | (unnormalized)actions, (unnormalized)next_states and fit the dynamics model going from normalized states, 55 | normalized actions to normalized state differences (s_t+1 - s_t) 56 | """ 57 | 58 | un_states = np.concatenate([d['observations'] for d in data]) 59 | un_actions = np.concatenate([d['actions'] for d in data]) 60 | un_states_next = np.concatenate([d['next_observations'] for d in data]) 61 | N = un_states.shape[0] 62 | indices = np.arange(N) 63 | 64 | n_states = (un_states - self.mean_obs) / (self.std_obs + 1e-7) 65 | n_deltas = ((un_states_next - un_states) - self.mean_deltas) / (self.std_deltas + 1e-7) 66 | n_actions = (un_actions - self.mean_action) / (self.std_action + 1e-7) 67 | 68 | state_action = np.concatenate((n_states, n_actions), axis = 1) 69 | 70 | for _ in range(self.iter): 71 | np.random.shuffle(indices) 72 | batches = int (math.ceil(N / self.batch_size)) 73 | for i in range(batches): 74 | start_idx = i * self.batch_size 75 | idxs = indices[start_idx : start_idx + self.batch_size] 76 | batch_s_a = state_action[idxs, :] 77 | batch_delta = 
n_deltas[idxs, :] 78 | self.sess.run(self.optimizer, feed_dict = {self.s_a_ph : batch_s_a, self.delta_ph : batch_delta}) 79 | 80 | def predict(self, states, actions): 81 | """ Write a function to take in a batch of (unnormalized) states and (unnormalized) 82 | actions and return the (unnormalized) next states as predicted by the model """ 83 | 84 | n_states = (states - self.mean_obs) / (self.std_obs + 1e-7) 85 | n_actions = (actions - self.mean_action) / (self.std_action + 1e-7) 86 | print(n_states.shape) 87 | print(n_actions.shape) 88 | state_action = np.concatenate((n_states, n_actions), axis = 1) 89 | 90 | expected_deltas = self.sess.run(self.delta_pred, feed_dict = {self.s_a_ph : state_action}) 91 | 92 | return expected_deltas * self.std_deltas + self.mean_deltas + states 93 | -------------------------------------------------------------------------------- /sp17_hw/hw4/homework.md: -------------------------------------------------------------------------------- 1 | # Homework 4 2 | 3 | In `main.py` you will find an implementation of a "vanilla" policy gradient method, applied to an MDP with a discrete action space: an episodic version of the classic "cartpole" task. First, make sure the provided code works on your computer by running `python main.py`. We recommend reading through all of the code and comments in the function `main_cartpole`, starting at the top of the function. 4 | 5 | The code computes some useful diagnostics, which you may find helpful to look at while tuning hyperparameters: 6 | 7 | - **KL[policy before update || policy after update]**. Large spikes in KL divergence mean that the optimization took a large step, and sometimes these spikes cause a collapse in performance. 8 | - **Entropy of the policy**. If entropy goes down too fast, then you may not explore enough, but if it goes down too slowly, you'll probably not reach optimal performance. 9 | - **Explained variance of the value function**. If the value function perfectly explains the returns, then it will be 1; if you get a negative result, then it's worse than predicting a constant. 10 | 11 | Software dependencies: 12 | 13 | - tensorflow 14 | - numpy + scipy (Anaconda recommended) 15 | - gym (I'm using 0.8.0, `pip install gym==0.8.0`, but old versions should work just as well) 16 | 17 | ## Problem 1 18 | 19 | Here you will modify the `main_cartpole` policy gradient implementation to work on a continuous action space, specifically, the gym environment `Pendulum-v0`. Note that in `main_cartpole`, the neural network outputs "logits" (i.e., log-probabilities up to an additive constant) that specify a categorical distribution. On the other hand, for the pendulum task, your neural network should output the mean of a Gaussian distribution, with a separate parameter vector parameterizing the log standard deviation. For example, you could use the following code: 20 | 21 | ```python 22 | 23 | mean_na = dense(h2, ac_dim, weight_init=normc_initializer(0.1)) # Mean control output 24 | logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer) # Log standard deviation 25 | 26 | sy_sampled_ac = YOUR_CODE_HERE 27 | sy_logprob_n = YOUR_CODE_HERE 28 | 29 | ``` 30 | 31 | You should also compute differential entropy (replacing `sy_ent`) and KL-divergence (`sy_kl`) for the Gaussian distribution.
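As a rough sketch (not the official solution), the missing pieces for a diagonal Gaussian policy could look like the snippet below. The names `sy_ac_na`, `sy_oldmean_na`, and `sy_oldlogstd_a` are hypothetical placeholders for the actions actually taken and the pre-update policy parameters; the snippet also assumes `numpy` is imported as `np` and that `ac_dim` is the action dimensionality.

```python
# Sketch only -- sy_ac_na, sy_oldmean_na, sy_oldlogstd_a are illustrative names,
# not part of the starter code.
sy_std_a = tf.exp(logstd_a)                                        # shape [ac_dim]
sy_sampled_ac = mean_na + sy_std_a * tf.random_normal(tf.shape(mean_na))

# Log-density of the taken actions under the current diagonal Gaussian
sy_logprob_n = (-0.5 * tf.reduce_sum(tf.square((sy_ac_na - mean_na) / sy_std_a), axis=1)
                - tf.reduce_sum(logstd_a) - 0.5 * ac_dim * np.log(2.0 * np.pi))

# Differential entropy of a diagonal Gaussian (independent of the mean)
sy_ent = tf.reduce_sum(logstd_a) + 0.5 * ac_dim * np.log(2.0 * np.pi * np.e)

# KL(old policy || new policy), averaged over the batch
sy_oldstd_a = tf.exp(sy_oldlogstd_a)
sy_kl = tf.reduce_mean(tf.reduce_sum(
    logstd_a - sy_oldlogstd_a
    + (tf.square(sy_oldstd_a) + tf.square(sy_oldmean_na - mean_na)) / (2.0 * tf.square(sy_std_a))
    - 0.5, axis=1))
```

Feeding `sy_oldmean_na` and `sy_oldlogstd_a` with values saved before the gradient step reproduces the KL[policy before update || policy after update] diagnostic described above.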
32 | 33 | The pendulum problem is slightly harder, and using a fixed stepsize does not work reliably---thus, we instead recommend using an adaptive stepsize, where you adjust it based on the KL divergence between the new and old policy. Code for this stepsize adaptation is provided. 34 | 35 | You can plot your results using the script `plot_learning_curves.py` or your own plotting code. 36 | 37 | **Deliverables** 38 | 39 | - Show a plot with the pendulum converging to EpRewMean of at least `-300`. Include EpRewMean, KL, Entropy in your plots. 40 | - Describe the hyperparameters used and how many timesteps your algorithm took to learn. 41 | 42 | ## Problem 2 43 | 44 | 1. Implement a neural network value function with the same interface as `LinearVF`. Add it to the provided cartpole solver, and compare the performance of the linear and neural network value function (i.e., baseline). 45 | 2. Perform the same comparison--linear vs neural network--for your pendulum solver from Problem 1. You should be able to obtain faster learning using the neural network. 46 | 47 | 48 | **Deliverables** 49 | 50 | - A comparison of linear vs neural network value function on the cartpole. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 51 | - A comparison of linear vs neural network value function on the pendulum. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 52 | 53 | In both cases, list the hyperparameters used for neural network training. 54 | 55 | ## Problem 3 (bonus) 56 | 57 | Implement a more advanced policy gradient method from lecture (such as TRPO, or the advantage function estimator used in A3C or generalized advantage estimation), and apply it to the gym environment `Hopper-v1`. See if you can learn a good gait in less than 500,000 timesteps. 58 | Hint: it may help to standardize your inputs using a running estimate of mean and standard deviation. 59 | 60 | ob_rescaled = (ob_raw - mean) / (stdev + epsilon) 61 | 62 | **Deliverables** 63 | 64 | A description of what you implemented, and learning curves on the Hopper-v1 environment. 
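As a concrete illustration of the input-standardization hint in Problem 3, the helper below keeps running estimates of the observation mean and standard deviation. The class name and update rule are illustrative only and are not part of the starter code.

```python
import numpy as np

class RunningNorm:
    """Hypothetical helper: running mean/std of observations, used to rescale inputs."""
    def __init__(self, shape, epsilon=1e-8):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4          # small prior count to avoid division by zero
        self.epsilon = epsilon

    def update(self, obs_batch):
        obs_batch = np.asarray(obs_batch)
        batch_mean = obs_batch.mean(axis=0)
        batch_var = obs_batch.var(axis=0)
        batch_count = obs_batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        # Parallel mean/variance combination (Chan et al. style update)
        self.mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
        self.count = total

    def normalize(self, ob_raw):
        return (ob_raw - self.mean) / (np.sqrt(self.var) + self.epsilon)
```

A solver would call `update` on each new batch of observations and then feed `normalize(ob_raw)` to the policy, matching the `ob_rescaled` formula above.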
-------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | env.close() 80 | 81 | def get_available_gpus(): 82 | from tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | 
env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /sp17_hw/hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | env.close() 80 | 81 | def get_available_gpus(): 82 | from 
tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /sp17_hw/hw2/frozen_lake.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from six import StringIO, b 4 | 5 | from gym import utils 6 | import discrete_env 7 | 8 | LEFT = 0 9 | DOWN = 1 10 | RIGHT = 2 11 | UP = 3 12 | 13 | MAPS = { 14 | "4x4": [ 15 | "SFFF", 16 | "FHFH", 17 | "FFFH", 18 | "HFFG" 19 | ], 20 | "8x8": [ 21 | "SFFFFFFF", 22 | "FFFFFFFF", 23 | "FFFHFFFF", 24 | "FFFFFHFF", 25 | "FFFHFFFF", 26 | "FHHFFFHF", 27 | "FHFFHFHF", 28 | "FFFHFFFG" 29 | ], 30 | } 31 | 32 | class FrozenLakeEnv(discrete_env.DiscreteEnv): 33 | """ 34 | Winter is here. You and your friends were tossing around a frisbee at the park 35 | when you made a wild throw that left the frisbee out in the middle of the lake. 36 | The water is mostly frozen, but there are a few holes where the ice has melted. 37 | If you step into one of those holes, you'll fall into the freezing water. 38 | At this time, there's an international frisbee shortage, so it's absolutely imperative that 39 | you navigate across the lake and retrieve the disc. 40 | However, the ice is slippery, so you won't always move in the direction you intend. 41 | The surface is described using a grid like the following 42 | 43 | SFFF 44 | FHFH 45 | FFFH 46 | HFFG 47 | 48 | S : starting point, safe 49 | F : frozen surface, safe 50 | H : hole, fall to your doom 51 | G : goal, where the frisbee is located 52 | 53 | The episode ends when you reach the goal or fall in a hole. 54 | You receive a reward of 1 if you reach the goal, and zero otherwise. 
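For example, with the default is_slippery=True, an intended RIGHT move on a frozen square succeeds with probability 0.8 and instead slips UP or DOWN with probability 0.1 each (see the transition table P constructed below).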
55 | 56 | """ 57 | 58 | metadata = {'render.modes': ['human', 'ansi']} 59 | 60 | def __init__(self, desc=None, map_name="4x4",is_slippery=True): 61 | if desc is None and map_name is None: 62 | raise ValueError('Must provide either desc or map_name') 63 | elif desc is None: 64 | desc = MAPS[map_name] 65 | self.desc = desc = np.asarray(desc,dtype='c') 66 | self.nrow, self.ncol = nrow, ncol = desc.shape 67 | 68 | nA = 4 69 | nS = nrow * ncol 70 | 71 | isd = np.array(desc == b'S').astype('float64').ravel() 72 | isd /= isd.sum() 73 | 74 | P = {s : {a : [] for a in range(nA)} for s in range(nS)} 75 | 76 | def to_s(row, col): 77 | return row*ncol + col 78 | def inc(row, col, a): 79 | if a==0: # left 80 | col = max(col-1,0) 81 | elif a==1: # down 82 | row = min(row+1,nrow-1) 83 | elif a==2: # right 84 | col = min(col+1,ncol-1) 85 | elif a==3: # up 86 | row = max(row-1,0) 87 | return (row, col) 88 | 89 | for row in range(nrow): 90 | for col in range(ncol): 91 | s = to_s(row, col) 92 | for a in range(4): 93 | li = P[s][a] 94 | letter = desc[row, col] 95 | if letter in b'GH': 96 | li.append((1.0, s, 0, True)) 97 | else: 98 | if is_slippery: 99 | for b in [(a-1)%4, a, (a+1)%4]: 100 | newrow, newcol = inc(row, col, b) 101 | newstate = to_s(newrow, newcol) 102 | newletter = desc[newrow, newcol] 103 | done = bytes(newletter) in b'GH' 104 | rew = float(newletter == b'G') 105 | li.append((0.8 if b==a else 0.1, newstate, rew, done)) 106 | else: 107 | newrow, newcol = inc(row, col, a) 108 | newstate = to_s(newrow, newcol) 109 | newletter = desc[newrow, newcol] 110 | done = bytes(newletter) in b'GH' 111 | rew = float(newletter == b'G') 112 | li.append((1.0, newstate, rew, done)) 113 | 114 | super(FrozenLakeEnv, self).__init__(nS, nA, P, isd) 115 | 116 | def _render(self, mode='human', close=False): 117 | if close: 118 | return 119 | outfile = StringIO() if mode == 'ansi' else sys.stdout 120 | 121 | row, col = self.s // self.ncol, self.s % self.ncol 122 | desc = self.desc.tolist() 123 | desc = [[c.decode('utf-8') for c in line] for line in desc] 124 | desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True) 125 | if self.lastaction is not None: 126 | outfile.write(" ({})\n".format(["Left","Down","Right","Up"][self.lastaction])) 127 | else: 128 | outfile.write("\n") 129 | outfile.write("\n".join(''.join(line) for line in desc)+"\n") 130 | 131 | return outfile 132 | -------------------------------------------------------------------------------- /hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 
12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /sp17_hw/hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 
42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /hw2/TestNoteBook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 13 | " return f(*args, **kwds)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import tensorflow as tf" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "def build_mlp(\n", 30 | " input_placeholder, \n", 31 | " output_size,\n", 32 | " scope, \n", 33 | " n_layers=2, \n", 34 | " size=64, \n", 35 | " activation=tf.tanh,\n", 36 | " output_activation=None\n", 37 | " ):\n", 38 | " with tf.variable_scope(scope):\n", 39 | " x = input_placeholder\n", 40 | " while n_layers > 0:\n", 41 | " x = tf.layers.dense(x, size)\n", 42 | " x = activation(x)\n", 43 | " n_layers-=1\n", 44 | " x = tf.layers.dense(x, output_size)\n", 45 | " if output_activation != None:\n", 46 | " x = output_activation(x)\n", 47 | " return x" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "in_place = tf.placeholder(tf.float32, shape = (1024, 1024))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 
| "outputs": [], 68 | "source": [ 69 | "mlp = build_mlp(in_place, 64, \"test1\")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "TensorShape([Dimension(1024), Dimension(64)])" 81 | ] 82 | }, 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "mlp.shape" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "import numpy as np" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 7, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "ob_dim = 50\n", 112 | "ac_dim = 100\n", 113 | "discrete = True" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 8, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "sy_ob_no = tf.placeholder(shape=[None, ob_dim], name=\"ob\", dtype=tf.float32)\n", 123 | "if discrete:\n", 124 | " sy_ac_na = tf.placeholder(shape=[None], name=\"ac\", dtype=tf.int32) \n", 125 | "else:\n", 126 | " sy_ac_na = tf.placeholder(shape=[None, ac_dim], name=\"ac\", dtype=tf.float32) \n", 127 | "\n", 128 | "sy_adv_n = tf.placeholder(shape = [None], name = \"adv\", dtype = tf.float32)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 9, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/plain": [ 139 | "TensorShape([Dimension(None)])" 140 | ] 141 | }, 142 | "execution_count": 9, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "sy_adv_n.shape" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "sy_mean = build_mlp(sy_ob_no, ac_dim, \"policy\")\n", 158 | "sy_logstd = tf.Variable(tf.zeros([1, ac_dim], name = 'logstd'))\n", 159 | "sy_std = tf.exp(sy_logstd)\n", 160 | "sy_sampled_ac = sy_mean + sy_std * tf.random_normal(tf.shape(sy_mean))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 11, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "TensorShape([Dimension(None), Dimension(100)])" 172 | ] 173 | }, 174 | "execution_count": 11, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "sy_sampled_ac.shape" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 48, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "TensorShape([Dimension(64), Dimension(200)])" 192 | ] 193 | }, 194 | "execution_count": 48, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "(tf.Variable(tf.zeros([1, 200])) * tf.Variable(tf.zeros([64, 200]))).shape" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 22, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "x = np.array([1,2,3,4])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 26, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "32" 223 | ] 224 | }, 225 | "execution_count": 26, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "2 ** 
5" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 28, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "sample1 = np.random.normal([1024, 64])" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 31, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "real_values = tf.placeholder(shape = [1024, 64], dtype = tf.float32)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 33, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "sample2 = np.random.normal(size = [1024, 64], loc = 2)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 43, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "sample1 = np.random.normal(size = [1024, 1024])" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 39, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "loss = tf.nn.l2_loss(mlp - real_values)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 41, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "train1 = tf.train.AdamOptimizer(0.005).minimize(loss)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 42, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "sess = tf.InteractiveSession()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 46, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "sess.run(train1, feed_dict = {in_place : sample1, real_values : sample2})" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 45, 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "tf.global_variables_initializer().run()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "Python 3", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.6.3" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 2 350 | } 351 | -------------------------------------------------------------------------------- /hw3/Testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 13 | " return f(*args, **kwds)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import tensorflow as tf\n", 20 | "import gym" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "benchmark = gym.benchmark_spec('Atari40M')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | 
"execution_count": 3, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "task = benchmark.tasks[3]" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 6, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "from run_dqn_atari import *" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 7, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | "[2018-06-08 23:37:20,714] Making new env: PongNoFrameskip-v4\n", 66 | "[2018-06-08 23:37:21,102] Clearing 2 monitor files from previous run (because force=True was provided)\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "env = get_env(task, 0)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 10, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "action = env.action_space.sample()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 13, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "[2018-06-08 23:38:59,068] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.0.6079.video000000.mp4\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "observation = env.reset()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 15, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "(84, 84, 1)" 109 | ] 110 | }, 111 | "execution_count": 15, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "observation.shape" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 16, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "Discrete(6)" 129 | ] 130 | }, 131 | "execution_count": 16, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "env.action_space" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 17, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "(84, 84, 1)" 149 | ] 150 | }, 151 | "execution_count": 17, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "env.observation_space.shape" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 28, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "sample_space = tf.placeholder(shape = (64, 84, 84, 4), dtype = tf.float32)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 19, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "num_actions = env.action_space.n" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 21, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "6" 191 | ] 192 | }, 193 | "execution_count": 21, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "num_actions" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 29, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "model1 = atari_model(sample_space, num_actions, \"test2\")" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 30, 214 | "metadata": {}, 215 | 
"outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "TensorShape([Dimension(64), Dimension(6)])" 220 | ] 221 | }, 222 | "execution_count": 30, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "model1.get_shape()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 32, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "v1 = tf.Variable([1,6,3,4,5,0], dtype = tf.float32)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 33, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "sess = tf.InteractiveSession()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 36, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/plain": [ 261 | "1" 262 | ] 263 | }, 264 | "execution_count": 36, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "sess.run(tf.argmax(v1))" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 57, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "sess.run(tf.global_variables_initializer())" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 37, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "6.0" 293 | ] 294 | }, 295 | "execution_count": 37, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "sess.run(tf.reduce_max(v1))" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 39, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "v2 = tf.Variable([1,6,3,4,5,0], dtype = tf.uint8)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 41, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "array([[ 0., 1., 0., 0., 0., 0., 0.],\n", 322 | " [ 0., 0., 0., 0., 0., 0., 1.],\n", 323 | " [ 0., 0., 0., 1., 0., 0., 0.],\n", 324 | " [ 0., 0., 0., 0., 1., 0., 0.],\n", 325 | " [ 0., 0., 0., 0., 0., 1., 0.],\n", 326 | " [ 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)" 327 | ] 328 | }, 329 | "execution_count": 41, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "sess.run(tf.one_hot(v2, 7))" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 46, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "array([ 0. , -29.96999931, -5.99399996, -11.98799992,\n", 347 | " -19.97999954, 0. 
], dtype=float32)" 348 | ] 349 | }, 350 | "execution_count": 46, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "sess.run(0.999 * (1 - v1) * v1)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 56, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "ph2 = tf.placeholder(tf.float32, shape = [6])\n", 366 | "optimizer = tf.train.AdamOptimizer().minimize(tf.nn.l2_loss(tf.reduce_max(ph2) - tf.reduce_max(v1)))" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 64, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "None\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "print(sess.run(optimizer, {ph2: [1,6,3,4,5,0]}))" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 66, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "[ 0. 0. 0. 1.]\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "a = tf.placeholder(dtype=tf.float32, shape=[4])\n", 401 | "b = tf.reduce_max(a)\n", 402 | "c = tf.gradients([b], [a])[0]\n", 403 | "with tf.Session() as sess:\n", 404 | " v = np.asarray([1, 2, 3, 4], dtype='float32')\n", 405 | " print(sess.run(c, feed_dict={a:v}))" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 67, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "import gym" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 68, 422 | "metadata": { 423 | "collapsed": true 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "benchmark = gym.benchmark_spec('Atari40M')" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 69, 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "task = benchmark.tasks[3]" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 70, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "data": { 448 | "text/plain": [ 449 | "40000000" 450 | ] 451 | }, 452 | "execution_count": 70, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | }, 456 | { 457 | "name": "stderr", 458 | "output_type": "stream", 459 | "text": [ 460 | "[2018-06-14 13:16:55,320] Finished writing results. 
You can upload them to the scoreboard via gym.upload('/tmp/hw3_vid_dir2/gym')\n"
461 | ]
462 | }
463 | ],
464 | "source": [
465 | "task.max_timesteps"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": null,
471 | "metadata": {
472 | "collapsed": true
473 | },
474 | "outputs": [],
475 | "source": []
476 | }
477 | ],
478 | "metadata": {
479 | "kernelspec": {
480 | "display_name": "Python 3",
481 | "language": "python",
482 | "name": "python3"
483 | },
484 | "language_info": {
485 | "codemirror_mode": {
486 | "name": "ipython",
487 | "version": 3
488 | },
489 | "file_extension": ".py",
490 | "mimetype": "text/x-python",
491 | "name": "python",
492 | "nbconvert_exporter": "python",
493 | "pygments_lexer": "ipython3",
494 | "version": "3.6.3"
495 | }
496 | },
497 | "nbformat": 4,
498 | "nbformat_minor": 2
499 | }
500 |
-------------------------------------------------------------------------------- /hw4/main.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import gym
4 | from dynamics import NNDynamicsModel
5 | from controllers import MPCcontroller, RandomController
6 | from cost_functions import cheetah_cost_fn, trajectory_cost_fn
7 | import time
8 | import logz
9 | import os
10 | import copy
11 | import matplotlib.pyplot as plt
12 | from cheetah_env import HalfCheetahEnvNew
13 | from tqdm import tqdm
14 | import matplotlib.pyplot as plt
15 | from time import gmtime, strftime
16 |
17 | def sample(env,
18 |            controller,
19 |            num_paths=10,
20 |            horizon=1000,
21 |            render=False,
22 |            verbose=False):
23 |     """
24 |     Write a sampler function which takes in an environment, a controller (either random or the MPC controller),
25 |     and returns rollouts by running on the env.
26 |     Each path can have elements for observations, next_observations, rewards, returns, actions, etc.
27 |     """
28 |     paths = []
29 |     iterator = range(num_paths)
30 |     if verbose:
31 |         iterator = tqdm(iterator)
32 |     for _ in iterator:
33 |         ob = env.reset()
34 |         if render:
35 |             env.render()
36 |         obs, next_obs, actions, rewards = [], [], [], []
37 |         steps = 0
38 |         while True:
39 |             obs.append(ob)
40 |             action = controller.get_action(ob)
41 |             actions.append(action)
42 |             ob, reward, done, _ = env.step(action)
43 |             next_obs.append(ob)
44 |             rewards.append(reward)
45 |             steps += 1
46 |             if done or steps >= horizon:
47 |                 break
48 |         paths.append({
49 |             'observations': np.array(obs),
50 |             'actions': np.array(actions),
51 |             'next_observations': np.array(next_obs),
52 |             'rewards': np.array(rewards)})  # rewards are needed for the return logging in train()
53 |
54 |     return paths
55 |
56 | # Utility to compute the cost of a path for a given cost function
57 | def path_cost(cost_fn, path):
58 |     return trajectory_cost_fn(cost_fn, path['observations'], path['actions'], path['next_observations'])
59 |
60 | def compute_normalization(data):
61 |     """
62 |     Write a function to take in a dataset and compute the means, and stds.
63 |     Return 6 elements: mean of s_t, std of s_t, mean of (s_t+1 - s_t), std of (s_t+1 - s_t), mean of actions, std of actions
64 |     """
65 |     obs = np.concatenate([d['observations'] for d in data])
66 |     next_obs = np.concatenate([d['next_observations'] for d in data])
67 |     actions = np.concatenate([d['actions'] for d in data])
68 |
69 |     mean_obs = np.mean(obs, axis=0)  # per-dimension statistics, so each state/action component is normalized separately
70 |     std_obs = np.std(obs, axis=0)
71 |     mean_deltas = np.mean(next_obs - obs, axis=0)
72 |     std_deltas = np.std(next_obs - obs, axis=0)
73 |     mean_action = np.mean(actions, axis=0)
74 |     std_action = np.std(actions, axis=0)
75 |
76 |     return mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action
77 |
78 |
79 | def plot_comparison(env, dyn_model):
80 |     """
81 |     Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions.
82 |     """
83 |     real_obs, pred_obs = [], []
84 |     ob = env.reset()
85 |     i = 0
86 |     while True:
87 |         action = env.action_space.sample()
88 |         pred_ob = dyn_model.predict(ob, action)
89 |         real_ob, _, done, _ = env.step(action)
90 |         real_obs.append(real_ob)
91 |         pred_obs.append(pred_ob)
92 |         ob, i = real_ob, i + 1  # advance the true state so every prediction is a one-step prediction
93 |         if done:
94 |             break
95 |     abs_diff = np.abs(np.array(real_obs) - np.array(pred_obs))
96 |     plt.plot(np.arange(i), abs_diff)
97 |     time_str = strftime("%Y-%m-%d_%H:%M:%S", gmtime())
98 |     plt.savefig("figures/" + time_str + ".png")
99 |
100 | def train(env,
101 |          cost_fn,
102 |          logdir=None,
103 |          render=False,
104 |          learning_rate=1e-3,
105 |          onpol_iters=10,
106 |          dynamics_iters=60,
107 |          batch_size=512,
108 |          num_paths_random=10,
109 |          num_paths_onpol=10,
110 |          num_simulated_paths=10000,
111 |          env_horizon=1000,
112 |          mpc_horizon=15,
113 |          n_layers=2,
114 |          size=500,
115 |          activation=tf.nn.relu,
116 |          output_activation=None
117 |          ):
118 |
119 |     """
120 |
121 |     Arguments:
122 |
123 |     onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.
124 |
125 |     dynamics_iters              Number of iterations of training for the dynamics model
126 |                                 |_ which happen per iteration of the aggregation loop.
127 |
128 |     batch_size                  Batch size for dynamics training.
129 |
130 |     num_paths_random            Number of paths/trajectories/rollouts generated
131 |                                 | by a random agent. We use these to train our
132 |                                 |_ initial dynamics model.
133 |
134 |     num_paths_onpol             Number of paths to collect at each iteration of
135 |                                 |_ aggregation, using the Model Predictive Control policy.
136 |
137 |     num_simulated_paths         How many fictitious rollouts the MPC policy
138 |                                 | should generate each time it is asked for an
139 |                                 |_ action.
140 |
141 |     env_horizon                 Number of timesteps in each path.
142 |
143 |     mpc_horizon                 The MPC policy generates actions by imagining
144 |                                 | fictitious rollouts, and picking the first action
145 |                                 | of the best fictitious rollout. This argument is
146 |                                 | how many timesteps should be in each fictitious
147 |                                 |_ rollout.
148 |
149 |     n_layers/size/activations   Neural network architecture arguments.
150 |
151 |     """
152 |
153 |     logz.configure_output_dir(logdir)
154 |
155 |     #========================================================
156 |     #
157 |     # First, we need a lot of data generated by a random
158 |     # agent, with which we'll begin to train our dynamics
159 |     # model.
160 |
161 |     random_controller = RandomController(env)
162 |
163 |     data = sample(env, random_controller, num_paths_random, env_horizon)
164 |
165 |     #========================================================
166 |     #
167 |     # The random data will be used to get statistics (mean
168 |     # and std) for the observations, actions, and deltas
169 |     # (where deltas are o_{t+1} - o_t). These will be used
170 |     # for normalizing inputs and denormalizing outputs
171 |     # from the dynamics network.
172 |     #
173 |     normalization = compute_normalization(data)
174 |
175 |
176 |     #========================================================
177 |     #
178 |     # Build dynamics model and MPC controllers.
179 |     #
180 |     sess = tf.Session()
181 |
182 |     dyn_model = NNDynamicsModel(env=env,
183 |                                 n_layers=n_layers,
184 |                                 size=size,
185 |                                 activation=activation,
186 |                                 output_activation=output_activation,
187 |                                 normalization=normalization,
188 |                                 batch_size=batch_size,
189 |                                 iterations=dynamics_iters,
190 |                                 learning_rate=learning_rate,
191 |                                 sess=sess)
192 |
193 |     mpc_controller = MPCcontroller(env=env,
194 |                                    dyn_model=dyn_model,
195 |                                    horizon=mpc_horizon,
196 |                                    cost_fn=cost_fn,
197 |                                    num_simulated_paths=num_simulated_paths)
198 |
199 |
200 |     #========================================================
201 |     #
202 |     # Tensorflow session building.
203 |     #
204 |     sess.__enter__()
205 |     tf.global_variables_initializer().run()
206 |
207 |     #========================================================
208 |     #
209 |     # Run 'onpol_iters' iterations of on-policy aggregation: at each iteration, refit the dynamics model to the current dataset, then collect on-policy samples with the MPC controller and aggregate them into the dataset.
210 |     # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
211 |     #
212 |     for itr in range(onpol_iters):
213 |         dyn_model.fit(data)
214 |         data_new = sample(env, mpc_controller, num_paths_onpol, env_horizon)
215 |         data = np.concatenate([data, data_new])
216 |
217 |         returns = [np.sum(path['rewards']) for path in data_new]  # true env return of each MPC rollout
218 |         costs = [path_cost(cost_fn, path) for path in data_new]
219 |
220 |         # LOGGING
221 |         # Statistics for performance of MPC policy using
222 |         # our learned dynamics model
223 |         logz.log_tabular('Iteration', itr)
224 |         # In terms of cost function which your MPC controller uses to plan
225 |         logz.log_tabular('AverageCost', np.mean(costs))
226 |         logz.log_tabular('StdCost', np.std(costs))
227 |         logz.log_tabular('MinimumCost', np.min(costs))
228 |         logz.log_tabular('MaximumCost', np.max(costs))
229 |         # In terms of true environment reward of your rolled out trajectory using the MPC controller
230 |         logz.log_tabular('AverageReturn', np.mean(returns))
231 |         logz.log_tabular('StdReturn', np.std(returns))
232 |         logz.log_tabular('MinimumReturn', np.min(returns))
233 |         logz.log_tabular('MaximumReturn', np.max(returns))
234 |
235 |         logz.dump_tabular()
236 |
237 | def main():
238 |
239 |     import argparse
240 |     parser = argparse.ArgumentParser()
241 |     parser.add_argument('--env_name', type=str, default='HalfCheetah-v1')
242 |     # Experiment meta-params
243 |     parser.add_argument('--exp_name', type=str, default='mb_mpc')
244 |     parser.add_argument('--seed', type=int, default=3)
245 |     parser.add_argument('--render', action='store_true')
246 |     # Training args
247 |     parser.add_argument('--learning_rate', '-lr', type=float, default=1e-3)
248 |     parser.add_argument('--onpol_iters', '-n', type=int, default=1)
249 |     parser.add_argument('--dyn_iters', '-nd', type=int, default=60)
250 |     parser.add_argument('--batch_size', '-b', type=int, default=512)
251 |     # Data collection
252 |     parser.add_argument('--random_paths', '-r', type=int, default=10)
253 |     parser.add_argument('--onpol_paths', '-d', type=int, default=10)
254 |     parser.add_argument('--simulated_paths', '-sp', type=int, default=1000)
255 |     parser.add_argument('--ep_len', '-ep', type=int, default=1000)
256 |     # Neural network architecture args
257 |     parser.add_argument('--n_layers', '-l', type=int, default=2)
258 |     parser.add_argument('--size', '-s', type=int, default=500)
259 |     # MPC Controller
260 |     parser.add_argument('--mpc_horizon', '-m', type=int, default=15)
261 |     args = parser.parse_args()
262 |
263 |     # Set seed
264 |     np.random.seed(args.seed)
265 |     tf.set_random_seed(args.seed)
266 |
267 |     # Make data directory if it does not already exist
268 |     if not(os.path.exists('data')):
269 |         os.makedirs('data')
270 |     logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
271 |     logdir = os.path.join('data', logdir)
272 |     if not(os.path.exists(logdir)):
273 |         os.makedirs(logdir)
274 |
275 |     # Make env
276 |     if args.env_name == "HalfCheetah-v1":  # use '==', not 'is': identity comparison is not reliable for strings parsed from the command line
277 |         env = gym.make('HalfCheetah-v2')
278 |         cost_fn = cheetah_cost_fn
279 |         train(env=env,
280 |               cost_fn=cost_fn,
281 |               logdir=logdir,
282 |               render=args.render,
283 |               learning_rate=args.learning_rate,
284 |               onpol_iters=args.onpol_iters,
285 |               dynamics_iters=args.dyn_iters,
286 |               batch_size=args.batch_size,
287 |               num_paths_random=args.random_paths,
288 |               num_paths_onpol=args.onpol_paths,
289 |               num_simulated_paths=args.simulated_paths,
290 |               env_horizon=args.ep_len,
291 |               mpc_horizon=args.mpc_horizon,
292 |               n_layers=args.n_layers,
293 |               size=args.size,
294 |               activation=tf.nn.relu,
295 |               output_activation=None,
296 |               )
297 |
298 | if __name__ == "__main__":
299 |     main()
300 |
-------------------------------------------------------------------------------- /sp17_hw/hw3/dqn.py: --------------------------------------------------------------------------------
1 | import sys
2 | import gym.spaces
3 | import itertools
4 | import numpy as np
5 | import random
6 | import tensorflow as tf
7 | import tensorflow.contrib.layers as layers
8 | from collections import namedtuple
9 | from dqn_utils import *
10 |
11 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])
12 |
13 | def learn(env,
14 |           q_func,
15 |           optimizer_spec,
16 |           session,
17 |           exploration=LinearSchedule(1000000, 0.1),
18 |           stopping_criterion=None,
19 |           replay_buffer_size=1000000,
20 |           batch_size=32,
21 |           gamma=0.99,
22 |           learning_starts=50000,
23 |           learning_freq=4,
24 |           frame_history_len=4,
25 |           target_update_freq=10000,
26 |           grad_norm_clipping=10):
27 |     """Run Deep Q-learning algorithm.
28 |
29 |     You can specify your own convnet using q_func.
30 |
31 |     All schedules are w.r.t. total number of steps taken in the environment.
32 |
33 |     Parameters
34 |     ----------
35 |     env: gym.Env
36 |         gym environment to train on.
37 |     q_func: function
38 |         Model to use for computing the q function. It should accept the
39 |         following named arguments:
40 |             img_in: tf.Tensor
41 |                 tensorflow tensor representing the input image
42 |             num_actions: int
43 |                 number of actions
44 |             scope: str
45 |                 scope in which all the model related variables
46 |                 should be created
47 |             reuse: bool
48 |                 whether previously created variables should be reused.
49 | optimizer_spec: OptimizerSpec 50 | Specifying the constructor and kwargs, as well as learning rate schedule 51 | for the optimizer 52 | session: tf.Session 53 | tensorflow session to use. 54 | exploration: rl_algs.deepq.utils.schedules.Schedule 55 | schedule for probability of chosing random action. 56 | stopping_criterion: (env, t) -> bool 57 | should return true when it's ok for the RL algorithm to stop. 58 | takes in env and the number of steps executed so far. 59 | replay_buffer_size: int 60 | How many memories to store in the replay buffer. 61 | batch_size: int 62 | How many transitions to sample each time experience is replayed. 63 | gamma: float 64 | Discount Factor 65 | learning_starts: int 66 | After how many environment steps to start replaying experiences 67 | learning_freq: int 68 | How many steps of environment to take between every experience replay 69 | frame_history_len: int 70 | How many past frames to include as input to the model. 71 | target_update_freq: int 72 | How many experience replay rounds (not steps!) to perform between 73 | each update to the target Q network 74 | grad_norm_clipping: float or None 75 | If not None gradients' norms are clipped to this value. 76 | """ 77 | assert type(env.observation_space) == gym.spaces.Box 78 | assert type(env.action_space) == gym.spaces.Discrete 79 | 80 | ############### 81 | # BUILD MODEL # 82 | ############### 83 | 84 | if len(env.observation_space.shape) == 1: 85 | # This means we are running on low-dimensional observations (e.g. RAM) 86 | input_shape = env.observation_space.shape 87 | else: 88 | img_h, img_w, img_c = env.observation_space.shape 89 | input_shape = (img_h, img_w, frame_history_len * img_c) 90 | num_actions = env.action_space.n 91 | 92 | # set up placeholders 93 | # placeholder for current observation (or state) 94 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 95 | # placeholder for current action 96 | act_t_ph = tf.placeholder(tf.int32, [None]) 97 | # placeholder for current reward 98 | rew_t_ph = tf.placeholder(tf.float32, [None]) 99 | # placeholder for next observation (or state) 100 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 101 | # placeholder for end of episode mask 102 | # this value is 1 if the next state corresponds to the end of an episode, 103 | # in which case there is no Q-value at the next state; at the end of an 104 | # episode, only the current state reward contributes to the target, not the 105 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 106 | done_mask_ph = tf.placeholder(tf.float32, [None]) 107 | 108 | # casting to float on GPU ensures lower data transfer times. 109 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 110 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 111 | 112 | # Here, you should fill in your own code to compute the Bellman error. This requires 113 | # evaluating the current and next Q-values and constructing the corresponding error. 114 | # TensorFlow will differentiate this error for you, you just need to pass it to the 115 | # optimizer. See assignment text for details. 116 | # Your code should produce one scalar-valued tensor: total_error 117 | # This will be passed to the optimizer in the provided code below. 118 | # Your code should also produce two collections of variables: 119 | # q_func_vars 120 | # target_q_func_vars 121 | # These should hold all of the variables of the Q-function network and target network, 122 | # respectively. 
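    # For illustration only, a rough sketch of one way these pieces could fit together
    # (this is NOT the provided solution; it assumes the standard DQN target
    # y = r + gamma * (1 - done) * max_a' Q_target(s', a'), and huber_loss from dqn_utils):
    #
    #   q_values        = q_func(obs_t_float,   num_actions, scope="q_func",        reuse=False)
    #   target_q_values = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False)
    #   q_taken  = tf.reduce_sum(q_values * tf.one_hot(act_t_ph, num_actions), axis=1)
    #   q_target = rew_t_ph + gamma * (1.0 - done_mask_ph) * tf.reduce_max(target_q_values, axis=1)
    #   total_error = tf.reduce_mean(huber_loss(q_taken - tf.stop_gradient(q_target)))
    #   q_func_vars        = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="q_func")
    #   target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_q_func")
    #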
A convenient way to get these is to make use of TF's "scope" feature. 123 | # For example, you can create your Q-function network with the scope "q_func" like this: 124 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 125 | # And then you can obtain the variables like this: 126 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 127 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 128 | ###### 129 | 130 | # YOUR CODE HERE 131 | 132 | ###### 133 | 134 | # construct optimization op (with gradient clipping) 135 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 136 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 137 | train_fn = minimize_and_clip(optimizer, total_error, 138 | var_list=q_func_vars, clip_val=grad_norm_clipping) 139 | 140 | # update_target_fn will be called periodically to copy Q network to target Q network 141 | update_target_fn = [] 142 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 143 | sorted(target_q_func_vars, key=lambda v: v.name)): 144 | update_target_fn.append(var_target.assign(var)) 145 | update_target_fn = tf.group(*update_target_fn) 146 | 147 | # construct the replay buffer 148 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 149 | 150 | ############### 151 | # RUN ENV # 152 | ############### 153 | model_initialized = False 154 | num_param_updates = 0 155 | mean_episode_reward = -float('nan') 156 | best_mean_episode_reward = -float('inf') 157 | last_obs = env.reset() 158 | LOG_EVERY_N_STEPS = 10000 159 | 160 | for t in itertools.count(): 161 | ### 1. Check stopping criterion 162 | if stopping_criterion is not None and stopping_criterion(env, t): 163 | break 164 | 165 | ### 2. Step the env and store the transition 166 | # At this point, "last_obs" contains the latest observation that was 167 | # recorded from the simulator. Here, your code needs to store this 168 | # observation and its outcome (reward, next observation, etc.) into 169 | # the replay buffer while stepping the simulator forward one step. 170 | # At the end of this block of code, the simulator should have been 171 | # advanced one step, and the replay buffer should contain one more 172 | # transition. 173 | # Specifically, last_obs must point to the new latest observation. 174 | # Useful functions you'll need to call: 175 | # obs, reward, done, info = env.step(action) 176 | # this steps the environment forward one step 177 | # obs = env.reset() 178 | # this resets the environment if you reached an episode boundary. 179 | # Don't forget to call env.reset() to get a new observation if done 180 | # is true!! 181 | # Note that you cannot use "last_obs" directly as input 182 | # into your network, since it needs to be processed to include context 183 | # from previous frames. You should check out the replay buffer 184 | # implementation in dqn_utils.py to see what functionality the replay 185 | # buffer exposes. The replay buffer has a function called 186 | # encode_recent_observation that will take the latest observation 187 | # that you pushed into the buffer and compute the corresponding 188 | # input that should be given to a Q network by appending some 189 | # previous frames. 190 | # Don't forget to include epsilon greedy exploration! 
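        # For illustration only, a rough sketch of one possible implementation (not the
        # provided solution; it assumes a `q_values` tensor for the "q_func" network was
        # built above when computing the Bellman error):
        #
        #   idx = replay_buffer.store_frame(last_obs)
        #   if not model_initialized or random.random() < exploration.value(t):
        #       action = env.action_space.sample()
        #   else:
        #       recent = replay_buffer.encode_recent_observation()
        #       action = np.argmax(session.run(q_values, {obs_t_ph: recent[None]}))
        #   last_obs, reward, done, _ = env.step(action)
        #   replay_buffer.store_effect(idx, action, reward, done)
        #   if done:
        #       last_obs = env.reset()
        #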
191 | # And remember that the first time you enter this loop, the model 192 | # may not yet have been initialized (but of course, the first step 193 | # might as well be random, since you haven't trained your net...) 194 | 195 | ##### 196 | 197 | # YOUR CODE HERE 198 | 199 | ##### 200 | 201 | # at this point, the environment should have been advanced one step (and 202 | # reset if done was true), and last_obs should point to the new latest 203 | # observation 204 | 205 | ### 3. Perform experience replay and train the network. 206 | # note that this is only done if the replay buffer contains enough samples 207 | # for us to learn something useful -- until then, the model will not be 208 | # initialized and random actions should be taken 209 | if (t > learning_starts and 210 | t % learning_freq == 0 and 211 | replay_buffer.can_sample(batch_size)): 212 | # Here, you should perform training. Training consists of four steps: 213 | # 3.a: use the replay buffer to sample a batch of transitions (see the 214 | # replay buffer code for function definition, each batch that you sample 215 | # should consist of current observations, current actions, rewards, 216 | # next observations, and done indicator). 217 | # 3.b: initialize the model if it has not been initialized yet; to do 218 | # that, call 219 | # initialize_interdependent_variables(session, tf.global_variables(), { 220 | # obs_t_ph: obs_t_batch, 221 | # obs_tp1_ph: obs_tp1_batch, 222 | # }) 223 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 224 | # the current and next time step. The boolean variable model_initialized 225 | # indicates whether or not the model has been initialized. 226 | # Remember that you have to update the target network too (see 3.d)! 227 | # 3.c: train the model. To do this, you'll need to use the train_fn and 228 | # total_error ops that were created earlier: total_error is what you 229 | # created to compute the total Bellman error in a batch, and train_fn 230 | # will actually perform a gradient step and update the network parameters 231 | # to reduce total_error. When calling session.run on these you'll need to 232 | # populate the following placeholders: 233 | # obs_t_ph 234 | # act_t_ph 235 | # rew_t_ph 236 | # obs_tp1_ph 237 | # done_mask_ph 238 | # (this is needed for computing total_error) 239 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 240 | # (this is needed by the optimizer to choose the learning rate) 241 | # 3.d: periodically update the target network by calling 242 | # session.run(update_target_fn) 243 | # you should update every target_update_freq steps, and you may find the 244 | # variable num_param_updates useful for this (it was initialized to 0) 245 | ##### 246 | 247 | # YOUR CODE HERE 248 | 249 | ##### 250 | 251 | ### 4. 
Log progress 252 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 253 | if len(episode_rewards) > 0: 254 | mean_episode_reward = np.mean(episode_rewards[-100:]) 255 | if len(episode_rewards) > 100: 256 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 257 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 258 | print("Timestep %d" % (t,)) 259 | print("mean reward (100 episodes) %f" % mean_episode_reward) 260 | print("best mean reward %f" % best_mean_episode_reward) 261 | print("episodes %d" % len(episode_rewards)) 262 | print("exploration %f" % exploration.value(t)) 263 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 264 | sys.stdout.flush() 265 | -------------------------------------------------------------------------------- /sp17_hw/hw4/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import logz 5 | import scipy.signal 6 | 7 | def normc_initializer(std=1.0): 8 | """ 9 | Initialize array with normalized columns 10 | """ 11 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 12 | out = np.random.randn(*shape).astype(np.float32) 13 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 14 | return tf.constant(out) 15 | return _initializer 16 | 17 | 18 | def dense(x, size, name, weight_init=None): 19 | """ 20 | Dense (fully connected) layer 21 | """ 22 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 23 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer()) 24 | return tf.matmul(x, w) + b 25 | 26 | def fancy_slice_2d(X, inds0, inds1): 27 | """ 28 | Like numpy's X[inds0, inds1] 29 | """ 30 | inds0 = tf.cast(inds0, tf.int64) 31 | inds1 = tf.cast(inds1, tf.int64) 32 | shape = tf.cast(tf.shape(X), tf.int64) 33 | ncols = shape[1] 34 | Xflat = tf.reshape(X, [-1]) 35 | return tf.gather(Xflat, inds0 * ncols + inds1) 36 | 37 | def discount(x, gamma): 38 | """ 39 | Compute discounted sum of future values 40 | out[i] = in[i] + gamma * in[i+1] + gamma^2 * in[i+2] + ... 41 | """ 42 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 43 | 44 | def explained_variance_1d(ypred,y): 45 | """ 46 | Var[ypred - y] / var[y]. 
47 | https://www.quora.com/What-is-the-meaning-proportion-of-variance-explained-in-linear-regression 48 | """ 49 | assert y.ndim == 1 and ypred.ndim == 1 50 | vary = np.var(y) 51 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 52 | 53 | def categorical_sample_logits(logits): 54 | """ 55 | Samples (symbolically) from categorical distribution, where logits is a NxK 56 | matrix specifying N categorical distributions with K categories 57 | 58 | specifically, exp(logits) / sum( exp(logits), axis=1 ) is the 59 | probabilities of the different classes 60 | 61 | Cleverly uses gumbell trick, based on 62 | https://github.com/tensorflow/tensorflow/issues/456 63 | """ 64 | U = tf.random_uniform(tf.shape(logits)) 65 | return tf.argmax(logits - tf.log(-tf.log(U)), dimension=1) 66 | 67 | def pathlength(path): 68 | return len(path["reward"]) 69 | 70 | class LinearValueFunction(object): 71 | coef = None 72 | def fit(self, X, y): 73 | Xp = self.preproc(X) 74 | A = Xp.T.dot(Xp) 75 | nfeats = Xp.shape[1] 76 | A[np.arange(nfeats), np.arange(nfeats)] += 1e-3 # a little ridge regression 77 | b = Xp.T.dot(y) 78 | self.coef = np.linalg.solve(A, b) 79 | def predict(self, X): 80 | if self.coef is None: 81 | return np.zeros(X.shape[0]) 82 | else: 83 | return self.preproc(X).dot(self.coef) 84 | def preproc(self, X): 85 | return np.concatenate([np.ones([X.shape[0], 1]), X, np.square(X)/2.0], axis=1) 86 | 87 | class NnValueFunction(object): 88 | pass # YOUR CODE HERE 89 | 90 | def lrelu(x, leak=0.2): 91 | f1 = 0.5 * (1 + leak) 92 | f2 = 0.5 * (1 - leak) 93 | return f1 * x + f2 * abs(x) 94 | 95 | 96 | 97 | def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=True, logdir=None): 98 | env = gym.make("CartPole-v0") 99 | ob_dim = env.observation_space.shape[0] 100 | num_actions = env.action_space.n 101 | logz.configure_output_dir(logdir) 102 | vf = LinearValueFunction() 103 | 104 | # Symbolic variables have the prefix sy_, to distinguish them from the numerical values 105 | # that are computed later in these function 106 | sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations 107 | sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation 108 | sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate 109 | sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer 110 | sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer 111 | # we use a small initialization for the last layer, so the initial policy has maximal entropy 112 | sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) 113 | sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions 114 | sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) 115 | sy_n = tf.shape(sy_ob_no)[0] 116 | sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation 117 | 118 | # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> 119 | sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) 120 | sy_oldp_na = 
tf.exp(sy_oldlogp_na) 121 | sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) 122 | sy_p_na = tf.exp(sy_logp_na) 123 | sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n) 124 | # <<<<<<<<<<<<< 125 | 126 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 127 | 128 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 129 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 130 | 131 | tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 132 | # use single thread. on such a small problem, multithreading gives you a slowdown 133 | # this way, we can better use multiple cores for different experiments 134 | sess = tf.Session(config=tf_config) 135 | sess.__enter__() # equivalent to `with sess:` 136 | tf.global_variables_initializer().run() #pylint: disable=E1101 137 | 138 | total_timesteps = 0 139 | 140 | for i in range(n_iter): 141 | print("********** Iteration %i ************"%i) 142 | 143 | # Collect paths until we have enough timesteps 144 | timesteps_this_batch = 0 145 | paths = [] 146 | while True: 147 | ob = env.reset() 148 | terminated = False 149 | obs, acs, rewards = [], [], [] 150 | animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) 151 | while True: 152 | if animate_this_episode: 153 | env.render() 154 | obs.append(ob) 155 | ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) 156 | acs.append(ac) 157 | ob, rew, done, _ = env.step(ac) 158 | rewards.append(rew) 159 | if done: 160 | break 161 | path = {"observation" : np.array(obs), "terminated" : terminated, 162 | "reward" : np.array(rewards), "action" : np.array(acs)} 163 | paths.append(path) 164 | timesteps_this_batch += pathlength(path) 165 | if timesteps_this_batch > min_timesteps_per_batch: 166 | break 167 | total_timesteps += timesteps_this_batch 168 | # Estimate advantage function 169 | vtargs, vpreds, advs = [], [], [] 170 | for path in paths: 171 | rew_t = path["reward"] 172 | return_t = discount(rew_t, gamma) 173 | vpred_t = vf.predict(path["observation"]) 174 | adv_t = return_t - vpred_t 175 | advs.append(adv_t) 176 | vtargs.append(return_t) 177 | vpreds.append(vpred_t) 178 | 179 | # Build arrays for policy update 180 | ob_no = np.concatenate([path["observation"] for path in paths]) 181 | ac_n = np.concatenate([path["action"] for path in paths]) 182 | adv_n = np.concatenate(advs) 183 | standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) 184 | vtarg_n = np.concatenate(vtargs) 185 | vpred_n = np.concatenate(vpreds) 186 | vf.fit(ob_no, vtarg_n) 187 | 188 | # Policy update 189 | _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) 190 | kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na}) 191 | 192 | # Log diagnostics 193 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 194 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 195 | logz.log_tabular("KLOldNew", kl) 196 | logz.log_tabular("Entropy", ent) 197 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 198 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 199 | 
logz.log_tabular("TimestepsSoFar", total_timesteps) 200 | # If you're overfitting, EVAfter will be way larger than EVBefore. 201 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 202 | logz.dump_tabular() 203 | 204 | def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): 205 | tf.set_random_seed(seed) 206 | np.random.seed(seed) 207 | env = gym.make("Pendulum-v0") 208 | ob_dim = env.observation_space.shape[0] 209 | ac_dim = env.action_space.shape[0] 210 | logz.configure_output_dir(logdir) 211 | if vf_type == 'linear': 212 | vf = LinearValueFunction(**vf_params) 213 | elif vf_type == 'nn': 214 | vf = NnValueFunction(ob_dim=ob_dim, **vf_params) 215 | 216 | 217 | YOUR_CODE_HERE 218 | 219 | 220 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 221 | 222 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 223 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 224 | 225 | sess = tf.Session() 226 | sess.__enter__() # equivalent to `with sess:` 227 | tf.global_variables_initializer().run() #pylint: disable=E1101 228 | 229 | total_timesteps = 0 230 | stepsize = initial_stepsize 231 | 232 | for i in range(n_iter): 233 | print("********** Iteration %i ************"%i) 234 | 235 | YOUR_CODE_HERE 236 | 237 | if kl > desired_kl * 2: 238 | stepsize /= 1.5 239 | print('stepsize -> %s'%stepsize) 240 | elif kl < desired_kl / 2: 241 | stepsize *= 1.5 242 | print('stepsize -> %s'%stepsize) 243 | else: 244 | print('stepsize OK') 245 | 246 | 247 | # Log diagnostics 248 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 249 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 250 | logz.log_tabular("KLOldNew", kl) 251 | logz.log_tabular("Entropy", ent) 252 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 253 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 254 | logz.log_tabular("TimestepsSoFar", total_timesteps) 255 | # If you're overfitting, EVAfter will be way larger than EVBefore. 
256 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 257 | logz.dump_tabular() 258 | 259 | 260 | def main_pendulum1(d): 261 | return main_pendulum(**d) 262 | 263 | if __name__ == "__main__": 264 | if 1: 265 | main_cartpole(logdir=None) # when you want to start collecting results, set the logdir 266 | if 0: 267 | general_params = dict(gamma=0.97, animate=False, min_timesteps_per_batch=2500, n_iter=300, initial_stepsize=1e-3) 268 | params = [ 269 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 270 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 271 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 272 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 273 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 274 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 275 | ] 276 | import multiprocessing 277 | p = multiprocessing.Pool() 278 | p.map(main_pendulum1, params) 279 | -------------------------------------------------------------------------------- /hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 
59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 
144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happend if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(env, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 177 | 178 | The sepecific memory optimizations use here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (actually it is most time-performance 182 | to cast them back to float32 on GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the tipical use case in Atari Deep RL buffer with 1M frames the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning frame of zeros at the beginning 190 | of the episode, when there is less frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of memories to be retried for each observation. 200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | 226 | def sample(self, batch_size): 227 | """Sample `batch_size` different transitions. 
228 | 229 | i-th sample transition is the following: 230 | 231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 232 | after which reward `rew_batch[i]` was received and subsequent 233 | observation next_obs_batch[i] was observed, unless the epsiode 234 | was done which is represented by `done_mask[i]` which is equal 235 | to 1 if episode has ended as a result of that action. 236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample. 241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 281 | if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundry of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has potential to saves about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 
316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of action taken after obeserving frame stored 332 | at index idx. The reason `store_frame` and `store_effect` is broken 333 | up into two functions is so that once can call `encode_recent_observation` 334 | in between. 335 | 336 | Paramters 337 | --------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the actions was performed. 344 | done: bool 345 | True if episode was finished after performing that action. 346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /sp17_hw/hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. 
Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 
152 | #session.run(tf.initialize_variables([v]), feed_dict)
153 | session.run(tf.variables_initializer([v]), feed_dict)
154 | except tf.errors.FailedPreconditionError:
155 | new_vars_left.append(v)
156 | if len(new_vars_left) >= len(vars_left):
157 | # This can happen if the variables all depend on each other, or more likely if there's
158 | # another variable outside of the list that still needs to be initialized. This could be
159 | # detected here, but life's finite.
160 | raise Exception("Cycle in variable dependencies, or external precondition unsatisfied.")
161 | else:
162 | vars_left = new_vars_left
163 | 
164 | def get_wrapper_by_name(env, classname):
165 | currentenv = env
166 | while True:
167 | if classname in currentenv.__class__.__name__:
168 | return currentenv
169 | elif isinstance(currentenv, gym.Wrapper):
170 | currentenv = currentenv.env
171 | else:
172 | raise ValueError("Couldn't find wrapper named %s"%classname)
173 | 
174 | class ReplayBuffer(object):
175 | def __init__(self, size, frame_history_len):
176 | """This is a memory-efficient implementation of the replay buffer.
177 | 
178 | The specific memory optimizations used here are:
179 | - only store each frame once rather than k times
180 | even if every observation normally consists of the k last frames
181 | - store frames as np.uint8 (it is actually most time-efficient
182 | to cast them back to float32 on the GPU, to minimize memory transfer
183 | time)
184 | - store frame_t and frame_(t+1) in the same buffer.
185 | 
186 | For the typical Atari deep RL use case of a buffer with 1M frames, the total
187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes
188 | 
189 | Warning! Assumes that returning a frame of zeros at the beginning
190 | of the episode, when there are fewer frames than `frame_history_len`,
191 | is acceptable.
192 | 
193 | Parameters
194 | ----------
195 | size: int
196 | Max number of transitions to store in the buffer. When the buffer
197 | overflows the old memories are dropped.
198 | frame_history_len: int
199 | Number of memories to be retrieved for each observation.
200 | """
201 | self.size = size
202 | self.frame_history_len = frame_history_len
203 | 
204 | self.next_idx = 0
205 | self.num_in_buffer = 0
206 | 
207 | self.obs = None
208 | self.action = None
209 | self.reward = None
210 | self.done = None
211 | 
212 | def can_sample(self, batch_size):
213 | """Returns true if `batch_size` different transitions can be sampled from the buffer."""
214 | return batch_size + 1 <= self.num_in_buffer
215 | 
216 | def _encode_sample(self, idxes):
217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0)
218 | act_batch = self.action[idxes]
219 | rew_batch = self.reward[idxes]
220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0)
221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32)
222 | 
223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask
224 | 
225 | 
226 | def sample(self, batch_size):
227 | """Sample `batch_size` different transitions.
228 | 
229 | The i-th sampled transition is the following:
230 | 
231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken,
232 | after which reward `rew_batch[i]` was received and subsequent
233 | observation `next_obs_batch[i]` was observed, unless the episode
234 | was done, which is represented by `done_mask[i]`, equal
235 | to 1 if the episode ended as a result of that action.
236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample. 241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 281 | if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundry of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has potential to saves about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 
316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of action taken after obeserving frame stored 332 | at index idx. The reason `store_frame` and `store_effect` is broken 333 | up into two functions is so that once can call `encode_recent_observation` 334 | in between. 335 | 336 | Paramters 337 | --------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the actions was performed. 344 | done: bool 345 | True if episode was finished after performing that action. 346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | 11 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 12 | 13 | def learn(env, 14 | q_func, 15 | optimizer_spec, 16 | session, 17 | exploration=LinearSchedule(1000000, 0.1), 18 | stopping_criterion=None, 19 | replay_buffer_size=1000000, 20 | batch_size=32, 21 | gamma=0.99, 22 | learning_starts=50000, 23 | learning_freq=4, 24 | frame_history_len=4, 25 | target_update_freq=10000, 26 | grad_norm_clipping=10): 27 | """Run Deep Q-learning algorithm. 28 | 29 | You can specify your own convnet using q_func. 30 | 31 | All schedules are w.r.t. total number of steps taken in the environment. 32 | 33 | Parameters 34 | ---------- 35 | env: gym.Env 36 | gym environment to train on. 37 | q_func: function 38 | Model to use for computing the q function. It should accept the 39 | following named arguments: 40 | img_in: tf.Tensor 41 | tensorflow tensor representing the input image 42 | num_actions: int 43 | number of actions 44 | scope: str 45 | scope in which all the model related variables 46 | should be created 47 | reuse: bool 48 | whether previously created variables should be reused. 49 | optimizer_spec: OptimizerSpec 50 | Specifying the constructor and kwargs, as well as learning rate schedule 51 | for the optimizer 52 | session: tf.Session 53 | tensorflow session to use. 54 | exploration: rl_algs.deepq.utils.schedules.Schedule 55 | schedule for probability of chosing random action. 56 | stopping_criterion: (env, t) -> bool 57 | should return true when it's ok for the RL algorithm to stop. 58 | takes in env and the number of steps executed so far. 59 | replay_buffer_size: int 60 | How many memories to store in the replay buffer. 61 | batch_size: int 62 | How many transitions to sample each time experience is replayed. 
63 | gamma: float 64 | Discount Factor 65 | learning_starts: int 66 | After how many environment steps to start replaying experiences 67 | learning_freq: int 68 | How many steps of environment to take between every experience replay 69 | frame_history_len: int 70 | How many past frames to include as input to the model. 71 | target_update_freq: int 72 | How many experience replay rounds (not steps!) to perform between 73 | each update to the target Q network 74 | grad_norm_clipping: float or None 75 | If not None gradients' norms are clipped to this value. 76 | """ 77 | assert type(env.observation_space) == gym.spaces.Box 78 | assert type(env.action_space) == gym.spaces.Discrete 79 | 80 | ############### 81 | # BUILD MODEL # 82 | ############### 83 | 84 | if len(env.observation_space.shape) == 1: 85 | # This means we are running on low-dimensional observations (e.g. RAM) 86 | input_shape = env.observation_space.shape 87 | else: 88 | img_h, img_w, img_c = env.observation_space.shape 89 | input_shape = (img_h, img_w, frame_history_len * img_c) 90 | num_actions = env.action_space.n 91 | 92 | # set up placeholders 93 | # placeholder for current observation (or state) 94 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 95 | # placeholder for current action 96 | act_t_ph = tf.placeholder(tf.int32, [None]) 97 | # placeholder for current reward 98 | rew_t_ph = tf.placeholder(tf.float32, [None]) 99 | # placeholder for next observation (or state) 100 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 101 | # placeholder for end of episode mask 102 | # this value is 1 if the next state corresponds to the end of an episode, 103 | # in which case there is no Q-value at the next state; at the end of an 104 | # episode, only the current state reward contributes to the target, not the 105 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 106 | done_mask_ph = tf.placeholder(tf.float32, [None]) 107 | 108 | # casting to float on GPU ensures lower data transfer times. 109 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 110 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 111 | 112 | # Here, you should fill in your own code to compute the Bellman error. This requires 113 | # evaluating the current and next Q-values and constructing the corresponding error. 114 | # TensorFlow will differentiate this error for you, you just need to pass it to the 115 | # optimizer. See assignment text for details. 116 | # Your code should produce one scalar-valued tensor: total_error 117 | # This will be passed to the optimizer in the provided code below. 118 | # Your code should also produce two collections of variables: 119 | # q_func_vars 120 | # target_q_func_vars 121 | # These should hold all of the variables of the Q-function network and target network, 122 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 
123 | # For example, you can create your Q-function network with the scope "q_func" like this: 124 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 125 | # And then you can obtain the variables like this: 126 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 127 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 128 | ###### 129 | 130 | q_t = q_func(obs_t_float, num_actions, scope = "q_func", reuse = False) 131 | q_t1 = q_func(obs_tp1_float, num_actions, scope = "target_q_func", reuse = False) 132 | best_action = tf.argmax(q_t, axis = 1) 133 | max_q = tf.reduce_max(q_t1, axis = 1) 134 | y = rew_t_ph + gamma * tf.multiply((1.0 - done_mask_ph), max_q) 135 | q_t_taken = tf.reduce_sum(tf.multiply(q_t, tf.one_hot(act_t_ph, num_actions)), axis = 1) 136 | 137 | total_error = tf.losses.mean_squared_error(y, q_t_taken) 138 | 139 | q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 140 | target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func') 141 | 142 | ###### 143 | 144 | # construct optimization op (with gradient clipping) 145 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 146 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 147 | train_fn = minimize_and_clip(optimizer, total_error, 148 | var_list=q_func_vars, clip_val=grad_norm_clipping) 149 | 150 | # update_target_fn will be called periodically to copy Q network to target Q network 151 | update_target_fn = [] 152 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 153 | sorted(target_q_func_vars, key=lambda v: v.name)): 154 | update_target_fn.append(var_target.assign(var)) 155 | update_target_fn = tf.group(*update_target_fn) 156 | 157 | # construct the replay buffer 158 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 159 | 160 | ############### 161 | # RUN ENV # 162 | ############### 163 | model_initialized = False 164 | num_param_updates = 0 165 | mean_episode_reward = -float('nan') 166 | best_mean_episode_reward = -float('inf') 167 | last_obs = env.reset() 168 | LOG_EVERY_N_STEPS = 1000 #10000 169 | 170 | for t in itertools.count(): 171 | ### 1. Check stopping criterion 172 | if stopping_criterion is not None and stopping_criterion(env, t): 173 | break 174 | 175 | ### 2. Step the env and store the transition 176 | # At this point, "last_obs" contains the latest observation that was 177 | # recorded from the simulator. Here, your code needs to store this 178 | # observation and its outcome (reward, next observation, etc.) into 179 | # the replay buffer while stepping the simulator forward one step. 180 | # At the end of this block of code, the simulator should have been 181 | # advanced one step, and the replay buffer should contain one more 182 | # transition. 183 | # Specifically, last_obs must point to the new latest observation. 184 | # Useful functions you'll need to call: 185 | # obs, reward, done, info = env.step(action) 186 | # this steps the environment forward one step 187 | # obs = env.reset() 188 | # this resets the environment if you reached an episode boundary. 189 | # Don't forget to call env.reset() to get a new observation if done 190 | # is true!! 191 | # Note that you cannot use "last_obs" directly as input 192 | # into your network, since it needs to be processed to include context 193 | # from previous frames. 
You should check out the replay buffer 194 | # implementation in dqn_utils.py to see what functionality the replay 195 | # buffer exposes. The replay buffer has a function called 196 | # encode_recent_observation that will take the latest observation 197 | # that you pushed into the buffer and compute the corresponding 198 | # input that should be given to a Q network by appending some 199 | # previous frames. 200 | # Don't forget to include epsilon greedy exploration! 201 | # And remember that the first time you enter this loop, the model 202 | # may not yet have been initialized (but of course, the first step 203 | # might as well be random, since you haven't trained your net...) 204 | 205 | ##### 206 | 207 | idx = replay_buffer.store_frame(last_obs) 208 | 209 | if not model_initialized: 210 | action = random.randint(0, num_actions - 1) 211 | else: 212 | obs = replay_buffer.encode_recent_observation() 213 | action = session.run(best_action, feed_dict = {obs_t_ph: [obs]}) 214 | if random.random() < exploration.value(t) * num_actions / (num_actions - 1): 215 | action = random.randint(0, num_actions - 1) 216 | 217 | next_obs, reward, done, _ = env.step(action) 218 | replay_buffer.store_effect(idx, action, reward, done) 219 | if done: 220 | last_obs = env.reset() 221 | else: 222 | last_obs = next_obs 223 | 224 | ##### 225 | 226 | # at this point, the environment should have been advanced one step (and 227 | # reset if done was true), and last_obs should point to the new latest 228 | # observation 229 | 230 | ### 3. Perform experience replay and train the network. 231 | # note that this is only done if the replay buffer contains enough samples 232 | # for us to learn something useful -- until then, the model will not be 233 | # initialized and random actions should be taken 234 | if (t > learning_starts and 235 | t % learning_freq == 0 and 236 | replay_buffer.can_sample(batch_size)): 237 | # Here, you should perform training. Training consists of four steps: 238 | # 3.a: use the replay buffer to sample a batch of transitions (see the 239 | # replay buffer code for function definition, each batch that you sample 240 | # should consist of current observations, current actions, rewards, 241 | # next observations, and done indicator). 242 | # 3.b: initialize the model if it has not been initialized yet; to do 243 | # that, call 244 | # initialize_interdependent_variables(session, tf.global_variables(), { 245 | # obs_t_ph: obs_t_batch, 246 | # obs_tp1_ph: obs_tp1_batch, 247 | # }) 248 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 249 | # the current and next time step. The boolean variable model_initialized 250 | # indicates whether or not the model has been initialized. 251 | # Remember that you have to update the target network too (see 3.d)! 252 | # 3.c: train the model. To do this, you'll need to use the train_fn and 253 | # total_error ops that were created earlier: total_error is what you 254 | # created to compute the total Bellman error in a batch, and train_fn 255 | # will actually perform a gradient step and update the network parameters 256 | # to reduce total_error. 
When calling session.run on these you'll need to 257 | # populate the following placeholders: 258 | # obs_t_ph 259 | # act_t_ph 260 | # rew_t_ph 261 | # obs_tp1_ph 262 | # done_mask_ph 263 | # (this is needed for computing total_error) 264 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 265 | # (this is needed by the optimizer to choose the learning rate) 266 | # 3.d: periodically update the target network by calling 267 | # session.run(update_target_fn) 268 | # you should update every target_update_freq steps, and you may find the 269 | # variable num_param_updates useful for this (it was initialized to 0) 270 | ##### 271 | obs_batch, act_batch, rew_batch, next_obs_batch, done_batch = replay_buffer.sample(batch_size) 272 | 273 | if not model_initialized: 274 | initialize_interdependent_variables(session, tf.global_variables(), { 275 | obs_t_ph: obs_batch, 276 | obs_tp1_ph: next_obs_batch, 277 | }) 278 | model_initialized = True 279 | 280 | session.run(train_fn, { 281 | obs_t_ph: obs_batch, 282 | act_t_ph: act_batch, 283 | rew_t_ph: rew_batch, 284 | obs_tp1_ph: next_obs_batch, 285 | done_mask_ph: done_batch, 286 | learning_rate: optimizer_spec.lr_schedule.value(t) 287 | }) 288 | 289 | num_param_updates += 1 290 | if num_param_updates % target_update_freq == 0: 291 | session.run(update_target_fn) 292 | 293 | ##### 294 | 295 | ### 4. Log progress 296 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 297 | if len(episode_rewards) > 0: 298 | mean_episode_reward = np.mean(episode_rewards[-100:]) 299 | if len(episode_rewards) > 100: 300 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 301 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 302 | print("Timestep %d" % (t,)) 303 | print("mean reward (100 episodes) %f" % mean_episode_reward) 304 | print("best mean reward %f" % best_mean_episode_reward) 305 | print("episodes %d" % len(episode_rewards)) 306 | print("exploration %f" % exploration.value(t)) 307 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 308 | sys.stdout.flush() 309 | -------------------------------------------------------------------------------- /hw1/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | #import builtins 4 | import functools 5 | import copy 6 | import os 7 | import collections 8 | 9 | # ================================================================ 10 | # Import all names into common namespace 11 | # ================================================================ 12 | 13 | clip = tf.clip_by_value 14 | 15 | # Make consistent with numpy 16 | # ---------------------------------------- 17 | 18 | def sum(x, axis=None, keepdims=False): 19 | return tf.reduce_sum(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 20 | def mean(x, axis=None, keepdims=False): 21 | return tf.reduce_mean(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 22 | def var(x, axis=None, keepdims=False): 23 | meanx = mean(x, axis=axis, keepdims=keepdims) 24 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 25 | def std(x, axis=None, keepdims=False): 26 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 27 | def max(x, axis=None, keepdims=False): 28 | return tf.reduce_max(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 29 | def min(x, axis=None, keepdims=False): 30 | return 
tf.reduce_min(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 31 | def concatenate(arrs, axis=0): 32 | return tf.concat(axis, arrs) 33 | def argmax(x, axis=None): 34 | return tf.argmax(x, dimension=axis) 35 | 36 | def switch(condition, then_expression, else_expression): 37 | '''Switches between two operations depending on a scalar value (int or bool). 38 | Note that both `then_expression` and `else_expression` 39 | should be symbolic tensors of the *same shape*. 40 | 41 | # Arguments 42 | condition: scalar tensor. 43 | then_expression: TensorFlow operation. 44 | else_expression: TensorFlow operation. 45 | ''' 46 | x_shape = copy.copy(then_expression.get_shape()) 47 | x = tf.cond(tf.cast(condition, 'bool'), 48 | lambda: then_expression, 49 | lambda: else_expression) 50 | x.set_shape(x_shape) 51 | return x 52 | 53 | # Extras 54 | # ---------------------------------------- 55 | def l2loss(params): 56 | if len(params) == 0: 57 | return tf.constant(0.0) 58 | else: 59 | return tf.add_n([sum(tf.square(p)) for p in params]) 60 | def lrelu(x, leak=0.2): 61 | f1 = 0.5 * (1 + leak) 62 | f2 = 0.5 * (1 - leak) 63 | return f1 * x + f2 * abs(x) 64 | def categorical_sample_logits(X): 65 | # https://github.com/tensorflow/tensorflow/issues/456 66 | U = tf.random_uniform(tf.shape(X)) 67 | return argmax(X - tf.log(-tf.log(U)), axis=1) 68 | 69 | # ================================================================ 70 | # Global session 71 | # ================================================================ 72 | 73 | def get_session(): 74 | return tf.get_default_session() 75 | 76 | def single_threaded_session(): 77 | tf_config = tf.ConfigProto( 78 | inter_op_parallelism_threads=1, 79 | intra_op_parallelism_threads=1) 80 | return tf.Session(config=tf_config) 81 | 82 | def make_session(num_cpu): 83 | tf_config = tf.ConfigProto( 84 | inter_op_parallelism_threads=num_cpu, 85 | intra_op_parallelism_threads=num_cpu) 86 | return tf.Session(config=tf_config) 87 | 88 | 89 | ALREADY_INITIALIZED = set() 90 | def initialize(): 91 | new_variables = set(tf.all_variables()) - ALREADY_INITIALIZED 92 | get_session().run(tf.initialize_variables(new_variables)) 93 | ALREADY_INITIALIZED.update(new_variables) 94 | 95 | 96 | def eval(expr, feed_dict=None): 97 | if feed_dict is None: feed_dict = {} 98 | return get_session().run(expr, feed_dict=feed_dict) 99 | 100 | def set_value(v, val): 101 | get_session().run(v.assign(val)) 102 | 103 | def load_state(fname): 104 | saver = tf.train.Saver() 105 | saver.restore(get_session(), fname) 106 | 107 | def save_state(fname): 108 | os.makedirs(os.path.dirname(fname), exist_ok=True) 109 | saver = tf.train.Saver() 110 | saver.save(get_session(), fname) 111 | 112 | # ================================================================ 113 | # Model components 114 | # ================================================================ 115 | 116 | 117 | def normc_initializer(std=1.0): 118 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 119 | out = np.random.randn(*shape).astype(np.float32) 120 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 121 | return tf.constant(out) 122 | return _initializer 123 | 124 | 125 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 126 | summary_tag=None): 127 | with tf.variable_scope(name): 128 | stride_shape = [1, stride[0], stride[1], 1] 129 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 130 
| 131 | # there are "num input feature maps * filter height * filter width" 132 | # inputs to each hidden unit 133 | fan_in = intprod(filter_shape[:3]) 134 | # each unit in the lower layer receives a gradient from: 135 | # "num output feature maps * filter height * filter width" / 136 | # pooling size 137 | fan_out = intprod(filter_shape[:2]) * num_filters 138 | # initialize weights with random weights 139 | w_bound = np.sqrt(6. / (fan_in + fan_out)) 140 | 141 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 142 | collections=collections) 143 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer, 144 | collections=collections) 145 | 146 | if summary_tag is not None: 147 | tf.image_summary(summary_tag, 148 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 149 | [2, 0, 1, 3]), 150 | max_images=10) 151 | 152 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 153 | 154 | 155 | def dense(x, size, name, weight_init=None, bias=True): 156 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 157 | ret = tf.matmul(x, w) 158 | if bias: 159 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer) 160 | return ret + b 161 | else: 162 | return ret 163 | 164 | def wndense(x, size, name, init_scale=1.0): 165 | v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], 166 | initializer=tf.random_normal_initializer(0, 0.05)) 167 | g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale)) 168 | b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0)) 169 | 170 | # use weight normalization (Salimans & Kingma, 2016) 171 | x = tf.matmul(x, v) 172 | scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) 173 | return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) 174 | 175 | def densenobias(x, size, name, weight_init=None): 176 | return dense(x, size, name, weight_init=weight_init, bias=False) 177 | 178 | def dropout(x, pkeep, phase=None, mask=None): 179 | mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask 180 | if phase is None: 181 | return mask * x 182 | else: 183 | return switch(phase, mask*x, pkeep*x) 184 | 185 | def batchnorm(x, name, phase, updates, gamma=0.96): 186 | k = x.get_shape()[1] 187 | runningmean = tf.get_variable(name+"/mean", shape=[1, k], initializer=tf.constant_initializer(0.0), trainable=False) 188 | runningvar = tf.get_variable(name+"/var", shape=[1, k], initializer=tf.constant_initializer(1e-4), trainable=False) 189 | testy = (x - runningmean) / tf.sqrt(runningvar) 190 | 191 | mean_ = mean(x, axis=0, keepdims=True) 192 | var_ = mean(tf.square(x), axis=0, keepdims=True) 193 | std = tf.sqrt(var_) 194 | trainy = (x - mean_) / std 195 | 196 | updates.extend([ 197 | tf.assign(runningmean, runningmean * gamma + mean_ * (1 - gamma)), 198 | tf.assign(runningvar, runningvar * gamma + var_ * (1 - gamma)) 199 | ]) 200 | 201 | y = switch(phase, trainy, testy) 202 | 203 | out = y * tf.get_variable(name+"/scaling", shape=[1, k], initializer=tf.constant_initializer(1.0), trainable=True)\ 204 | + tf.get_variable(name+"/translation", shape=[1,k], initializer=tf.constant_initializer(0.0), trainable=True) 205 | return out 206 | 207 | 208 | 209 | # ================================================================ 210 | # Basic Stuff 211 | # ================================================================ 212 | 213 | def function(inputs, outputs, 
updates=None, givens=None): 214 | if isinstance(outputs, list): 215 | return _Function(inputs, outputs, updates, givens=givens) 216 | elif isinstance(outputs, (dict, collections.OrderedDict)): 217 | f = _Function(inputs, outputs.values(), updates, givens=givens) 218 | return lambda *inputs : type(outputs)(zip(outputs.keys(), f(*inputs))) 219 | else: 220 | f = _Function(inputs, [outputs], updates, givens=givens) 221 | return lambda *inputs : f(*inputs)[0] 222 | 223 | class _Function(object): 224 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 225 | assert all(len(i.op.inputs)==0 for i in inputs), "inputs should all be placeholders" 226 | self.inputs = inputs 227 | updates = updates or [] 228 | self.update_group = tf.group(*updates) 229 | self.outputs_update = list(outputs) + [self.update_group] 230 | self.givens = {} if givens is None else givens 231 | self.check_nan = check_nan 232 | def __call__(self, *inputvals): 233 | assert len(inputvals) == len(self.inputs) 234 | feed_dict = dict(zip(self.inputs, inputvals)) 235 | feed_dict.update(self.givens) 236 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 237 | if self.check_nan: 238 | if any(np.isnan(r).any() for r in results): 239 | raise RuntimeError("Nan detected") 240 | return results 241 | 242 | def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): 243 | if isinstance(outputs, list): 244 | return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) 245 | else: 246 | f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) 247 | return lambda *inputs : f(*inputs)[0] 248 | 249 | class _MemFriendlyFunction(object): 250 | def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): 251 | self.nondata_inputs = nondata_inputs 252 | self.data_inputs = data_inputs 253 | self.outputs = list(outputs) 254 | self.batch_size = batch_size 255 | def __call__(self, *inputvals): 256 | assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs) 257 | nondata_vals = inputvals[0:len(self.nondata_inputs)] 258 | data_vals = inputvals[len(self.nondata_inputs):] 259 | feed_dict = dict(zip(self.nondata_inputs, nondata_vals)) 260 | n = data_vals[0].shape[0] 261 | for v in data_vals[1:]: 262 | assert v.shape[0] == n 263 | for i_start in range(0, n, self.batch_size): 264 | slice_vals = [v[i_start:min(i_start+self.batch_size, n)] for v in data_vals] 265 | for (var,val) in zip(self.data_inputs, slice_vals): 266 | feed_dict[var]=val 267 | results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict) 268 | if i_start==0: 269 | sum_results = results 270 | else: 271 | for i in range(len(results)): 272 | sum_results[i] = sum_results[i] + results[i] 273 | for i in range(len(results)): 274 | sum_results[i] = sum_results[i] / n 275 | return sum_results 276 | 277 | # ================================================================ 278 | # Modules 279 | # ================================================================ 280 | 281 | class Module(object): 282 | def __init__(self, name): 283 | self.name = name 284 | self.first_time = True 285 | self.scope = None 286 | self.cache = {} 287 | def __call__(self, *args): 288 | if args in self.cache: 289 | print("(%s) retrieving value from cache"%self.name) 290 | return self.cache[args] 291 | with tf.variable_scope(self.name, reuse=not self.first_time): 292 | scope = tf.get_variable_scope().name 293 | if self.first_time: 294 | self.scope = scope 295 | print("(%s) running function for the 
first time"%self.name) 296 | else: 297 | assert self.scope == scope, "Tried calling function with a different scope" 298 | print("(%s) running function on new inputs"%self.name) 299 | self.first_time = False 300 | out = self._call(*args) 301 | self.cache[args] = out 302 | return out 303 | def _call(self, *args): 304 | raise NotImplementedError 305 | 306 | @property 307 | def trainable_variables(self): 308 | assert self.scope is not None, "need to call module once before getting variables" 309 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 310 | 311 | @property 312 | def variables(self): 313 | assert self.scope is not None, "need to call module once before getting variables" 314 | return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope) 315 | 316 | 317 | def module(name): 318 | @functools.wraps 319 | def wrapper(f): 320 | class WrapperModule(Module): 321 | def _call(self, *args): 322 | return f(*args) 323 | return WrapperModule(name) 324 | return wrapper 325 | 326 | # ================================================================ 327 | # Graph traversal 328 | # ================================================================ 329 | 330 | VARIABLES = {} 331 | 332 | 333 | def get_parents(node): 334 | return node.op.inputs 335 | 336 | def topsorted(outputs): 337 | """ 338 | Topological sort via non-recursive depth-first search 339 | """ 340 | assert isinstance(outputs, (list,tuple)) 341 | marks = {} 342 | out = [] 343 | stack = [] #pylint: disable=W0621 344 | # i: node 345 | # jidx = number of children visited so far from that node 346 | # marks: state of each node, which is one of 347 | # 0: haven't visited 348 | # 1: have visited, but not done visiting children 349 | # 2: done visiting children 350 | for x in outputs: 351 | stack.append((x,0)) 352 | while stack: 353 | (i,jidx) = stack.pop() 354 | if jidx == 0: 355 | m = marks.get(i,0) 356 | if m == 0: 357 | marks[i] = 1 358 | elif m == 1: 359 | raise ValueError("not a dag") 360 | else: 361 | continue 362 | ps = get_parents(i) 363 | if jidx == len(ps): 364 | marks[i] = 2 365 | out.append(i) 366 | else: 367 | stack.append((i,jidx+1)) 368 | j = ps[jidx] 369 | stack.append((j,0)) 370 | return out 371 | 372 | 373 | # ================================================================ 374 | # Flat vectors 375 | # ================================================================ 376 | 377 | def var_shape(x): 378 | out = [k.value for k in x.get_shape()] 379 | assert all(isinstance(a, int) for a in out), \ 380 | "shape function assumes that shape is fully known" 381 | return out 382 | 383 | def numel(x): 384 | return intprod(var_shape(x)) 385 | 386 | def intprod(x): 387 | return int(np.prod(x)) 388 | 389 | def flatgrad(loss, var_list): 390 | grads = tf.gradients(loss, var_list) 391 | return tf.concat(0, [tf.reshape(grad, [numel(v)]) 392 | for (v, grad) in zip(var_list, grads)]) 393 | 394 | class SetFromFlat(object): 395 | def __init__(self, var_list, dtype=tf.float32): 396 | assigns = [] 397 | shapes = list(map(var_shape, var_list)) 398 | total_size = np.sum([intprod(shape) for shape in shapes]) 399 | 400 | self.theta = theta = tf.placeholder(dtype,[total_size]) 401 | start=0 402 | assigns = [] 403 | for (shape,v) in zip(shapes,var_list): 404 | size = intprod(shape) 405 | assigns.append(tf.assign(v, tf.reshape(theta[start:start+size],shape))) 406 | start+=size 407 | self.op = tf.group(*assigns) 408 | def __call__(self, theta): 409 | get_session().run(self.op, feed_dict={self.theta:theta}) 410 | 411 | class GetFlat(object): 
412 | def __init__(self, var_list): 413 | self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list]) 414 | def __call__(self): 415 | return get_session().run(self.op) 416 | 417 | # ================================================================ 418 | # Misc 419 | # ================================================================ 420 | 421 | 422 | def fancy_slice_2d(X, inds0, inds1): 423 | """ 424 | like numpy X[inds0, inds1] 425 | XXX this implementation is bad 426 | """ 427 | inds0 = tf.cast(inds0, tf.int64) 428 | inds1 = tf.cast(inds1, tf.int64) 429 | shape = tf.cast(tf.shape(X), tf.int64) 430 | ncols = shape[1] 431 | Xflat = tf.reshape(X, [-1]) 432 | return tf.gather(Xflat, inds0 * ncols + inds1) 433 | 434 | 435 | def scope_vars(scope, trainable_only): 436 | """ 437 | Get variables inside a scope 438 | The scope can be specified as a string 439 | """ 440 | return tf.get_collection( 441 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, 442 | scope=scope if isinstance(scope, str) else scope.name 443 | ) 444 | 445 | def lengths_to_mask(lengths_b, max_length): 446 | """ 447 | Turns a vector of lengths into a boolean mask 448 | 449 | Args: 450 | lengths_b: an integer vector of lengths 451 | max_length: maximum length to fill the mask 452 | 453 | Returns: 454 | a boolean array of shape (batch_size, max_length) 455 | row[i] consists of True repeated lengths_b[i] times, followed by False 456 | """ 457 | lengths_b = tf.convert_to_tensor(lengths_b) 458 | assert lengths_b.get_shape().ndims == 1 459 | mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) 460 | return mask_bt 461 | 462 | 463 | def in_session(f): 464 | @functools.wraps(f) 465 | def newfunc(*args, **kwargs): 466 | with tf.Session(): 467 | f(*args, **kwargs) 468 | return newfunc 469 | 470 | 471 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 472 | def get_placeholder(name, dtype, shape): 473 | print("calling get_placeholder", name) 474 | if name in _PLACEHOLDER_CACHE: 475 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 476 | assert dtype1==dtype and shape1==shape 477 | return out 478 | else: 479 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 480 | _PLACEHOLDER_CACHE[name] = (out,dtype,shape) 481 | return out 482 | def get_placeholder_cached(name): 483 | return _PLACEHOLDER_CACHE[name][0] 484 | 485 | def flattenallbut0(x): 486 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 487 | 488 | def reset(): 489 | global _PLACEHOLDER_CACHE 490 | global VARIABLES 491 | _PLACEHOLDER_CACHE = {} 492 | VARIABLES = {} 493 | tf.reset_default_graph() 494 | --------------------------------------------------------------------------------
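The docstrings in dqn_utils.py above spell out the intended calling pattern for the replay buffer (`store_frame`, then `encode_recent_observation`, then `store_effect`) and for the exploration schedules, but the repo ships no standalone usage example. The sketch below is illustrative only and is not a file in this repository: the environment id, buffer size, and step count are arbitrary assumptions, the old four-tuple `env.step` Gym API targeted by this codebase is assumed, and a random policy stands in for the Q-network that dqn.py builds.

# Illustrative sketch only (not part of the assignment code): how the ReplayBuffer
# and schedule utilities are meant to be driven from an environment loop.
import gym
from dqn_utils import ReplayBuffer, PiecewiseSchedule

env = gym.make("Pong-v0")            # assumption: any image-observation Atari env works here
buf = ReplayBuffer(size=1000, frame_history_len=4)
# anneal epsilon from 1.0 to 0.1 over the first 100k steps, then hold it at 0.1
exploration = PiecewiseSchedule([(0, 1.0), (100000, 0.1)], outside_value=0.1)

last_obs = env.reset()
for t in range(1000):
    # store the raw frame first, so encode_recent_observation can stack the last
    # frame_history_len frames (zero-padded right after a reset)
    idx = buf.store_frame(last_obs)
    stacked = buf.encode_recent_observation()   # shape (img_h, img_w, img_c * 4); Q-network input

    # a trained agent would act greedily w.r.t. Q(stacked, .) with probability
    # 1 - exploration.value(t); this sketch has no network, so it acts randomly
    action = env.action_space.sample()

    next_obs, reward, done, _ = env.step(action)   # old 4-tuple Gym API used by this code
    buf.store_effect(idx, action, reward, done)    # pairs the outcome with the stored frame idx
    last_obs = env.reset() if done else next_obs

    if buf.can_sample(32):
        obs_b, act_b, rew_b, next_obs_b, done_b = buf.sample(32)
        # these batches feed the Bellman-error and train ops constructed in dqn.py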