├── hw4 ├── data │ ├── mb_mpc_HalfCheetah-v1_25-06-2018_14-51-34 │ │ └── log.txt │ ├── mb_mpc_HalfCheetah-v1_25-06-2018_15-01-46 │ │ └── log.txt │ ├── mb_mpc_HalfCheetah-v1_25-06-2018_15-22-28 │ │ └── log.txt │ ├── mb_mpc_HalfCheetah-v1_25-06-2018_15-23-41 │ │ └── log.txt │ └── mb_mpc_HalfCheetah-v1_25-06-2018_15-28-11 │ │ └── log.txt ├── HW4.pdf ├── cheetah_env.py ├── controllers.py ├── cost_functions.py ├── logz.py ├── plot.py ├── dynamics.py └── main.py ├── hw1 ├── HW1.pdf ├── .DS_Store ├── experts │ ├── Ant-v1.pkl │ ├── Hopper-v1.pkl │ ├── Humanoid-v1.pkl │ ├── Reacher-v1.pkl │ ├── Walker2d-v1.pkl │ └── HalfCheetah-v1.pkl ├── DAgger.bash ├── demo.bash ├── README.md ├── load_policy.py ├── model.py ├── run_expert.py ├── DAgger.py └── tf_util.py ├── hw2 ├── HW2.pdf ├── hw2_final.pdf ├── logz.py ├── plot.py └── TestNoteBook.ipynb ├── hw3 ├── HW3.pdf ├── README ├── run_dqn_ram.py ├── run_dqn_atari.py ├── atari_wrappers.py ├── Testing.ipynb ├── dqn_utils.py └── dqn.py ├── sp17_hw ├── hw1 │ ├── experts │ │ ├── Ant-v1.pkl │ │ ├── Hopper-v1.pkl │ │ ├── Reacher-v1.pkl │ │ ├── Humanoid-v1.pkl │ │ ├── Walker2d-v1.pkl │ │ └── HalfCheetah-v1.pkl │ ├── demo.bash │ ├── README.md │ ├── run_expert.py │ └── load_policy.py ├── hw3 │ ├── README │ ├── run_dqn_ram.py │ ├── run_dqn_atari.py │ ├── atari_wrappers.py │ ├── dqn.py │ └── dqn_utils.py ├── hw4 │ ├── plot_learning_curves.py │ ├── logz.py │ ├── homework.md │ └── main.py └── hw2 │ ├── discrete_env.py │ └── frozen_lake.py ├── LICENSE ├── .gitignore └── README.md /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_14-51-34/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_15-01-46/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_15-22-28/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_15-23-41/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw4/data/mb_mpc_HalfCheetah-v1_25-06-2018_15-28-11/log.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /hw1/HW1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/HW1.pdf -------------------------------------------------------------------------------- /hw2/HW2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw2/HW2.pdf -------------------------------------------------------------------------------- /hw3/HW3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw3/HW3.pdf -------------------------------------------------------------------------------- /hw4/HW4.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw4/HW4.pdf -------------------------------------------------------------------------------- /hw1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/.DS_Store -------------------------------------------------------------------------------- /hw2/hw2_final.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw2/hw2_final.pdf -------------------------------------------------------------------------------- /hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Ant-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Ant-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Hopper-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Hopper-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Reacher-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Reacher-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/Humanoid-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Humanoid-v1.pkl -------------------------------------------------------------------------------- 
/sp17_hw/hw1/experts/Walker2d-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/Walker2d-v1.pkl -------------------------------------------------------------------------------- /sp17_hw/hw1/experts/HalfCheetah-v1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/louaaron/CS294_homework/HEAD/sp17_hw/hw1/experts/HalfCheetah-v1.pkl -------------------------------------------------------------------------------- /hw1/DAgger.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper Ant HalfCheetah Humanoid Reacher Walker2d 4 | do 5 | python DAgger.py experts/$e-v1.pkl $e-v2 --num_rollouts=5 6 | done 7 | -------------------------------------------------------------------------------- /hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper Ant HalfCheetah Humanoid Reacher Walker2d 4 | do 5 | python run_expert.py experts/$e-v1.pkl $e-v2 --num_rollouts=5 6 | done 7 | -------------------------------------------------------------------------------- /sp17_hw/hw1/demo.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | for e in Hopper-v1 Ant-v1 HalfCheetah-v1 Humanoid-v1 Reacher-v1 Walker2d-v1 4 | do 5 | python run_expert.py experts/$e.pkl $e --render --num_rollouts=1 6 | done 7 | -------------------------------------------------------------------------------- /hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/f17docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /sp17_hw/hw3/README: -------------------------------------------------------------------------------- 1 | See http://rll.berkeley.edu/deeprlcourse/docs/hw3.pdf for instructions 2 | 3 | The starter code was based on an implementation of Q-learning for Atari 4 | generously provided by Szymon Sidor from OpenAI 5 | 6 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym 4 | 5 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 6 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 7 | 8 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 9 | 10 | In `experts/`, the provided expert policies are: 11 | * Ant-v1.pkl 12 | * HalfCheetah-v1.pkl 13 | * Hopper-v1.pkl 14 | * Humanoid-v1.pkl 15 | * Reacher-v1.pkl 16 | * Walker2d-v1.pkl 17 | 18 | The name of the pickle file corresponds to the name of the gym environment. 
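As a quick illustration of how these policies are used, the sketch below loads one expert and rolls it out for a single episode, following the same pattern as `run_expert.py` (the `tf_util.initialize()` call and the `policy_fn(obs[None, :])` query are taken from that script; `Hopper-v1` is just one example environment): ~~~~ import gym import tensorflow as tf import tf_util import load_policy policy_fn = load_policy.load_policy('experts/Hopper-v1.pkl') with tf.Session(): tf_util.initialize() env = gym.make('Hopper-v1') # the env name matches the pickle file name obs = env.reset() done = False while not done: action = policy_fn(obs[None, :]) # expert action for the current observation obs, reward, done, _ = env.step(action) ~~~~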
19 | -------------------------------------------------------------------------------- /sp17_hw/hw1/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | Dependencies: TensorFlow, MuJoCo version 1.31, OpenAI Gym 4 | 5 | **Note**: MuJoCo versions until 1.5 do not support NVMe disks therefore won't be compatible with recent Mac machines. 6 | There is a request for OpenAI to support it that can be followed [here](https://github.com/openai/gym/issues/638). 7 | 8 | The only file that you need to look at is `run_expert.py`, which is code to load up an expert policy, run a specified number of roll-outs, and save out data. 9 | 10 | In `experts/`, the provided expert policies are: 11 | * Ant-v1.pkl 12 | * HalfCheetah-v1.pkl 13 | * Hopper-v1.pkl 14 | * Humanoid-v1.pkl 15 | * Reacher-v1.pkl 16 | * Walker2d-v1.pkl 17 | 18 | The name of the pickle file corresponds to the name of the gym environment. 19 | -------------------------------------------------------------------------------- /sp17_hw/hw4/plot_learning_curves.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | parser = argparse.ArgumentParser() 3 | parser.add_argument("expdir", help="experiment dir, e.g., /tmp/experiments") 4 | args = parser.parse_args() 5 | 6 | from pylab import * 7 | import os 8 | from os.path import join 9 | 10 | dirnames = os.listdir(args.expdir) 11 | 12 | fig, axes = subplots(4) 13 | for dirname in dirnames: 14 | print(dirname) 15 | A = np.genfromtxt(join(args.expdir, dirname, 'log.txt'),delimiter='\t',dtype=None, names=True) 16 | # axes[0].plot(scipy.signal.savgol_filter(A['EpRewMean'] , 21, 3), '-x') 17 | x = A['TimestepsSoFar'] 18 | axes[0].plot(x, A['EpRewMean'], '-x') 19 | axes[1].plot(x, A['KLOldNew'], '-x') 20 | axes[2].plot(x, A['Entropy'], '-x') 21 | axes[3].plot(x, A['EVBefore'], '-x') 22 | legend(dirnames,loc='best').draggable() 23 | axes[0].set_ylabel("EpRewMean") 24 | axes[1].set_ylabel("KLOldNew") 25 | axes[2].set_ylabel("Entropy") 26 | axes[3].set_ylabel("EVBefore") 27 | axes[3].set_ylim(-1,1) 28 | axes[-1].set_xlabel("Iterations") 29 | show() 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 berkeleydeeprlcourse 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /hw4/cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnvNew(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 8 | utils.EzPickle.__init__(self) 9 | 10 | def _step(self, action): 11 | xposbefore = self.model.data.qpos[0, 0] 12 | self.do_simulation(action, self.frame_skip) 13 | xposafter = self.model.data.qpos[0, 0] 14 | ob = self._get_obs() 15 | reward_ctrl = - 0.1 * np.square(action).sum() 16 | reward_run = (xposafter - xposbefore)/self.dt 17 | reward = reward_ctrl + reward_run 18 | done = False 19 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 20 | 21 | def _get_obs(self): 22 | return np.concatenate([ 23 | self.model.data.qpos.flat[1:], 24 | self.model.data.qvel.flat, 25 | self.get_body_com("torso").flat, 26 | # self.get_body_comvel("torso").flat, 27 | ]) 28 | 29 | def reset_model(self): 30 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 31 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 32 | self.set_state(qpos, qvel) 33 | return self._get_obs() 34 | 35 | def viewer_setup(self): 36 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | hw1/behavior_cloning/ 2 | hw1/DAgger/ 3 | hw2/data/ 4 | 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *,cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # IPython Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | 92 | # Rope project settings 93 | .ropeproject 94 | -------------------------------------------------------------------------------- /sp17_hw/hw2/discrete_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env, spaces 4 | from gym.utils import seeding 5 | 6 | def categorical_sample(prob_n, np_random): 7 | """ 8 | Sample from categorical distribution 9 | Each row specifies class probabilities 10 | """ 11 | prob_n = np.asarray(prob_n) 12 | csprob_n = np.cumsum(prob_n) 13 | return (csprob_n > np_random.rand()).argmax() 14 | 15 | 16 | class DiscreteEnv(Env): 17 | 18 | """ 19 | Has the following members 20 | - nS: number of states 21 | - nA: number of actions 22 | - P: transitions (*) 23 | - isd: initial state distribution (**) 24 | 25 | (*) dictionary dict of dicts of lists, where 26 | P[s][a] == [(probability, nextstate, reward, done), ...] 
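    For illustration, a hypothetical 2-state, 1-action MDP could be encoded as
    P = {0: {0: [(0.9, 0, 0.0, False), (0.1, 1, 1.0, True)]}, 1: {0: [(1.0, 1, 0.0, True)]}}
    with isd = [1.0, 0.0].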
27 | (**) list or array of length nS 28 | 29 | 30 | """ 31 | def __init__(self, nS, nA, P, isd): 32 | self.P = P 33 | self.isd = isd 34 | self.lastaction=None # for rendering 35 | self.nS = nS 36 | self.nA = nA 37 | 38 | self.action_space = spaces.Discrete(self.nA) 39 | self.observation_space = spaces.Discrete(self.nS) 40 | 41 | self._seed() 42 | self._reset() 43 | 44 | def _seed(self, seed=None): 45 | self.np_random, seed = seeding.np_random(seed) 46 | return [seed] 47 | 48 | def _reset(self): 49 | self.s = categorical_sample(self.isd, self.np_random) 50 | self.lastaction=None 51 | return self.s 52 | 53 | def _step(self, a): 54 | transitions = self.P[self.s][a] 55 | i = categorical_sample([t[0] for t in transitions], self.np_random) 56 | p, s, r, d= transitions[i] 57 | self.s = s 58 | self.lastaction=a 59 | return (s, r, d, {"prob" : p}) 60 | -------------------------------------------------------------------------------- /hw4/controllers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from cost_functions import trajectory_cost_fn 3 | import time 4 | 5 | class Controller(): 6 | def __init__(self): 7 | pass 8 | 9 | # Get the appropriate action(s) for this state(s) 10 | def get_action(self, state): 11 | pass 12 | 13 | 14 | class RandomController(Controller): 15 | def __init__(self, env): 16 | self.env = env 17 | 18 | def get_action(self, state): 19 | """ Your code should randomly sample an action uniformly from the action space """ 20 | return self.env.action_space.sample() 21 | 22 | 23 | class MPCcontroller(Controller): 24 | """ Controller built using the MPC method outlined in https://arxiv.org/abs/1708.02596 """ 25 | def __init__(self, 26 | env, 27 | dyn_model, 28 | horizon=5, 29 | cost_fn=None, 30 | num_simulated_paths=10, 31 | ): 32 | self.env = env 33 | self.dyn_model = dyn_model 34 | self.horizon = horizon 35 | self.cost_fn = cost_fn 36 | self.num_simulated_paths = num_simulated_paths 37 | 38 | def get_action(self, state): 39 | """ Note: be careful to batch your simulations through the model for speed """ 40 | 41 | state_batch, states, next_states, actions = [], [], [], [] 42 | 43 | #state batches have dimension (K, dim(state)) 44 | for _ in range(self.num_simulated_paths): 45 | state_batch.append(state) 46 | 47 | for _ in range(self.horizon): 48 | action = [] 49 | for _ in range(self.num_simulated_paths): 50 | action.append(self.env.action_space.sample()) 51 | actions.append(action) 52 | states.append(state_batch) 53 | #use batch for speed 54 | state_batch = self.dyn_model.predict(np.array(state_batch), np.array(action)) 55 | 56 | next_states.append(state_batch) 57 | 58 | costs = trajectory_cost_fn(self.cost_fn, np.array(states), np.array(actions), np.array(next_states)) 59 | j_star = np.argmin(np.array(costs)) 60 | return actions[0][j_star] 61 | 62 | -------------------------------------------------------------------------------- /hw4/cost_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | #======================================================== 5 | # 6 | # Environment-specific cost functions: 7 | # 8 | 9 | def cheetah_cost_fn(state, action, next_state): 10 | if len(state.shape) > 1: 11 | 12 | heading_penalty_factor=10 13 | scores=np.zeros((state.shape[0],)) 14 | 15 | #dont move front shin back so far that you tilt forward 16 | front_leg = state[:,5] 17 | my_range = 0.2 18 | scores[front_leg>=my_range] += heading_penalty_factor 19 | 20 | 
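        # The same heading penalty is applied below when the front shin or front foot
        # angle is >= 0; the final term subtracts the change in state[:, 17] (the torso
        # x-position appended by cheetah_env.py's _get_obs) divided by 0.01, so forward
        # progress lowers the cost.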
front_shin = state[:,6] 21 | my_range = 0 22 | scores[front_shin>=my_range] += heading_penalty_factor 23 | 24 | front_foot = state[:,7] 25 | my_range = 0 26 | scores[front_foot>=my_range] += heading_penalty_factor 27 | 28 | scores-= (next_state[:,17] - state[:,17]) / 0.01 #+ 0.1 * (np.sum(action**2, axis=1)) 29 | return scores 30 | 31 | heading_penalty_factor=10 32 | score = 0 33 | 34 | #dont move front shin back so far that you tilt forward 35 | front_leg = state[5] 36 | my_range = 0.2 37 | if front_leg>=my_range: 38 | score += heading_penalty_factor 39 | 40 | front_shin = state[6] 41 | my_range = 0 42 | if front_shin>=my_range: 43 | score += heading_penalty_factor 44 | 45 | front_foot = state[7] 46 | my_range = 0 47 | if front_foot>=my_range: 48 | score += heading_penalty_factor 49 | 50 | score -= (next_state[17] - state[17]) / 0.01 #+ 0.1 * (np.sum(action**2)) 51 | return score 52 | 53 | #======================================================== 54 | # 55 | # Cost function for a whole trajectory: 56 | # 57 | 58 | def trajectory_cost_fn(cost_fn, states, actions, next_states): 59 | trajectory_cost = 0 60 | for i in range(len(actions)): 61 | trajectory_cost += cost_fn(states[i], actions[i], next_states[i]) 62 | return trajectory_cost -------------------------------------------------------------------------------- /sp17_hw/hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | 19 | def main(): 20 | import argparse 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('expert_policy_file', type=str) 23 | parser.add_argument('envname', type=str) 24 | parser.add_argument('--render', action='store_true') 25 | parser.add_argument("--max_timesteps", type=int) 26 | parser.add_argument('--num_rollouts', type=int, default=20, 27 | help='Number of expert roll outs') 28 | args = parser.parse_args() 29 | 30 | print('loading and building expert policy') 31 | policy_fn = load_policy.load_policy(args.expert_policy_file) 32 | print('loaded and built') 33 | 34 | with tf.Session(): 35 | tf_util.initialize() 36 | 37 | import gym 38 | env = gym.make(args.envname) 39 | max_steps = args.max_timesteps or env.spec.timestep_limit 40 | 41 | returns = [] 42 | observations = [] 43 | actions = [] 44 | for i in range(args.num_rollouts): 45 | print('iter', i) 46 | obs = env.reset() 47 | done = False 48 | totalr = 0. 
49 | steps = 0 50 | while not done: 51 | action = policy_fn(obs[None,:]) 52 | observations.append(obs) 53 | actions.append(action) 54 | obs, r, done, _ = env.step(action) 55 | totalr += r 56 | steps += 1 57 | if args.render: 58 | env.render() 59 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 60 | if steps >= max_steps: 61 | break 62 | returns.append(totalr) 63 | 64 | print('returns', returns) 65 | print('mean return', np.mean(returns)) 66 | print('std of return', np.std(returns)) 67 | 68 | expert_data = {'observations': np.array(observations), 69 | 'actions': np.array(actions)} 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 
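        # The expert's Standardizer stores the running mean E[x] and mean-square E[x^2] of
        # observations; the stdev below is sqrt(max(0, E[x^2] - E[x]^2)), and observations are
        # whitened as (obs - mean) / (stdev + 1e-6) before being fed to the hidden layers.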
33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /sp17_hw/hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 
33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from sklearn.utils import shuffle 4 | 5 | class Model(object): 6 | def __init__(self, input_data, output_data, name, type, batch_size = 64): 7 | self.name = type + "/" + name 8 | self.batch_size = batch_size 9 | self.input_data = input_data 10 | self.output_data = np.reshape(output_data, (output_data.shape[0], output_data.shape[2])) 11 | self.sess = tf.Session() 12 | 13 | self.input_shape = [self.batch_size, self.input_data.shape[-1]] 14 | self.output_shape = [self.batch_size, self.output_data.shape[-1]] 15 | 16 | self.input_placeholder = tf.placeholder(tf.float32, shape = self.input_shape) 17 | self.output_placeholder_true = tf.placeholder(tf.float32, shape = self.output_shape) 18 | self.output_placeholder_false = self.build_model(self.input_placeholder) 19 | 20 | self.loss = tf.reduce_mean(tf.nn.l2_loss(self.output_placeholder_true - self.output_placeholder_false)) 21 | self.loss_summary = tf.summary.scalar("loss", self.loss) 22 | 23 | def build_model(self, input_placeholder): 24 | x = tf.layers.dense(input_placeholder, 64, activation = tf.nn.tanh) 25 | x = tf.layers.dense(x, 32, activation = tf.nn.tanh) 26 | x = tf.layers.dense(x, self.output_shape[-1]) 27 | return x 28 | 29 | def train(self, epochs = 20, train_data = None, test_data = None, number = None): 30 | if train_data is None and test_data is None : 31 | train_data, test_data = shuffle(self.input_data, self.output_data, random_state = 0) 32 | else: 33 | test_data = np.reshape(test_data, (test_data.shape[0], test_data.shape[2])) 34 | 35 | optimizer = tf.train.AdamOptimizer().minimize(self.loss) 36 | 37 | saver = tf.train.Saver() 38 | 39 | batch_idxs = len(train_data) // self.batch_size 40 | 41 | if number is None: 42 | writer = tf.summary.FileWriter(self.name) 43 | else: 44 | writer = tf.summary.FileWriter(self.name + str(number)) 45 | writer.add_graph(self.sess.graph) 46 | 47 | init_op = tf.global_variables_initializer() 48 | self.sess.run(init_op) 49 | 50 | for epoch in range(epochs): 51 | for idx in range(batch_idxs): 52 | batch_train = train_data[idx * self.batch_size : (idx + 1) * self.batch_size] 53 | 
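                # Despite the names, train_data holds observations and test_data holds the
                # corresponding target actions, so each batch pairs an observation slice with
                # its action slice by index.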
batch_value = test_data[idx * self.batch_size : (idx + 1) * self.batch_size] 54 | feed_train = {self.input_placeholder : batch_train, self.output_placeholder_true : batch_value} 55 | self.sess.run(optimizer, feed_dict = feed_train) 56 | 57 | if idx % 20 == 0: 58 | loss_value = self.sess.run(self.loss_summary, feed_dict = feed_train) 59 | writer.add_summary(loss_value, epoch * batch_idxs + idx) 60 | 61 | saver.save(self.sess, self.name + "/behavior_cloning_model") 62 | 63 | def sample(self, input): 64 | output = self.sess.run(self.output_placeholder_false, feed_dict = {self.input_placeholder : np.repeat(input[None, :], 64, axis = 0)}) 65 | return output[0] 66 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CS 294-112 homework (offered in Fall of 2017) 2 | 3 | This is my github repo for homework for [CS294](http://rail.eecs.berkeley.edu/deeprlcourse-fa17/index.html) (offered in Fall 2017). I covered this course remotely (using lecture notes and videos) and implemented the coding parts of the homework. Below are synopses for what I implemented for each homework assignment. 4 | 5 | *Disclaimer: this code is for **educational purposes** only. Students taking current iterations of this course should refrain from copying this code, as that would breach academic integrity and hamper their own education.* 6 | 7 | ## Dependencies 8 | 9 | * [Tensorflow 1.4](https://www.tensorflow.org/) 10 | * [Numpy 1.13.3](http://www.numpy.org/) 11 | * [Gym 0.10.5](https://gym.openai.com/) Gym 0.9.5 was used for homework 3. 12 | * [Mujoco 1.5](http://www.mujoco.org/) 13 | 14 | Note that some of these dependencies were not released at the time of this course. Furthermore, the starter code has been modified to reflect changes in OpenAI Gym's documentation. 15 | 16 | ## Homework 1 17 | 18 | The course, up to this point, has covered more basic supervised learning. I implemented BC (behavior cloning) and DAgger (Dataset Aggregation), which improved the results (slightly). I also experimented with various hyperparameters. 19 | 20 | ## Homework 2 21 | 22 | I implemented the policy gradient algorithm and ran some tests on various environments. I played with the hyperparameters and saw that my implementation caused the agent's reward to converge to the theoretical value. I also implemented GAE (generalized advantage estimation) and compared its results. 23 | 24 | ## Homework 3 25 | 26 | I implemented the DQN algorithm and ran it on the Atari Pong simulator. I experimented with different hyperparameters and saw that my model converged to the perfect value. 27 | 28 | ## Homework 4 29 | 30 | I implemented the MPC algorithm. 
However, I was unable to run the provided HalfCheetahEnvNew as it threw 31 | 32 | ~~~~ 33 | 'mujoco_py.cymj.PyMjModel' object has no attribute 'data' 34 | ~~~~ 35 | 36 | Furthermore, when I attempted to work with the given 'HalfCheetah-v2' environment, which (in terms of raw code) is isomorphic to HalfCheetahEnvNew, the state dimensions representing 37 | 38 | ~~~~ 39 | - rootx slider position (m) 40 | - rootz slider position (m) 41 | - rooty hinge angle (rad) 42 | - bthigh hinge angle (rad) 43 | - bshin hinge angle (rad) 44 | - bfoot hinge angle (rad) 45 | - fthigh hinge angle (rad) 46 | - fshin hinge angle (rad) 47 | - ffoot hinge angle (rad) 48 | - rootx slider velocity (m/s) 49 | - rootz slider velocity (m/s) 50 | - rooty hinge angular velocity (rad/s) 51 | - bthigh hinge angular velocity (rad/s) 52 | - bshin hinge angular velocity (rad/s) 53 | - bfoot hinge angular velocity (rad/s) 54 | - fthigh hinge angular velocity (rad/s) 55 | - fshin hinge angular velocity (rad/s) 56 | - ffoot hinge angular velocity (rad/s) 57 | ~~~~ 58 | 59 | aren't correctly reflected in the cost function (the comments about what each component represents don't match up). Moreover, for some strange reason, all HalfCheetah environments load 17 observation variables, not 18. -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import pickle 13 | import tensorflow as tf 14 | import numpy as np 15 | import tf_util 16 | import gym 17 | import load_policy 18 | import model 19 | 20 | def main(): 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('expert_policy_file', type=str) 24 | parser.add_argument('envname', type=str) 25 | parser.add_argument('--render', action='store_true') 26 | parser.add_argument("--max_timesteps", type=int) 27 | parser.add_argument('--num_rollouts', type=int, default=20, 28 | help='Number of expert roll outs') 29 | args = parser.parse_args() 30 | 31 | print('loading and building expert policy') 32 | policy_fn = load_policy.load_policy(args.expert_policy_file) 33 | print('loaded and built') 34 | 35 | with tf.Session(): 36 | tf_util.initialize() 37 | 38 | import gym 39 | env = gym.make(args.envname) 40 | max_steps = args.max_timesteps or env.spec.timestep_limit 41 | 42 | returns = [] 43 | observations = [] 44 | actions = [] 45 | for i in range(args.num_rollouts): 46 | print('iter', i) 47 | obs = env.reset() 48 | done = False 49 | totalr = 0.
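            # Roll out the expert: query policy_fn on the current observation, step the
            # environment, and record each (observation, action) pair for behavioral cloning.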
50 | steps = 0 51 | while not done: 52 | action = policy_fn(obs[None,:]) 53 | observations.append(obs) 54 | actions.append(action) 55 | obs, r, done, _ = env.step(action) 56 | totalr += r 57 | steps += 1 58 | if args.render: 59 | env.render() 60 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 61 | if steps >= max_steps: 62 | break 63 | returns.append(totalr) 64 | 65 | print('returns', returns) 66 | print('mean return', np.mean(returns)) 67 | print('std of return', np.std(returns)) 68 | 69 | expert_data = {'observations': np.array(observations), 70 | 'actions': np.array(actions)} 71 | 72 | print('observation shape', expert_data['observations'].shape) 73 | print('action shape', expert_data['actions'].shape) 74 | 75 | our_model = model.Model(expert_data['observations'], expert_data['actions'], args.envname[:-3], "behavior_cloning") 76 | our_model.train() 77 | 78 | for i in range(5): 79 | obs = env.reset() 80 | done = False 81 | totalr = 0. 82 | steps = 0 83 | while not done: 84 | action = our_model.sample(obs) 85 | obs, r, done, _ = env.step(action) 86 | totalr += r 87 | steps += 1 88 | env.render() 89 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 90 | if steps >= max_steps: 91 | break 92 | 93 | if __name__ == '__main__': 94 | main() 95 | -------------------------------------------------------------------------------- /sp17_hw/hw4/logz.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | Some simple logging functionality, inspired by rllab's logging. 4 | Assumes that each diagnostic gets logged each iteration 5 | 6 | Call logz.configure_output_dir() to start logging to a 7 | tab-separated-values file (some_folder_name/log.txt) 8 | 9 | To load the learning curves, you can do, for example 10 | 11 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 12 | A['EpRewMean'] 13 | 14 | """ 15 | 16 | import os.path as osp, shutil, time, atexit, os, subprocess 17 | 18 | color2num = dict( 19 | gray=30, 20 | red=31, 21 | green=32, 22 | yellow=33, 23 | blue=34, 24 | magenta=35, 25 | cyan=36, 26 | white=37, 27 | crimson=38 28 | ) 29 | 30 | def colorize(string, color, bold=False, highlight=False): 31 | attr = [] 32 | num = color2num[color] 33 | if highlight: num += 10 34 | attr.append(str(num)) 35 | if bold: attr.append('1') 36 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 37 | 38 | class G: 39 | output_dir = None 40 | output_file = None 41 | first_row = True 42 | log_headers = [] 43 | log_current_row = {} 44 | 45 | def configure_output_dir(d=None): 46 | """ 47 | Set output directory to d, or to /tmp/somerandomnumber if d is None 48 | """ 49 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 50 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 51 | os.makedirs(G.output_dir) 52 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 53 | atexit.register(G.output_file.close) 54 | try: 55 | cmd = "cd %s && git diff > %s 2>/dev/null"%(osp.dirname(__file__), osp.join(G.output_dir, "a.diff")) 56 | subprocess.check_call(cmd, shell=True) # Save git diff to experiment directory 57 | except subprocess.CalledProcessError: 58 | print("configure_output_dir: not storing the git diff, probably because you're not in a git repo") 59 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 60 | 61 | def log_tabular(key, val): 62 | """ 63 | Log a value of some diagnostic 64 | Call this once for each diagnostic quantity, each iteration 65 | """ 66 | if G.first_row: 67 | G.log_headers.append(key) 68 | else: 69 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 70 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 71 | G.log_current_row[key] = val 72 | 73 | def dump_tabular(): 74 | """ 75 | Write all of the diagnostics from the current iteration 76 | """ 77 | vals = [] 78 | print("-"*37) 79 | for key in G.log_headers: 80 | val = G.log_current_row.get(key, "") 81 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 82 | else: valstr = val 83 | print("| %15s | %15s |"%(key, valstr)) 84 | vals.append(val) 85 | print("-"*37) 86 | if G.output_file is not None: 87 | if G.first_row: 88 | G.output_file.write("\t".join(G.log_headers)) 89 | G.output_file.write("\n") 90 | G.output_file.write("\t".join(map(str,vals))) 91 | G.output_file.write("\n") 92 | G.output_file.flush() 93 | G.log_current_row.clear() 94 | G.first_row=False -------------------------------------------------------------------------------- /hw1/DAgger.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import tensorflow as tf 3 | import numpy as np 4 | import tf_util 5 | import gym 6 | import load_policy 7 | import model 8 | 9 | def main(): 10 | import argparse 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('expert_policy_file', type=str) 13 | parser.add_argument('envname', type=str) 14 | parser.add_argument('--render', action='store_true') 15 | parser.add_argument("--max_timesteps", type=int) 16 | parser.add_argument('--num_rollouts', type=int, default=20, 17 | help='Number of expert roll outs') 18 | parser.add_argument('--DAgger_iter', type = int, default=5) 19 | args = parser.parse_args() 20 | 21 | print('loading and building expert policy') 22 | policy_fn = load_policy.load_policy(args.expert_policy_file) 23 | print('loaded and built') 24 | 25 | with tf.Session(): 26 | tf_util.initialize() 27 | import gym 28 | env = gym.make(args.envname) 29 | max_steps = args.max_timesteps or env.spec.timestep_limit 30 | 31 | returns = [] 32 | observations = [] 33 | actions = [] 34 | for i in range(args.num_rollouts): 35 | print('iter', i) 36 | obs = env.reset() 37 | done = False 38 | totalr = 0. 
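            # Initial data collection: roll out the expert and store its (observation, action)
            # pairs; the DAgger loop further down relabels states visited by the learned policy
            # with expert actions and aggregates them into the training set.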
39 | steps = 0 40 | while not done: 41 | action = policy_fn(obs[None,:]) 42 | observations.append(obs) 43 | actions.append(action) 44 | obs, r, done, _ = env.step(action) 45 | totalr += r 46 | steps += 1 47 | if args.render: 48 | env.render() 49 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 50 | if steps >= max_steps: 51 | break 52 | returns.append(totalr) 53 | 54 | print('returns', returns) 55 | print('mean return', np.mean(returns)) 56 | print('std of return', np.std(returns)) 57 | 58 | expert_data = {'observations': np.array(observations), 59 | 'actions': np.array(actions)} 60 | 61 | training_obs = expert_data['observations'] 62 | training_actions = expert_data['actions'] 63 | 64 | print('observation shape', expert_data['observations'].shape) 65 | print('action shape', expert_data['actions'].shape) 66 | 67 | our_model = model.Model(training_obs, training_actions, args.envname[:-3], 'DAgger') 68 | our_model.train() 69 | 70 | for i in range(args.DAgger_iter): 71 | new_obs = [] 72 | new_actions = [] 73 | obs = env.reset() 74 | done = False 75 | while not done: 76 | action = our_model.sample(obs) 77 | obs, _, done, _ = env.step(action) 78 | if args.render: 79 | env.render() 80 | corrected_action = policy_fn(obs[None, :]) 81 | new_obs.append(obs) 82 | new_actions.append(corrected_action) 83 | 84 | training_obs = np.concatenate((training_obs, obs[None, :]), axis = 0) 85 | training_actions = np.concatenate((training_actions, corrected_action[None, :]), axis = 0) 86 | our_model.train(train_data = np.array(new_obs), test_data = np.array(new_actions), number = i) 87 | 88 | if __name__ == '__main__': 89 | main() 90 | -------------------------------------------------------------------------------- /hw2/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw4/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | if osp.exists(G.output_dir): 55 | print("Log dir %s already exists! Delete it first or use a different dir"%G.output_dir) 56 | else: 57 | os.makedirs(G.output_dir) 58 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 59 | atexit.register(G.output_file.close) 60 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 61 | 62 | def log_tabular(key, val): 63 | """ 64 | Log a value of some diagnostic 65 | Call this once for each diagnostic quantity, each iteration 66 | """ 67 | if G.first_row: 68 | G.log_headers.append(key) 69 | else: 70 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 71 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 72 | G.log_current_row[key] = val 73 | 74 | def save_params(params): 75 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 76 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 77 | 78 | def pickle_tf_vars(): 79 | """ 80 | Saves tensorflow variables 81 | Requires them to be initialized first, also a default session must exist 82 | """ 83 | _dict = {v.name : v.eval() for v in tf.global_variables()} 84 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 85 | pickle.dump(_dict, f) 86 | 87 | 88 | def dump_tabular(): 89 | """ 90 | Write all of the diagnostics from the current iteration 91 | """ 92 | vals = [] 93 | key_lens = [len(key) for key in G.log_headers] 94 | max_key_len = max(15,max(key_lens)) 95 | keystr = '%'+'%d'%max_key_len 96 | fmt = "| " + keystr + "s | %15s |" 97 | n_slashes = 22 + max_key_len 98 | print("-"*n_slashes) 99 | for key in G.log_headers: 100 | val = G.log_current_row.get(key, "") 101 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 102 | else: valstr = val 103 | print(fmt%(key, valstr)) 104 | vals.append(val) 105 | print("-"*n_slashes) 106 | if G.output_file is not None: 107 | if G.first_row: 108 | G.output_file.write("\t".join(G.log_headers)) 109 | G.output_file.write("\n") 110 | G.output_file.write("\t".join(map(str,vals))) 111 | G.output_file.write("\n") 112 | G.output_file.flush() 113 | G.log_current_row.clear() 114 | G.first_row=False 115 | -------------------------------------------------------------------------------- /hw2/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
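    For example (with hypothetical directory names), two experiments could be plotted with
    custom legend titles via

        python plot.py data/test1 data/test2 --legend baseline tuned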
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /hw4/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. 
If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | plt.legend(loc='best').draggable() 57 | plt.show() 58 | 59 | 60 | def get_datasets(fpath, condition=None): 61 | unit = 0 62 | datasets = [] 63 | for root, dir, files in os.walk(fpath): 64 | if 'log.txt' in files: 65 | param_path = open(os.path.join(root,'params.json')) 66 | params = json.load(param_path) 67 | exp_name = params['exp_name'] 68 | 69 | log_path = os.path.join(root,'log.txt') 70 | experiment_data = pd.read_table(log_path) 71 | 72 | experiment_data.insert( 73 | len(experiment_data.columns), 74 | 'Unit', 75 | unit 76 | ) 77 | experiment_data.insert( 78 | len(experiment_data.columns), 79 | 'Condition', 80 | condition or exp_name 81 | ) 82 | 83 | datasets.append(experiment_data) 84 | unit += 1 85 | 86 | return datasets 87 | 88 | 89 | def main(): 90 | import argparse 91 | parser = argparse.ArgumentParser() 92 | parser.add_argument('logdir', nargs='*') 93 | parser.add_argument('--legend', nargs='*') 94 | parser.add_argument('--value', default='AverageReturn', nargs='*') 95 | args = parser.parse_args() 96 | 97 | use_legend = False 98 | if args.legend is not None: 99 | assert len(args.legend) == len(args.logdir), \ 100 | "Must give a legend title for each set of experiments." 101 | use_legend = True 102 | 103 | data = [] 104 | if use_legend: 105 | for logdir, legend_title in zip(args.logdir, args.legend): 106 | data += get_datasets(logdir, legend_title) 107 | else: 108 | for logdir in args.logdir: 109 | data += get_datasets(logdir) 110 | 111 | if isinstance(args.value, list): 112 | values = args.value 113 | else: 114 | values = [args.value] 115 | for value in values: 116 | plot_data(data, value=value) 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | 
(num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 
116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /sp17_hw/hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | (num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 
107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw4/dynamics.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import math 4 | 5 | # Predefined function to build a feedforward neural network 6 | def build_mlp(input_placeholder, 7 | output_size, 8 | scope, 9 | n_layers=2, 10 | size=500, 11 | activation=tf.tanh, 12 | output_activation=None 13 | ): 14 | out = input_placeholder 15 | with tf.variable_scope(scope): 16 | for _ in range(n_layers): 17 | out = tf.layers.dense(out, size, activation=activation) 18 | out = tf.layers.dense(out, output_size, activation=output_activation) 19 | return out 20 | 21 | class NNDynamicsModel(): 22 | def __init__(self, 23 | env, 24 | n_layers, 25 | size, 26 | activation, 27 | output_activation, 28 | normalization, 29 | batch_size, 30 | iterations, 31 | learning_rate, 32 | sess 33 | ): 34 | """ YOUR CODE HERE """ 35 | """ Note: Be careful about normalization """ 36 | self.mean_obs, self.std_obs, self.mean_deltas, self.std_deltas, self.mean_action, self.std_action = normalization 37 | self.sess = sess 38 | self.batch_size = batch_size 39 | self.iter = iterations 40 | 41 | self.state_dim = env.observation_space.shape[0] 42 | self.action_dim = env.action_space.shape[0] 43 | 44 | self.s_a_ph = tf.placeholder(shape = [None, self.state_dim + self.action_dim], name = 's_a_ph', dtype = tf.float32) 45 | self.delta_ph = tf.placeholder(shape = [None, self.state_dim], name = 'delta_ph', dtype = tf.float32) 46 | self.delta_pred = build_mlp(self.s_a_ph, self.state_dim, "dynamics", n_layers = n_layers, size = size, 47 | activation = activation, output_activation = output_activation) 48 | self.loss = tf.reduce_mean(tf.square(self.delta_pred - self.delta_ph)) 49 | self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss) 50 | 51 | def fit(self, data): 52 | """ 53 | Write a function to take in a dataset of (unnormalized)states, 54 | (unnormalized)actions, (unnormalized)next_states and fit the dynamics model going from normalized states, 55 | normalized actions to normalized state differences (s_t+1 - s_t) 56 | """ 57 | 58 | un_states = np.concatenate([d['observations'] for d in data]) 59 | un_actions = np.concatenate([d['actions'] for d in data]) 60 | un_states_next = np.concatenate([d['next_observations'] for d in data]) 61 | N = un_states.shape[0] 62 | indices = np.arange(N) 63 | 64 | n_states = (un_states - self.mean_obs) / (self.std_obs + 1e-7) 65 | n_deltas = ((un_states_next - un_states) - self.mean_deltas) / (self.std_deltas + 1e-7) 66 | n_actions = (un_actions - self.mean_action) / (self.std_action + 1e-7) 67 | 68 | state_action = np.concatenate((n_states, n_actions), axis = 1) 69 | 70 | for _ in range(self.iter): 71 | np.random.shuffle(indices) 72 | batches = int (math.ceil(N / self.batch_size)) 73 | for i in range(batches): 74 | start_idx = i * self.batch_size 75 | idxs = indices[start_idx : start_idx + self.batch_size] 76 | batch_s_a = state_action[idxs, :] 77 | batch_delta = 
n_deltas[idxs, :] 78 | self.sess.run(self.optimizer, feed_dict = {self.s_a_ph : batch_s_a, self.delta_ph : batch_delta}) 79 | 80 | def predict(self, states, actions): 81 | """ Write a function to take in a batch of (unnormalized) states and (unnormalized) 82 | actions and return the (unnormalized) next states as predicted by the model """ 83 | 84 | n_states = (states - self.mean_obs) / (self.std_obs + 1e-7) 85 | n_actions = (actions - self.mean_action) / (self.std_action + 1e-7) 86 | print(n_states.shape) 87 | print(n_actions.shape) 88 | state_action = np.concatenate((n_states, n_actions), axis = 1) 89 | 90 | expected_deltas = self.sess.run(self.delta_pred, feed_dict = {self.s_a_ph : state_action}) 91 | 92 | return expected_deltas * self.std_deltas + self.mean_deltas + states 93 | -------------------------------------------------------------------------------- /sp17_hw/hw4/homework.md: -------------------------------------------------------------------------------- 1 | # Homework 4 2 | 3 | In `main.py` you will find an implementation of a "vanilla" policy gradient method, applied to an MDP with a discrete action space: an episodic version of the classic "cartpole" task. First, make sure the provided code works on your computer by running `python main.py`. We recommend reading through all of the code and comments in the function `main_cartpole`, starting at the top of the function. 4 | 5 | The code computes some useful diagnostics, which you may find helpful to look at while tuning hyperparameters: 6 | 7 | - **KL[policy before update || policy after update]**. Large spikes in KL divergence mean that the optimization took a large step, and sometimes these spikes cause a collapse in performance. 8 | - **Entropy of the policy**. If entropy goes down too fast, then you may not explore enough, but if it goes down too slowly, you'll probably not reach optimal performance. 9 | - **Explained variance of the value function**. If the value function perfectly explains the returns, then it will be 1; if you get a negative result, then it's worse than predicting a constant. 10 | 11 | Software dependencies: 12 | 13 | - tensorflow 14 | - numpy + scipy (Anaconda recommended) 15 | - gym (I'm using 0.8.0, `pip install gym==0.8.0`, but old versions should work just as well) 16 | 17 | ## Problem 1 18 | 19 | Here you will modify the `main_cartpole` policy gradient implementation to work on a continuous action space, specifically, the gym environment `Pendulum-v0`. Note that in `main_cartpole`, the neural network outputs "logits" (i.e., log-probabilities up to an additive constant) that specify a categorical distribution. On the other hand, for the pendulum task, your neural network should output the mean of a Gaussian distribution, with a separate parameter vector parameterizing the log standard deviation. For example, you could use the following code: 20 | 21 | ```python 22 | 23 | mean_na = dense(h2, ac_dim, weight_init=normc_initializer(0.1)) # Mean control output 24 | logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer) # Log standard deviation 25 | 26 | sy_sampled_ac = YOUR_CODE_HERE 27 | sy_logprob_n = YOUR_CODE_HERE 28 | 29 | ``` 30 | 31 | You should also compute differential entropy (replacing `sy_ent`) and KL-divergence (`sy_kl`) for the Gaussian distribution.
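As a rough sketch (not the official solution), the missing pieces for a diagonal Gaussian policy could look like the snippet below. The names `sy_ac_na`, `sy_oldmean_na`, and `sy_oldlogstd_a` are hypothetical placeholders for the actions actually taken and the pre-update policy parameters; the snippet also assumes `numpy` is imported as `np` and that `ac_dim` is the action dimensionality.

```python
# Sketch only -- sy_ac_na, sy_oldmean_na, sy_oldlogstd_a are illustrative names,
# not part of the starter code.
sy_std_a = tf.exp(logstd_a)                                        # shape [ac_dim]
sy_sampled_ac = mean_na + sy_std_a * tf.random_normal(tf.shape(mean_na))

# Log-density of the taken actions under the current diagonal Gaussian
sy_logprob_n = (-0.5 * tf.reduce_sum(tf.square((sy_ac_na - mean_na) / sy_std_a), axis=1)
                - tf.reduce_sum(logstd_a) - 0.5 * ac_dim * np.log(2.0 * np.pi))

# Differential entropy of a diagonal Gaussian (independent of the mean)
sy_ent = tf.reduce_sum(logstd_a) + 0.5 * ac_dim * np.log(2.0 * np.pi * np.e)

# KL(old policy || new policy), averaged over the batch
sy_oldstd_a = tf.exp(sy_oldlogstd_a)
sy_kl = tf.reduce_mean(tf.reduce_sum(
    logstd_a - sy_oldlogstd_a
    + (tf.square(sy_oldstd_a) + tf.square(sy_oldmean_na - mean_na)) / (2.0 * tf.square(sy_std_a))
    - 0.5, axis=1))
```

Feeding `sy_oldmean_na` and `sy_oldlogstd_a` with values saved before the gradient step reproduces the KL[policy before update || policy after update] diagnostic described above.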
32 | 33 | The pendulum problem is slightly harder, and using a fixed stepsize does not work reliably---thus, we instead recommend using an adaptive stepsize, where you adjust it based on the KL divergence between the new and old policy. Code for this stepsize adaptation is provided. 34 | 35 | You can plot your results using the script `plot_learning_curves.py` or your own plotting code. 36 | 37 | **Deliverables** 38 | 39 | - Show a plot with the pendulum converging to EpRewMean of at least `-300`. Include EpRewMean, KL, Entropy in your plots. 40 | - Describe the hyperparameters used and how many timesteps your algorithm took to learn. 41 | 42 | ## Problem 2 43 | 44 | 1. Implement a neural network value function with the same interface as `LinearVF`. Add it to the provided cartpole solver, and compare the performance of the linear and neural network value function (i.e., baseline). 45 | 2. Perform the same comparison--linear vs neural network--for your pendulum solver from Problem 1. You should be able to obtain faster learning using the neural network. 46 | 47 | 48 | **Deliverables** 49 | 50 | - A comparison of linear vs neural network value function on the cartpole. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 51 | - A comparison of linear vs neural network value function on the pendulum. Show the value function's explained variance (EVBefore) and mean episode reward (EpRewMean). 52 | 53 | In both cases, list the hyperparameters used for neural network training. 54 | 55 | ## Problem 3 (bonus) 56 | 57 | Implement a more advanced policy gradient method from lecture (such as TRPO, or the advantage function estimator used in A3C or generalized advantage estimation), and apply it to the gym environment `Hopper-v1`. See if you can learn a good gait in less than 500,000 timesteps. 58 | Hint: it may help to standardize your inputs using a running estimate of mean and standard deviation. 59 | 60 | ob_rescaled = (ob_raw - mean) / (stdev + epsilon) 61 | 62 | **Deliverables** 63 | 64 | A description of what you implemented, and learning curves on the Hopper-v1 environment. 
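As a concrete illustration of the input-standardization hint in Problem 3, the helper below keeps running estimates of the observation mean and standard deviation. The class name and update rule are illustrative only and are not part of the starter code.

```python
import numpy as np

class RunningNorm:
    """Hypothetical helper: running mean/std of observations, used to rescale inputs."""
    def __init__(self, shape, epsilon=1e-8):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4          # small prior count to avoid division by zero
        self.epsilon = epsilon

    def update(self, obs_batch):
        obs_batch = np.asarray(obs_batch)
        batch_mean = obs_batch.mean(axis=0)
        batch_var = obs_batch.var(axis=0)
        batch_count = obs_batch.shape[0]
        delta = batch_mean - self.mean
        total = self.count + batch_count
        # Parallel mean/variance combination (Chan et al. style update)
        self.mean = self.mean + delta * batch_count / total
        m_a = self.var * self.count
        m_b = batch_var * batch_count
        self.var = (m_a + m_b + delta ** 2 * self.count * batch_count / total) / total
        self.count = total

    def normalize(self, ob_raw):
        return (ob_raw - self.mean) / (np.sqrt(self.var) + self.epsilon)
```

A solver would call `update` on each new batch of observations and then feed `normalize(ob_raw)` to the policy, matching the `ob_rescaled` formula above.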
-------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | env.close() 80 | 81 | def get_available_gpus(): 82 | from tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | 
env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /sp17_hw/hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | out = layers.flatten(out) 25 | with tf.variable_scope("action_value"): 26 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 27 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 28 | 29 | return out 30 | 31 | def atari_learn(env, 32 | session, 33 | num_timesteps): 34 | # This is just a rough estimate 35 | num_iterations = float(num_timesteps) / 4.0 36 | 37 | lr_multiplier = 1.0 38 | lr_schedule = PiecewiseSchedule([ 39 | (0, 1e-4 * lr_multiplier), 40 | (num_iterations / 10, 1e-4 * lr_multiplier), 41 | (num_iterations / 2, 5e-5 * lr_multiplier), 42 | ], 43 | outside_value=5e-5 * lr_multiplier) 44 | optimizer = dqn.OptimizerSpec( 45 | constructor=tf.train.AdamOptimizer, 46 | kwargs=dict(epsilon=1e-4), 47 | lr_schedule=lr_schedule 48 | ) 49 | 50 | def stopping_criterion(env, t): 51 | # notice that here t is the number of steps of the wrapped env, 52 | # which is different from the number of steps in the underlying env 53 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 54 | 55 | exploration_schedule = PiecewiseSchedule( 56 | [ 57 | (0, 1.0), 58 | (1e6, 0.1), 59 | (num_iterations / 2, 0.01), 60 | ], outside_value=0.01 61 | ) 62 | 63 | dqn.learn( 64 | env, 65 | q_func=atari_model, 66 | optimizer_spec=optimizer, 67 | session=session, 68 | exploration=exploration_schedule, 69 | stopping_criterion=stopping_criterion, 70 | replay_buffer_size=1000000, 71 | batch_size=32, 72 | gamma=0.99, 73 | learning_starts=50000, 74 | learning_freq=4, 75 | frame_history_len=4, 76 | target_update_freq=10000, 77 | grad_norm_clipping=10 78 | ) 79 | env.close() 80 | 81 | def get_available_gpus(): 82 | from 
tensorflow.python.client import device_lib 83 | local_device_protos = device_lib.list_local_devices() 84 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 85 | 86 | def set_global_seeds(i): 87 | try: 88 | import tensorflow as tf 89 | except ImportError: 90 | pass 91 | else: 92 | tf.set_random_seed(i) 93 | np.random.seed(i) 94 | random.seed(i) 95 | 96 | def get_session(): 97 | tf.reset_default_graph() 98 | tf_config = tf.ConfigProto( 99 | inter_op_parallelism_threads=1, 100 | intra_op_parallelism_threads=1) 101 | session = tf.Session(config=tf_config) 102 | print("AVAILABLE GPUS: ", get_available_gpus()) 103 | return session 104 | 105 | def get_env(task, seed): 106 | env_id = task.env_id 107 | 108 | env = gym.make(env_id) 109 | 110 | set_global_seeds(seed) 111 | env.seed(seed) 112 | 113 | expt_dir = '/tmp/hw3_vid_dir2/' 114 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 115 | env = wrap_deepmind(env) 116 | 117 | return env 118 | 119 | def main(): 120 | # Get Atari games. 121 | benchmark = gym.benchmark_spec('Atari40M') 122 | 123 | # Change the index to select a different game. 124 | task = benchmark.tasks[3] 125 | 126 | # Run training 127 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 128 | env = get_env(task, seed) 129 | session = get_session() 130 | atari_learn(env, session, num_timesteps=task.max_timesteps) 131 | 132 | if __name__ == "__main__": 133 | main() 134 | -------------------------------------------------------------------------------- /sp17_hw/hw2/frozen_lake.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys 3 | from six import StringIO, b 4 | 5 | from gym import utils 6 | import discrete_env 7 | 8 | LEFT = 0 9 | DOWN = 1 10 | RIGHT = 2 11 | UP = 3 12 | 13 | MAPS = { 14 | "4x4": [ 15 | "SFFF", 16 | "FHFH", 17 | "FFFH", 18 | "HFFG" 19 | ], 20 | "8x8": [ 21 | "SFFFFFFF", 22 | "FFFFFFFF", 23 | "FFFHFFFF", 24 | "FFFFFHFF", 25 | "FFFHFFFF", 26 | "FHHFFFHF", 27 | "FHFFHFHF", 28 | "FFFHFFFG" 29 | ], 30 | } 31 | 32 | class FrozenLakeEnv(discrete_env.DiscreteEnv): 33 | """ 34 | Winter is here. You and your friends were tossing around a frisbee at the park 35 | when you made a wild throw that left the frisbee out in the middle of the lake. 36 | The water is mostly frozen, but there are a few holes where the ice has melted. 37 | If you step into one of those holes, you'll fall into the freezing water. 38 | At this time, there's an international frisbee shortage, so it's absolutely imperative that 39 | you navigate across the lake and retrieve the disc. 40 | However, the ice is slippery, so you won't always move in the direction you intend. 41 | The surface is described using a grid like the following 42 | 43 | SFFF 44 | FHFH 45 | FFFH 46 | HFFG 47 | 48 | S : starting point, safe 49 | F : frozen surface, safe 50 | H : hole, fall to your doom 51 | G : goal, where the frisbee is located 52 | 53 | The episode ends when you reach the goal or fall in a hole. 54 | You receive a reward of 1 if you reach the goal, and zero otherwise. 
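For example, with the default is_slippery=True, an intended RIGHT move on a frozen square succeeds with probability 0.8 and instead slips UP or DOWN with probability 0.1 each (see the transition table P constructed below).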
55 | 56 | """ 57 | 58 | metadata = {'render.modes': ['human', 'ansi']} 59 | 60 | def __init__(self, desc=None, map_name="4x4",is_slippery=True): 61 | if desc is None and map_name is None: 62 | raise ValueError('Must provide either desc or map_name') 63 | elif desc is None: 64 | desc = MAPS[map_name] 65 | self.desc = desc = np.asarray(desc,dtype='c') 66 | self.nrow, self.ncol = nrow, ncol = desc.shape 67 | 68 | nA = 4 69 | nS = nrow * ncol 70 | 71 | isd = np.array(desc == b'S').astype('float64').ravel() 72 | isd /= isd.sum() 73 | 74 | P = {s : {a : [] for a in range(nA)} for s in range(nS)} 75 | 76 | def to_s(row, col): 77 | return row*ncol + col 78 | def inc(row, col, a): 79 | if a==0: # left 80 | col = max(col-1,0) 81 | elif a==1: # down 82 | row = min(row+1,nrow-1) 83 | elif a==2: # right 84 | col = min(col+1,ncol-1) 85 | elif a==3: # up 86 | row = max(row-1,0) 87 | return (row, col) 88 | 89 | for row in range(nrow): 90 | for col in range(ncol): 91 | s = to_s(row, col) 92 | for a in range(4): 93 | li = P[s][a] 94 | letter = desc[row, col] 95 | if letter in b'GH': 96 | li.append((1.0, s, 0, True)) 97 | else: 98 | if is_slippery: 99 | for b in [(a-1)%4, a, (a+1)%4]: 100 | newrow, newcol = inc(row, col, b) 101 | newstate = to_s(newrow, newcol) 102 | newletter = desc[newrow, newcol] 103 | done = bytes(newletter) in b'GH' 104 | rew = float(newletter == b'G') 105 | li.append((0.8 if b==a else 0.1, newstate, rew, done)) 106 | else: 107 | newrow, newcol = inc(row, col, a) 108 | newstate = to_s(newrow, newcol) 109 | newletter = desc[newrow, newcol] 110 | done = bytes(newletter) in b'GH' 111 | rew = float(newletter == b'G') 112 | li.append((1.0, newstate, rew, done)) 113 | 114 | super(FrozenLakeEnv, self).__init__(nS, nA, P, isd) 115 | 116 | def _render(self, mode='human', close=False): 117 | if close: 118 | return 119 | outfile = StringIO() if mode == 'ansi' else sys.stdout 120 | 121 | row, col = self.s // self.ncol, self.s % self.ncol 122 | desc = self.desc.tolist() 123 | desc = [[c.decode('utf-8') for c in line] for line in desc] 124 | desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True) 125 | if self.lastaction is not None: 126 | outfile.write(" ({})\n".format(["Left","Down","Right","Up"][self.lastaction])) 127 | else: 128 | outfile.write("\n") 129 | outfile.write("\n".join(''.join(line) for line in desc)+"\n") 130 | 131 | return outfile 132 | -------------------------------------------------------------------------------- /hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 
12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /sp17_hw/hw3/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | from collections import deque 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env=None, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 12 | """ 13 | super(NoopResetEnv, self).__init__(env) 14 | self.noop_max = noop_max 15 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 16 | 17 | def _reset(self): 18 | """ Do no-op action for a number of steps in [1, noop_max].""" 19 | self.env.reset() 20 | noops = np.random.randint(1, self.noop_max + 1) 21 | for _ in range(noops): 22 | obs, _, _, _ = self.env.step(0) 23 | return obs 24 | 25 | class FireResetEnv(gym.Wrapper): 26 | def __init__(self, env=None): 27 | """Take action on reset for environments that are fixed until firing.""" 28 | super(FireResetEnv, self).__init__(env) 29 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 30 | assert len(env.unwrapped.get_action_meanings()) >= 3 31 | 32 | def _reset(self): 33 | self.env.reset() 34 | obs, _, _, _ = self.env.step(1) 35 | obs, _, _, _ = self.env.step(2) 36 | return obs 37 | 38 | class EpisodicLifeEnv(gym.Wrapper): 39 | def __init__(self, env=None): 40 | """Make end-of-life == end-of-episode, but only reset on true game over. 41 | Done by DeepMind for the DQN and co. since it helps value estimation. 
42 | """ 43 | super(EpisodicLifeEnv, self).__init__(env) 44 | self.lives = 0 45 | self.was_real_done = True 46 | self.was_real_reset = False 47 | 48 | def _step(self, action): 49 | obs, reward, done, info = self.env.step(action) 50 | self.was_real_done = done 51 | # check current lives, make loss of life terminal, 52 | # then update lives to handle bonus lives 53 | lives = self.env.unwrapped.ale.lives() 54 | if lives < self.lives and lives > 0: 55 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 56 | # so its important to keep lives > 0, so that we only reset once 57 | # the environment advertises done. 58 | done = True 59 | self.lives = lives 60 | return obs, reward, done, info 61 | 62 | def _reset(self): 63 | """Reset only when lives are exhausted. 64 | This way all states are still reachable even though lives are episodic, 65 | and the learner need not know about any of this behind-the-scenes. 66 | """ 67 | if self.was_real_done: 68 | obs = self.env.reset() 69 | self.was_real_reset = True 70 | else: 71 | # no-op step to advance from terminal/lost life state 72 | obs, _, _, _ = self.env.step(0) 73 | self.was_real_reset = False 74 | self.lives = self.env.unwrapped.ale.lives() 75 | return obs 76 | 77 | class MaxAndSkipEnv(gym.Wrapper): 78 | def __init__(self, env=None, skip=4): 79 | """Return only every `skip`-th frame""" 80 | super(MaxAndSkipEnv, self).__init__(env) 81 | # most recent raw observations (for max pooling across time steps) 82 | self._obs_buffer = deque(maxlen=2) 83 | self._skip = skip 84 | 85 | def _step(self, action): 86 | total_reward = 0.0 87 | done = None 88 | for _ in range(self._skip): 89 | obs, reward, done, info = self.env.step(action) 90 | self._obs_buffer.append(obs) 91 | total_reward += reward 92 | if done: 93 | break 94 | 95 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 96 | 97 | return max_frame, total_reward, done, info 98 | 99 | def _reset(self): 100 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 101 | self._obs_buffer.clear() 102 | obs = self.env.reset() 103 | self._obs_buffer.append(obs) 104 | return obs 105 | 106 | def _process_frame84(frame): 107 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 108 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 109 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_LINEAR) 110 | x_t = resized_screen[18:102, :] 111 | x_t = np.reshape(x_t, [84, 84, 1]) 112 | return x_t.astype(np.uint8) 113 | 114 | class ProcessFrame84(gym.Wrapper): 115 | def __init__(self, env=None): 116 | super(ProcessFrame84, self).__init__(env) 117 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 118 | 119 | def _step(self, action): 120 | obs, reward, done, info = self.env.step(action) 121 | return _process_frame84(obs), reward, done, info 122 | 123 | def _reset(self): 124 | return _process_frame84(self.env.reset()) 125 | 126 | class ClippedRewardsWrapper(gym.Wrapper): 127 | def _step(self, action): 128 | obs, reward, done, info = self.env.step(action) 129 | return obs, np.sign(reward), done, info 130 | 131 | def wrap_deepmind_ram(env): 132 | env = EpisodicLifeEnv(env) 133 | env = NoopResetEnv(env, noop_max=30) 134 | env = MaxAndSkipEnv(env, skip=4) 135 | if 'FIRE' in env.unwrapped.get_action_meanings(): 136 | env = FireResetEnv(env) 137 | env = ClippedRewardsWrapper(env) 138 | return env 139 | 140 | def wrap_deepmind(env): 141 | assert 'NoFrameskip' in env.spec.id 142 | env = EpisodicLifeEnv(env) 143 | env = NoopResetEnv(env, noop_max=30) 144 | env = MaxAndSkipEnv(env, skip=4) 145 | if 'FIRE' in env.unwrapped.get_action_meanings(): 146 | env = FireResetEnv(env) 147 | env = ProcessFrame84(env) 148 | env = ClippedRewardsWrapper(env) 149 | return env 150 | -------------------------------------------------------------------------------- /hw2/TestNoteBook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 13 | " return f(*args, **kwds)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import tensorflow as tf" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "def build_mlp(\n", 30 | " input_placeholder, \n", 31 | " output_size,\n", 32 | " scope, \n", 33 | " n_layers=2, \n", 34 | " size=64, \n", 35 | " activation=tf.tanh,\n", 36 | " output_activation=None\n", 37 | " ):\n", 38 | " with tf.variable_scope(scope):\n", 39 | " x = input_placeholder\n", 40 | " while n_layers > 0:\n", 41 | " x = tf.layers.dense(x, size)\n", 42 | " x = activation(x)\n", 43 | " n_layers-=1\n", 44 | " x = tf.layers.dense(x, output_size)\n", 45 | " if output_activation != None:\n", 46 | " x = output_activation(x)\n", 47 | " return x" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": { 54 | "collapsed": true 55 | }, 56 | "outputs": [], 57 | "source": [ 58 | "in_place = tf.placeholder(tf.float32, shape = (1024, 1024))" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 
| "outputs": [], 68 | "source": [ 69 | "mlp = build_mlp(in_place, 64, \"test1\")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "TensorShape([Dimension(1024), Dimension(64)])" 81 | ] 82 | }, 83 | "execution_count": 5, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "mlp.shape" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "metadata": { 96 | "collapsed": true 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "import numpy as np" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 7, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "ob_dim = 50\n", 112 | "ac_dim = 100\n", 113 | "discrete = True" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 8, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "sy_ob_no = tf.placeholder(shape=[None, ob_dim], name=\"ob\", dtype=tf.float32)\n", 123 | "if discrete:\n", 124 | " sy_ac_na = tf.placeholder(shape=[None], name=\"ac\", dtype=tf.int32) \n", 125 | "else:\n", 126 | " sy_ac_na = tf.placeholder(shape=[None, ac_dim], name=\"ac\", dtype=tf.float32) \n", 127 | "\n", 128 | "sy_adv_n = tf.placeholder(shape = [None], name = \"adv\", dtype = tf.float32)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 9, 134 | "metadata": {}, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/plain": [ 139 | "TensorShape([Dimension(None)])" 140 | ] 141 | }, 142 | "execution_count": 9, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "sy_adv_n.shape" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 10, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "sy_mean = build_mlp(sy_ob_no, ac_dim, \"policy\")\n", 158 | "sy_logstd = tf.Variable(tf.zeros([1, ac_dim], name = 'logstd'))\n", 159 | "sy_std = tf.exp(sy_logstd)\n", 160 | "sy_sampled_ac = sy_mean + sy_std * tf.random_normal(tf.shape(sy_mean))" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 11, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "TensorShape([Dimension(None), Dimension(100)])" 172 | ] 173 | }, 174 | "execution_count": 11, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "sy_sampled_ac.shape" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 48, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "data": { 190 | "text/plain": [ 191 | "TensorShape([Dimension(64), Dimension(200)])" 192 | ] 193 | }, 194 | "execution_count": 48, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "(tf.Variable(tf.zeros([1, 200])) * tf.Variable(tf.zeros([64, 200]))).shape" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 22, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "x = np.array([1,2,3,4])" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 26, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "32" 223 | ] 224 | }, 225 | "execution_count": 26, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "2 ** 
5" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 28, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "sample1 = np.random.normal([1024, 64])" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 31, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "real_values = tf.placeholder(shape = [1024, 64], dtype = tf.float32)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 33, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [ 258 | "sample2 = np.random.normal(size = [1024, 64], loc = 2)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 43, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "sample1 = np.random.normal(size = [1024, 1024])" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 39, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "loss = tf.nn.l2_loss(mlp - real_values)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 41, 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "train1 = tf.train.AdamOptimizer(0.005).minimize(loss)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 42, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "sess = tf.InteractiveSession()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 46, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "sess.run(train1, feed_dict = {in_place : sample1, real_values : sample2})" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 45, 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "tf.global_variables_initializer().run()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": { 323 | "collapsed": true 324 | }, 325 | "outputs": [], 326 | "source": [] 327 | } 328 | ], 329 | "metadata": { 330 | "kernelspec": { 331 | "display_name": "Python 3", 332 | "language": "python", 333 | "name": "python3" 334 | }, 335 | "language_info": { 336 | "codemirror_mode": { 337 | "name": "ipython", 338 | "version": 3 339 | }, 340 | "file_extension": ".py", 341 | "mimetype": "text/x-python", 342 | "name": "python", 343 | "nbconvert_exporter": "python", 344 | "pygments_lexer": "ipython3", 345 | "version": "3.6.3" 346 | } 347 | }, 348 | "nbformat": 4, 349 | "nbformat_minor": 2 350 | } 351 | -------------------------------------------------------------------------------- /hw3/Testing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/anaconda3/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 13 | " return f(*args, **kwds)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import tensorflow as tf\n", 20 | "import gym" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "benchmark = gym.benchmark_spec('Atari40M')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | 
"execution_count": 3, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "task = benchmark.tasks[3]" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 6, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "from run_dqn_atari import *" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 7, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | "[2018-06-08 23:37:20,714] Making new env: PongNoFrameskip-v4\n", 66 | "[2018-06-08 23:37:21,102] Clearing 2 monitor files from previous run (because force=True was provided)\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "env = get_env(task, 0)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 10, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "action = env.action_space.sample()" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 13, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stderr", 90 | "output_type": "stream", 91 | "text": [ 92 | "[2018-06-08 23:38:59,068] Starting new video recorder writing to /tmp/hw3_vid_dir2/gym/openaigym.video.0.6079.video000000.mp4\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "observation = env.reset()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 15, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "(84, 84, 1)" 109 | ] 110 | }, 111 | "execution_count": 15, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "observation.shape" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 16, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "Discrete(6)" 129 | ] 130 | }, 131 | "execution_count": 16, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "env.action_space" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 17, 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "data": { 147 | "text/plain": [ 148 | "(84, 84, 1)" 149 | ] 150 | }, 151 | "execution_count": 17, 152 | "metadata": {}, 153 | "output_type": "execute_result" 154 | } 155 | ], 156 | "source": [ 157 | "env.observation_space.shape" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 28, 163 | "metadata": { 164 | "collapsed": true 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "sample_space = tf.placeholder(shape = (64, 84, 84, 4), dtype = tf.float32)" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 19, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [ 179 | "num_actions = env.action_space.n" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 21, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "6" 191 | ] 192 | }, 193 | "execution_count": 21, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "num_actions" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 29, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "model1 = atari_model(sample_space, num_actions, \"test2\")" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 30, 214 | "metadata": {}, 215 | 
"outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "TensorShape([Dimension(64), Dimension(6)])" 220 | ] 221 | }, 222 | "execution_count": 30, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "model1.get_shape()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 32, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "v1 = tf.Variable([1,6,3,4,5,0], dtype = tf.float32)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 33, 245 | "metadata": { 246 | "collapsed": true 247 | }, 248 | "outputs": [], 249 | "source": [ 250 | "sess = tf.InteractiveSession()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 36, 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "data": { 260 | "text/plain": [ 261 | "1" 262 | ] 263 | }, 264 | "execution_count": 36, 265 | "metadata": {}, 266 | "output_type": "execute_result" 267 | } 268 | ], 269 | "source": [ 270 | "sess.run(tf.argmax(v1))" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 57, 276 | "metadata": { 277 | "collapsed": true 278 | }, 279 | "outputs": [], 280 | "source": [ 281 | "sess.run(tf.global_variables_initializer())" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 37, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "6.0" 293 | ] 294 | }, 295 | "execution_count": 37, 296 | "metadata": {}, 297 | "output_type": "execute_result" 298 | } 299 | ], 300 | "source": [ 301 | "sess.run(tf.reduce_max(v1))" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 39, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "v2 = tf.Variable([1,6,3,4,5,0], dtype = tf.uint8)" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": 41, 316 | "metadata": {}, 317 | "outputs": [ 318 | { 319 | "data": { 320 | "text/plain": [ 321 | "array([[ 0., 1., 0., 0., 0., 0., 0.],\n", 322 | " [ 0., 0., 0., 0., 0., 0., 1.],\n", 323 | " [ 0., 0., 0., 1., 0., 0., 0.],\n", 324 | " [ 0., 0., 0., 0., 1., 0., 0.],\n", 325 | " [ 0., 0., 0., 0., 0., 1., 0.],\n", 326 | " [ 1., 0., 0., 0., 0., 0., 0.]], dtype=float32)" 327 | ] 328 | }, 329 | "execution_count": 41, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "sess.run(tf.one_hot(v2, 7))" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 46, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "array([ 0. , -29.96999931, -5.99399996, -11.98799992,\n", 347 | " -19.97999954, 0. 
], dtype=float32)" 348 | ] 349 | }, 350 | "execution_count": 46, 351 | "metadata": {}, 352 | "output_type": "execute_result" 353 | } 354 | ], 355 | "source": [ 356 | "sess.run(0.999 * (1 - v1) * v1)" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 56, 362 | "metadata": {}, 363 | "outputs": [], 364 | "source": [ 365 | "ph2 = tf.placeholder(tf.float32, shape = [6])\n", 366 | "optimizer = tf.train.AdamOptimizer().minimize(tf.nn.l2_loss(tf.reduce_max(ph2) - tf.reduce_max(v1)))" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 64, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "None\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "print(sess.run(optimizer, {ph2: [1,6,3,4,5,0]}))" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 66, 389 | "metadata": {}, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "[ 0. 0. 0. 1.]\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "a = tf.placeholder(dtype=tf.float32, shape=[4])\n", 401 | "b = tf.reduce_max(a)\n", 402 | "c = tf.gradients([b], [a])[0]\n", 403 | "with tf.Session() as sess:\n", 404 | " v = np.asarray([1, 2, 3, 4], dtype='float32')\n", 405 | " print(sess.run(c, feed_dict={a:v}))" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 67, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "import gym" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 68, 422 | "metadata": { 423 | "collapsed": true 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "benchmark = gym.benchmark_spec('Atari40M')" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 69, 433 | "metadata": { 434 | "collapsed": true 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "task = benchmark.tasks[3]" 439 | ] 440 | }, 441 | { 442 | "cell_type": "code", 443 | "execution_count": 70, 444 | "metadata": {}, 445 | "outputs": [ 446 | { 447 | "data": { 448 | "text/plain": [ 449 | "40000000" 450 | ] 451 | }, 452 | "execution_count": 70, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | }, 456 | { 457 | "name": "stderr", 458 | "output_type": "stream", 459 | "text": [ 460 | "[2018-06-14 13:16:55,320] Finished writing results. 
You can upload them to the scoreboard via gym.upload('/tmp/hw3_vid_dir2/gym')\n"
461 | ]
462 | }
463 | ],
464 | "source": [
465 | "task.max_timesteps"
466 | ]
467 | },
468 | {
469 | "cell_type": "code",
470 | "execution_count": null,
471 | "metadata": {
472 | "collapsed": true
473 | },
474 | "outputs": [],
475 | "source": []
476 | }
477 | ],
478 | "metadata": {
479 | "kernelspec": {
480 | "display_name": "Python 3",
481 | "language": "python",
482 | "name": "python3"
483 | },
484 | "language_info": {
485 | "codemirror_mode": {
486 | "name": "ipython",
487 | "version": 3
488 | },
489 | "file_extension": ".py",
490 | "mimetype": "text/x-python",
491 | "name": "python",
492 | "nbconvert_exporter": "python",
493 | "pygments_lexer": "ipython3",
494 | "version": "3.6.3"
495 | }
496 | },
497 | "nbformat": 4,
498 | "nbformat_minor": 2
499 | }
500 |
-------------------------------------------------------------------------------- /hw4/main.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | import gym
4 | from dynamics import NNDynamicsModel
5 | from controllers import MPCcontroller, RandomController
6 | from cost_functions import cheetah_cost_fn, trajectory_cost_fn
7 | import time
8 | import logz
9 | import os
10 | import copy
11 | import matplotlib.pyplot as plt
12 | from cheetah_env import HalfCheetahEnvNew
13 | from tqdm import tqdm
14 | import matplotlib.pyplot as plt
15 | from time import gmtime, strftime
16 |
17 | def sample(env,
18 |            controller,
19 |            num_paths=10,
20 |            horizon=1000,
21 |            render=False,
22 |            verbose=False):
23 |     """
24 |     Write a sampler function which takes in an environment, a controller (either random or the MPC controller),
25 |     and returns rollouts by running on the env.
26 |     Each path can have elements for observations, next_observations, rewards, returns, actions, etc.
27 |     """
28 |     paths = []
29 |     iterator = range(num_paths)
30 |     if verbose:
31 |         iterator = tqdm(iterator)
32 |     for _ in iterator:
33 |         ob = env.reset()
34 |         if render:
35 |             env.render()
36 |         obs, next_obs, actions, rewards = [], [], [], []
37 |         steps = 0
38 |         while True:
39 |             obs.append(ob)
40 |             action = controller.get_action(ob)
41 |             actions.append(action)
42 |             ob, reward, done, _ = env.step(action)
43 |             next_obs.append(ob)
44 |             rewards.append(reward)
45 |             steps += 1
46 |             if done or steps >= horizon:
47 |                 break
48 |         paths.append({
49 |             'observations': np.array(obs),
50 |             'actions': np.array(actions),
51 |             'next_observations': np.array(next_obs),
52 |             'rewards': np.array(rewards)})  # rewards are needed for the return logging in train()
53 |
54 |     return paths
55 |
56 | # Utility to compute the cost of a path for a given cost function
57 | def path_cost(cost_fn, path):
58 |     return trajectory_cost_fn(cost_fn, path['observations'], path['actions'], path['next_observations'])
59 |
60 | def compute_normalization(data):
61 |     """
62 |     Write a function to take in a dataset and compute the means, and stds.
63 |     Return 6 elements: mean of s_t, std of s_t, mean of (s_t+1 - s_t), std of (s_t+1 - s_t), mean of actions, std of actions
64 |     """
65 |     obs = np.concatenate([d['observations'] for d in data])
66 |     next_obs = np.concatenate([d['next_observations'] for d in data])
67 |     actions = np.concatenate([d['actions'] for d in data])
68 |
69 |     mean_obs = np.mean(obs, axis=0)  # per-dimension statistics, so each state/action component is normalized separately
70 |     std_obs = np.std(obs, axis=0)
71 |     mean_deltas = np.mean(next_obs - obs, axis=0)
72 |     std_deltas = np.std(next_obs - obs, axis=0)
73 |     mean_action = np.mean(actions, axis=0)
74 |     std_action = np.std(actions, axis=0)
75 |
76 |     return mean_obs, std_obs, mean_deltas, std_deltas, mean_action, std_action
77 |
78 |
79 | def plot_comparison(env, dyn_model):
80 |     """
81 |     Write a function to generate plots comparing the behavior of the model predictions for each element of the state to the actual ground truth, using randomly sampled actions.
82 |     """
83 |     real_obs, pred_obs = [], []
84 |     ob = env.reset()
85 |     i = 0
86 |     while True:
87 |         action = env.action_space.sample()
88 |         pred_ob = dyn_model.predict(ob, action)
89 |         real_ob, _, done, _ = env.step(action)
90 |         real_obs.append(real_ob)
91 |         pred_obs.append(pred_ob)
92 |         ob, i = real_ob, i + 1  # advance the true state so every prediction is a one-step prediction
93 |         if done:
94 |             break
95 |     abs_diff = np.abs(np.array(real_obs) - np.array(pred_obs))
96 |     plt.plot(np.arange(i), abs_diff)
97 |     time_str = strftime("%Y-%m-%d_%H:%M:%S", gmtime())
98 |     plt.savefig("figures/" + time_str + ".png")
99 |
100 | def train(env,
101 |          cost_fn,
102 |          logdir=None,
103 |          render=False,
104 |          learning_rate=1e-3,
105 |          onpol_iters=10,
106 |          dynamics_iters=60,
107 |          batch_size=512,
108 |          num_paths_random=10,
109 |          num_paths_onpol=10,
110 |          num_simulated_paths=10000,
111 |          env_horizon=1000,
112 |          mpc_horizon=15,
113 |          n_layers=2,
114 |          size=500,
115 |          activation=tf.nn.relu,
116 |          output_activation=None
117 |          ):
118 |
119 |     """
120 |
121 |     Arguments:
122 |
123 |     onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.
124 |
125 |     dynamics_iters              Number of iterations of training for the dynamics model
126 |                                 |_ which happen per iteration of the aggregation loop.
127 |
128 |     batch_size                  Batch size for dynamics training.
129 |
130 |     num_paths_random            Number of paths/trajectories/rollouts generated
131 |                                 | by a random agent. We use these to train our
132 |                                 |_ initial dynamics model.
133 |
134 |     num_paths_onpol             Number of paths to collect at each iteration of
135 |                                 |_ aggregation, using the Model Predictive Control policy.
136 |
137 |     num_simulated_paths         How many fictitious rollouts the MPC policy
138 |                                 | should generate each time it is asked for an
139 |                                 |_ action.
140 |
141 |     env_horizon                 Number of timesteps in each path.
142 |
143 |     mpc_horizon                 The MPC policy generates actions by imagining
144 |                                 | fictitious rollouts, and picking the first action
145 |                                 | of the best fictitious rollout. This argument is
146 |                                 | how many timesteps should be in each fictitious
147 |                                 |_ rollout.
148 |
149 |     n_layers/size/activations   Neural network architecture arguments.
150 |
151 |     """
152 |
153 |     logz.configure_output_dir(logdir)
154 |
155 |     #========================================================
156 |     #
157 |     # First, we need a lot of data generated by a random
158 |     # agent, with which we'll begin to train our dynamics
159 |     # model.
160 |
161 |     random_controller = RandomController(env)
162 |
163 |     data = sample(env, random_controller, num_paths_random, env_horizon)
164 |
165 |     #========================================================
166 |     #
167 |     # The random data will be used to get statistics (mean
168 |     # and std) for the observations, actions, and deltas
169 |     # (where deltas are o_{t+1} - o_t). These will be used
170 |     # for normalizing inputs and denormalizing outputs
171 |     # from the dynamics network.
172 |     #
173 |     normalization = compute_normalization(data)
174 |
175 |
176 |     #========================================================
177 |     #
178 |     # Build dynamics model and MPC controllers.
179 |     #
180 |     sess = tf.Session()
181 |
182 |     dyn_model = NNDynamicsModel(env=env,
183 |                                 n_layers=n_layers,
184 |                                 size=size,
185 |                                 activation=activation,
186 |                                 output_activation=output_activation,
187 |                                 normalization=normalization,
188 |                                 batch_size=batch_size,
189 |                                 iterations=dynamics_iters,
190 |                                 learning_rate=learning_rate,
191 |                                 sess=sess)
192 |
193 |     mpc_controller = MPCcontroller(env=env,
194 |                                    dyn_model=dyn_model,
195 |                                    horizon=mpc_horizon,
196 |                                    cost_fn=cost_fn,
197 |                                    num_simulated_paths=num_simulated_paths)
198 |
199 |
200 |     #========================================================
201 |     #
202 |     # Tensorflow session building.
203 |     #
204 |     sess.__enter__()
205 |     tf.global_variables_initializer().run()
206 |
207 |     #========================================================
208 |     #
209 |     # Run 'onpol_iters' iterations of on-policy aggregation: at each iteration, refit the dynamics model to the current dataset, then collect on-policy samples with the MPC controller and aggregate them into the dataset.
210 |     # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
211 |     #
212 |     for itr in range(onpol_iters):
213 |         dyn_model.fit(data)
214 |         data_new = sample(env, mpc_controller, num_paths_onpol, env_horizon)
215 |         data = np.concatenate([data, data_new])
216 |
217 |         returns = [np.sum(path['rewards']) for path in data_new]  # true env return of each MPC rollout
218 |         costs = [path_cost(cost_fn, path) for path in data_new]
219 |
220 |         # LOGGING
221 |         # Statistics for performance of MPC policy using
222 |         # our learned dynamics model
223 |         logz.log_tabular('Iteration', itr)
224 |         # In terms of cost function which your MPC controller uses to plan
225 |         logz.log_tabular('AverageCost', np.mean(costs))
226 |         logz.log_tabular('StdCost', np.std(costs))
227 |         logz.log_tabular('MinimumCost', np.min(costs))
228 |         logz.log_tabular('MaximumCost', np.max(costs))
229 |         # In terms of true environment reward of your rolled out trajectory using the MPC controller
230 |         logz.log_tabular('AverageReturn', np.mean(returns))
231 |         logz.log_tabular('StdReturn', np.std(returns))
232 |         logz.log_tabular('MinimumReturn', np.min(returns))
233 |         logz.log_tabular('MaximumReturn', np.max(returns))
234 |
235 |         logz.dump_tabular()
236 |
237 | def main():
238 |
239 |     import argparse
240 |     parser = argparse.ArgumentParser()
241 |     parser.add_argument('--env_name', type=str, default='HalfCheetah-v1')
242 |     # Experiment meta-params
243 |     parser.add_argument('--exp_name', type=str, default='mb_mpc')
244 |     parser.add_argument('--seed', type=int, default=3)
245 |     parser.add_argument('--render', action='store_true')
246 |     # Training args
247 |     parser.add_argument('--learning_rate', '-lr', type=float, default=1e-3)
248 |     parser.add_argument('--onpol_iters', '-n', type=int, default=1)
249 |     parser.add_argument('--dyn_iters', '-nd', type=int, default=60)
250 |     parser.add_argument('--batch_size', '-b', type=int, default=512)
251 |     # Data collection
252 |     parser.add_argument('--random_paths', '-r', type=int, default=10)
253 |     parser.add_argument('--onpol_paths', '-d', type=int, default=10)
254 |     parser.add_argument('--simulated_paths', '-sp', type=int, default=1000)
255 |     parser.add_argument('--ep_len', '-ep', type=int, default=1000)
256 |     # Neural network architecture args
257 |     parser.add_argument('--n_layers', '-l', type=int, default=2)
258 |     parser.add_argument('--size', '-s', type=int, default=500)
259 |     # MPC Controller
260 |     parser.add_argument('--mpc_horizon', '-m', type=int, default=15)
261 |     args = parser.parse_args()
262 |
263 |     # Set seed
264 |     np.random.seed(args.seed)
265 |     tf.set_random_seed(args.seed)
266 |
267 |     # Make data directory if it does not already exist
268 |     if not(os.path.exists('data')):
269 |         os.makedirs('data')
270 |     logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
271 |     logdir = os.path.join('data', logdir)
272 |     if not(os.path.exists(logdir)):
273 |         os.makedirs(logdir)
274 |
275 |     # Make env
276 |     if args.env_name == "HalfCheetah-v1":  # use '==', not 'is': identity comparison is not reliable for strings parsed from the command line
277 |         env = gym.make('HalfCheetah-v2')
278 |         cost_fn = cheetah_cost_fn
279 |         train(env=env,
280 |               cost_fn=cost_fn,
281 |               logdir=logdir,
282 |               render=args.render,
283 |               learning_rate=args.learning_rate,
284 |               onpol_iters=args.onpol_iters,
285 |               dynamics_iters=args.dyn_iters,
286 |               batch_size=args.batch_size,
287 |               num_paths_random=args.random_paths,
288 |               num_paths_onpol=args.onpol_paths,
289 |               num_simulated_paths=args.simulated_paths,
290 |               env_horizon=args.ep_len,
291 |               mpc_horizon=args.mpc_horizon,
292 |               n_layers=args.n_layers,
293 |               size=args.size,
294 |               activation=tf.nn.relu,
295 |               output_activation=None,
296 |               )
297 |
298 | if __name__ == "__main__":
299 |     main()
300 |
-------------------------------------------------------------------------------- /sp17_hw/hw3/dqn.py: --------------------------------------------------------------------------------
1 | import sys
2 | import gym.spaces
3 | import itertools
4 | import numpy as np
5 | import random
6 | import tensorflow as tf
7 | import tensorflow.contrib.layers as layers
8 | from collections import namedtuple
9 | from dqn_utils import *
10 |
11 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])
12 |
13 | def learn(env,
14 |           q_func,
15 |           optimizer_spec,
16 |           session,
17 |           exploration=LinearSchedule(1000000, 0.1),
18 |           stopping_criterion=None,
19 |           replay_buffer_size=1000000,
20 |           batch_size=32,
21 |           gamma=0.99,
22 |           learning_starts=50000,
23 |           learning_freq=4,
24 |           frame_history_len=4,
25 |           target_update_freq=10000,
26 |           grad_norm_clipping=10):
27 |     """Run Deep Q-learning algorithm.
28 |
29 |     You can specify your own convnet using q_func.
30 |
31 |     All schedules are w.r.t. total number of steps taken in the environment.
32 |
33 |     Parameters
34 |     ----------
35 |     env: gym.Env
36 |         gym environment to train on.
37 |     q_func: function
38 |         Model to use for computing the q function. It should accept the
39 |         following named arguments:
40 |             img_in: tf.Tensor
41 |                 tensorflow tensor representing the input image
42 |             num_actions: int
43 |                 number of actions
44 |             scope: str
45 |                 scope in which all the model related variables
46 |                 should be created
47 |             reuse: bool
48 |                 whether previously created variables should be reused.
49 | optimizer_spec: OptimizerSpec 50 | Specifying the constructor and kwargs, as well as learning rate schedule 51 | for the optimizer 52 | session: tf.Session 53 | tensorflow session to use. 54 | exploration: rl_algs.deepq.utils.schedules.Schedule 55 | schedule for probability of chosing random action. 56 | stopping_criterion: (env, t) -> bool 57 | should return true when it's ok for the RL algorithm to stop. 58 | takes in env and the number of steps executed so far. 59 | replay_buffer_size: int 60 | How many memories to store in the replay buffer. 61 | batch_size: int 62 | How many transitions to sample each time experience is replayed. 63 | gamma: float 64 | Discount Factor 65 | learning_starts: int 66 | After how many environment steps to start replaying experiences 67 | learning_freq: int 68 | How many steps of environment to take between every experience replay 69 | frame_history_len: int 70 | How many past frames to include as input to the model. 71 | target_update_freq: int 72 | How many experience replay rounds (not steps!) to perform between 73 | each update to the target Q network 74 | grad_norm_clipping: float or None 75 | If not None gradients' norms are clipped to this value. 76 | """ 77 | assert type(env.observation_space) == gym.spaces.Box 78 | assert type(env.action_space) == gym.spaces.Discrete 79 | 80 | ############### 81 | # BUILD MODEL # 82 | ############### 83 | 84 | if len(env.observation_space.shape) == 1: 85 | # This means we are running on low-dimensional observations (e.g. RAM) 86 | input_shape = env.observation_space.shape 87 | else: 88 | img_h, img_w, img_c = env.observation_space.shape 89 | input_shape = (img_h, img_w, frame_history_len * img_c) 90 | num_actions = env.action_space.n 91 | 92 | # set up placeholders 93 | # placeholder for current observation (or state) 94 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 95 | # placeholder for current action 96 | act_t_ph = tf.placeholder(tf.int32, [None]) 97 | # placeholder for current reward 98 | rew_t_ph = tf.placeholder(tf.float32, [None]) 99 | # placeholder for next observation (or state) 100 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 101 | # placeholder for end of episode mask 102 | # this value is 1 if the next state corresponds to the end of an episode, 103 | # in which case there is no Q-value at the next state; at the end of an 104 | # episode, only the current state reward contributes to the target, not the 105 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 106 | done_mask_ph = tf.placeholder(tf.float32, [None]) 107 | 108 | # casting to float on GPU ensures lower data transfer times. 109 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 110 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 111 | 112 | # Here, you should fill in your own code to compute the Bellman error. This requires 113 | # evaluating the current and next Q-values and constructing the corresponding error. 114 | # TensorFlow will differentiate this error for you, you just need to pass it to the 115 | # optimizer. See assignment text for details. 116 | # Your code should produce one scalar-valued tensor: total_error 117 | # This will be passed to the optimizer in the provided code below. 118 | # Your code should also produce two collections of variables: 119 | # q_func_vars 120 | # target_q_func_vars 121 | # These should hold all of the variables of the Q-function network and target network, 122 | # respectively. 
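    # For illustration only, a rough sketch of one way these pieces could fit together
    # (this is NOT the provided solution; it assumes the standard DQN target
    # y = r + gamma * (1 - done) * max_a' Q_target(s', a'), and huber_loss from dqn_utils):
    #
    #   q_values        = q_func(obs_t_float,   num_actions, scope="q_func",        reuse=False)
    #   target_q_values = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False)
    #   q_taken  = tf.reduce_sum(q_values * tf.one_hot(act_t_ph, num_actions), axis=1)
    #   q_target = rew_t_ph + gamma * (1.0 - done_mask_ph) * tf.reduce_max(target_q_values, axis=1)
    #   total_error = tf.reduce_mean(huber_loss(q_taken - tf.stop_gradient(q_target)))
    #   q_func_vars        = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="q_func")
    #   target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target_q_func")
    #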
A convenient way to get these is to make use of TF's "scope" feature. 123 | # For example, you can create your Q-function network with the scope "q_func" like this: 124 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 125 | # And then you can obtain the variables like this: 126 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 127 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 128 | ###### 129 | 130 | # YOUR CODE HERE 131 | 132 | ###### 133 | 134 | # construct optimization op (with gradient clipping) 135 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 136 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 137 | train_fn = minimize_and_clip(optimizer, total_error, 138 | var_list=q_func_vars, clip_val=grad_norm_clipping) 139 | 140 | # update_target_fn will be called periodically to copy Q network to target Q network 141 | update_target_fn = [] 142 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 143 | sorted(target_q_func_vars, key=lambda v: v.name)): 144 | update_target_fn.append(var_target.assign(var)) 145 | update_target_fn = tf.group(*update_target_fn) 146 | 147 | # construct the replay buffer 148 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 149 | 150 | ############### 151 | # RUN ENV # 152 | ############### 153 | model_initialized = False 154 | num_param_updates = 0 155 | mean_episode_reward = -float('nan') 156 | best_mean_episode_reward = -float('inf') 157 | last_obs = env.reset() 158 | LOG_EVERY_N_STEPS = 10000 159 | 160 | for t in itertools.count(): 161 | ### 1. Check stopping criterion 162 | if stopping_criterion is not None and stopping_criterion(env, t): 163 | break 164 | 165 | ### 2. Step the env and store the transition 166 | # At this point, "last_obs" contains the latest observation that was 167 | # recorded from the simulator. Here, your code needs to store this 168 | # observation and its outcome (reward, next observation, etc.) into 169 | # the replay buffer while stepping the simulator forward one step. 170 | # At the end of this block of code, the simulator should have been 171 | # advanced one step, and the replay buffer should contain one more 172 | # transition. 173 | # Specifically, last_obs must point to the new latest observation. 174 | # Useful functions you'll need to call: 175 | # obs, reward, done, info = env.step(action) 176 | # this steps the environment forward one step 177 | # obs = env.reset() 178 | # this resets the environment if you reached an episode boundary. 179 | # Don't forget to call env.reset() to get a new observation if done 180 | # is true!! 181 | # Note that you cannot use "last_obs" directly as input 182 | # into your network, since it needs to be processed to include context 183 | # from previous frames. You should check out the replay buffer 184 | # implementation in dqn_utils.py to see what functionality the replay 185 | # buffer exposes. The replay buffer has a function called 186 | # encode_recent_observation that will take the latest observation 187 | # that you pushed into the buffer and compute the corresponding 188 | # input that should be given to a Q network by appending some 189 | # previous frames. 190 | # Don't forget to include epsilon greedy exploration! 
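        # For illustration only, a rough sketch of one possible implementation (not the
        # provided solution; it assumes a `q_values` tensor for the "q_func" network was
        # built above when computing the Bellman error):
        #
        #   idx = replay_buffer.store_frame(last_obs)
        #   if not model_initialized or random.random() < exploration.value(t):
        #       action = env.action_space.sample()
        #   else:
        #       recent = replay_buffer.encode_recent_observation()
        #       action = np.argmax(session.run(q_values, {obs_t_ph: recent[None]}))
        #   last_obs, reward, done, _ = env.step(action)
        #   replay_buffer.store_effect(idx, action, reward, done)
        #   if done:
        #       last_obs = env.reset()
        #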
191 | # And remember that the first time you enter this loop, the model 192 | # may not yet have been initialized (but of course, the first step 193 | # might as well be random, since you haven't trained your net...) 194 | 195 | ##### 196 | 197 | # YOUR CODE HERE 198 | 199 | ##### 200 | 201 | # at this point, the environment should have been advanced one step (and 202 | # reset if done was true), and last_obs should point to the new latest 203 | # observation 204 | 205 | ### 3. Perform experience replay and train the network. 206 | # note that this is only done if the replay buffer contains enough samples 207 | # for us to learn something useful -- until then, the model will not be 208 | # initialized and random actions should be taken 209 | if (t > learning_starts and 210 | t % learning_freq == 0 and 211 | replay_buffer.can_sample(batch_size)): 212 | # Here, you should perform training. Training consists of four steps: 213 | # 3.a: use the replay buffer to sample a batch of transitions (see the 214 | # replay buffer code for function definition, each batch that you sample 215 | # should consist of current observations, current actions, rewards, 216 | # next observations, and done indicator). 217 | # 3.b: initialize the model if it has not been initialized yet; to do 218 | # that, call 219 | # initialize_interdependent_variables(session, tf.global_variables(), { 220 | # obs_t_ph: obs_t_batch, 221 | # obs_tp1_ph: obs_tp1_batch, 222 | # }) 223 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 224 | # the current and next time step. The boolean variable model_initialized 225 | # indicates whether or not the model has been initialized. 226 | # Remember that you have to update the target network too (see 3.d)! 227 | # 3.c: train the model. To do this, you'll need to use the train_fn and 228 | # total_error ops that were created earlier: total_error is what you 229 | # created to compute the total Bellman error in a batch, and train_fn 230 | # will actually perform a gradient step and update the network parameters 231 | # to reduce total_error. When calling session.run on these you'll need to 232 | # populate the following placeholders: 233 | # obs_t_ph 234 | # act_t_ph 235 | # rew_t_ph 236 | # obs_tp1_ph 237 | # done_mask_ph 238 | # (this is needed for computing total_error) 239 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 240 | # (this is needed by the optimizer to choose the learning rate) 241 | # 3.d: periodically update the target network by calling 242 | # session.run(update_target_fn) 243 | # you should update every target_update_freq steps, and you may find the 244 | # variable num_param_updates useful for this (it was initialized to 0) 245 | ##### 246 | 247 | # YOUR CODE HERE 248 | 249 | ##### 250 | 251 | ### 4. 
Log progress 252 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 253 | if len(episode_rewards) > 0: 254 | mean_episode_reward = np.mean(episode_rewards[-100:]) 255 | if len(episode_rewards) > 100: 256 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 257 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 258 | print("Timestep %d" % (t,)) 259 | print("mean reward (100 episodes) %f" % mean_episode_reward) 260 | print("best mean reward %f" % best_mean_episode_reward) 261 | print("episodes %d" % len(episode_rewards)) 262 | print("exploration %f" % exploration.value(t)) 263 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 264 | sys.stdout.flush() 265 | -------------------------------------------------------------------------------- /sp17_hw/hw4/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import gym 4 | import logz 5 | import scipy.signal 6 | 7 | def normc_initializer(std=1.0): 8 | """ 9 | Initialize array with normalized columns 10 | """ 11 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 12 | out = np.random.randn(*shape).astype(np.float32) 13 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 14 | return tf.constant(out) 15 | return _initializer 16 | 17 | 18 | def dense(x, size, name, weight_init=None): 19 | """ 20 | Dense (fully connected) layer 21 | """ 22 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 23 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer()) 24 | return tf.matmul(x, w) + b 25 | 26 | def fancy_slice_2d(X, inds0, inds1): 27 | """ 28 | Like numpy's X[inds0, inds1] 29 | """ 30 | inds0 = tf.cast(inds0, tf.int64) 31 | inds1 = tf.cast(inds1, tf.int64) 32 | shape = tf.cast(tf.shape(X), tf.int64) 33 | ncols = shape[1] 34 | Xflat = tf.reshape(X, [-1]) 35 | return tf.gather(Xflat, inds0 * ncols + inds1) 36 | 37 | def discount(x, gamma): 38 | """ 39 | Compute discounted sum of future values 40 | out[i] = in[i] + gamma * in[i+1] + gamma^2 * in[i+2] + ... 41 | """ 42 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 43 | 44 | def explained_variance_1d(ypred,y): 45 | """ 46 | Var[ypred - y] / var[y]. 
47 | https://www.quora.com/What-is-the-meaning-proportion-of-variance-explained-in-linear-regression 48 | """ 49 | assert y.ndim == 1 and ypred.ndim == 1 50 | vary = np.var(y) 51 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 52 | 53 | def categorical_sample_logits(logits): 54 | """ 55 | Samples (symbolically) from categorical distribution, where logits is a NxK 56 | matrix specifying N categorical distributions with K categories 57 | 58 | specifically, exp(logits) / sum( exp(logits), axis=1 ) is the 59 | probabilities of the different classes 60 | 61 | Cleverly uses gumbell trick, based on 62 | https://github.com/tensorflow/tensorflow/issues/456 63 | """ 64 | U = tf.random_uniform(tf.shape(logits)) 65 | return tf.argmax(logits - tf.log(-tf.log(U)), dimension=1) 66 | 67 | def pathlength(path): 68 | return len(path["reward"]) 69 | 70 | class LinearValueFunction(object): 71 | coef = None 72 | def fit(self, X, y): 73 | Xp = self.preproc(X) 74 | A = Xp.T.dot(Xp) 75 | nfeats = Xp.shape[1] 76 | A[np.arange(nfeats), np.arange(nfeats)] += 1e-3 # a little ridge regression 77 | b = Xp.T.dot(y) 78 | self.coef = np.linalg.solve(A, b) 79 | def predict(self, X): 80 | if self.coef is None: 81 | return np.zeros(X.shape[0]) 82 | else: 83 | return self.preproc(X).dot(self.coef) 84 | def preproc(self, X): 85 | return np.concatenate([np.ones([X.shape[0], 1]), X, np.square(X)/2.0], axis=1) 86 | 87 | class NnValueFunction(object): 88 | pass # YOUR CODE HERE 89 | 90 | def lrelu(x, leak=0.2): 91 | f1 = 0.5 * (1 + leak) 92 | f2 = 0.5 * (1 - leak) 93 | return f1 * x + f2 * abs(x) 94 | 95 | 96 | 97 | def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=True, logdir=None): 98 | env = gym.make("CartPole-v0") 99 | ob_dim = env.observation_space.shape[0] 100 | num_actions = env.action_space.n 101 | logz.configure_output_dir(logdir) 102 | vf = LinearValueFunction() 103 | 104 | # Symbolic variables have the prefix sy_, to distinguish them from the numerical values 105 | # that are computed later in these function 106 | sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations 107 | sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation 108 | sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate 109 | sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer 110 | sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer 111 | # we use a small initialization for the last layer, so the initial policy has maximal entropy 112 | sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) 113 | sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions 114 | sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) 115 | sy_n = tf.shape(sy_ob_no)[0] 116 | sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation 117 | 118 | # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> 119 | sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) 120 | sy_oldp_na = 
tf.exp(sy_oldlogp_na) 121 | sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) 122 | sy_p_na = tf.exp(sy_logp_na) 123 | sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n) 124 | # <<<<<<<<<<<<< 125 | 126 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 127 | 128 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 129 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 130 | 131 | tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 132 | # use single thread. on such a small problem, multithreading gives you a slowdown 133 | # this way, we can better use multiple cores for different experiments 134 | sess = tf.Session(config=tf_config) 135 | sess.__enter__() # equivalent to `with sess:` 136 | tf.global_variables_initializer().run() #pylint: disable=E1101 137 | 138 | total_timesteps = 0 139 | 140 | for i in range(n_iter): 141 | print("********** Iteration %i ************"%i) 142 | 143 | # Collect paths until we have enough timesteps 144 | timesteps_this_batch = 0 145 | paths = [] 146 | while True: 147 | ob = env.reset() 148 | terminated = False 149 | obs, acs, rewards = [], [], [] 150 | animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) 151 | while True: 152 | if animate_this_episode: 153 | env.render() 154 | obs.append(ob) 155 | ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) 156 | acs.append(ac) 157 | ob, rew, done, _ = env.step(ac) 158 | rewards.append(rew) 159 | if done: 160 | break 161 | path = {"observation" : np.array(obs), "terminated" : terminated, 162 | "reward" : np.array(rewards), "action" : np.array(acs)} 163 | paths.append(path) 164 | timesteps_this_batch += pathlength(path) 165 | if timesteps_this_batch > min_timesteps_per_batch: 166 | break 167 | total_timesteps += timesteps_this_batch 168 | # Estimate advantage function 169 | vtargs, vpreds, advs = [], [], [] 170 | for path in paths: 171 | rew_t = path["reward"] 172 | return_t = discount(rew_t, gamma) 173 | vpred_t = vf.predict(path["observation"]) 174 | adv_t = return_t - vpred_t 175 | advs.append(adv_t) 176 | vtargs.append(return_t) 177 | vpreds.append(vpred_t) 178 | 179 | # Build arrays for policy update 180 | ob_no = np.concatenate([path["observation"] for path in paths]) 181 | ac_n = np.concatenate([path["action"] for path in paths]) 182 | adv_n = np.concatenate(advs) 183 | standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) 184 | vtarg_n = np.concatenate(vtargs) 185 | vpred_n = np.concatenate(vpreds) 186 | vf.fit(ob_no, vtarg_n) 187 | 188 | # Policy update 189 | _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) 190 | kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na}) 191 | 192 | # Log diagnostics 193 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 194 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 195 | logz.log_tabular("KLOldNew", kl) 196 | logz.log_tabular("Entropy", ent) 197 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 198 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 199 | 
logz.log_tabular("TimestepsSoFar", total_timesteps) 200 | # If you're overfitting, EVAfter will be way larger than EVBefore. 201 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 202 | logz.dump_tabular() 203 | 204 | def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): 205 | tf.set_random_seed(seed) 206 | np.random.seed(seed) 207 | env = gym.make("Pendulum-v0") 208 | ob_dim = env.observation_space.shape[0] 209 | ac_dim = env.action_space.shape[0] 210 | logz.configure_output_dir(logdir) 211 | if vf_type == 'linear': 212 | vf = LinearValueFunction(**vf_params) 213 | elif vf_type == 'nn': 214 | vf = NnValueFunction(ob_dim=ob_dim, **vf_params) 215 | 216 | 217 | YOUR_CODE_HERE 218 | 219 | 220 | sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") 221 | 222 | sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) 223 | update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) 224 | 225 | sess = tf.Session() 226 | sess.__enter__() # equivalent to `with sess:` 227 | tf.global_variables_initializer().run() #pylint: disable=E1101 228 | 229 | total_timesteps = 0 230 | stepsize = initial_stepsize 231 | 232 | for i in range(n_iter): 233 | print("********** Iteration %i ************"%i) 234 | 235 | YOUR_CODE_HERE 236 | 237 | if kl > desired_kl * 2: 238 | stepsize /= 1.5 239 | print('stepsize -> %s'%stepsize) 240 | elif kl < desired_kl / 2: 241 | stepsize *= 1.5 242 | print('stepsize -> %s'%stepsize) 243 | else: 244 | print('stepsize OK') 245 | 246 | 247 | # Log diagnostics 248 | logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) 249 | logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) 250 | logz.log_tabular("KLOldNew", kl) 251 | logz.log_tabular("Entropy", ent) 252 | logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) 253 | logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) 254 | logz.log_tabular("TimestepsSoFar", total_timesteps) 255 | # If you're overfitting, EVAfter will be way larger than EVBefore. 
256 | # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias 257 | logz.dump_tabular() 258 | 259 | 260 | def main_pendulum1(d): 261 | return main_pendulum(**d) 262 | 263 | if __name__ == "__main__": 264 | if 1: 265 | main_cartpole(logdir=None) # when you want to start collecting results, set the logdir 266 | if 0: 267 | general_params = dict(gamma=0.97, animate=False, min_timesteps_per_batch=2500, n_iter=300, initial_stepsize=1e-3) 268 | params = [ 269 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 270 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed0', seed=0, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 271 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 272 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed1', seed=1, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 273 | dict(logdir='/tmp/ref/linearvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='linear', vf_params={}, **general_params), 274 | dict(logdir='/tmp/ref/nnvf-kl2e-3-seed2', seed=2, desired_kl=2e-3, vf_type='nn', vf_params=dict(n_epochs=10, stepsize=1e-3), **general_params), 275 | ] 276 | import multiprocessing 277 | p = multiprocessing.Pool() 278 | p.map(main_pendulum1, params) 279 | -------------------------------------------------------------------------------- /hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 
59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 
144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 152 | #session.run(tf.initialize_variables([v]), feed_dict) 153 | session.run(tf.variables_initializer([v]), feed_dict) 154 | except tf.errors.FailedPreconditionError: 155 | new_vars_left.append(v) 156 | if len(new_vars_left) >= len(vars_left): 157 | # This can happend if the variables all depend on each other, or more likely if there's 158 | # another variable outside of the list, that still needs to be initialized. This could be 159 | # detected here, but life's finite. 160 | raise Exception("Cycle in variable dependencies, or extenrnal precondition unsatisfied.") 161 | else: 162 | vars_left = new_vars_left 163 | 164 | def get_wrapper_by_name(env, classname): 165 | currentenv = env 166 | while True: 167 | if classname in currentenv.__class__.__name__: 168 | return currentenv 169 | elif isinstance(env, gym.Wrapper): 170 | currentenv = currentenv.env 171 | else: 172 | raise ValueError("Couldn't find wrapper named %s"%classname) 173 | 174 | class ReplayBuffer(object): 175 | def __init__(self, size, frame_history_len): 176 | """This is a memory efficient implementation of the replay buffer. 177 | 178 | The sepecific memory optimizations use here are: 179 | - only store each frame once rather than k times 180 | even if every observation normally consists of k last frames 181 | - store frames as np.uint8 (actually it is most time-performance 182 | to cast them back to float32 on GPU to minimize memory transfer 183 | time) 184 | - store frame_t and frame_(t+1) in the same buffer. 185 | 186 | For the tipical use case in Atari Deep RL buffer with 1M frames the total 187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes 188 | 189 | Warning! Assumes that returning frame of zeros at the beginning 190 | of the episode, when there is less frames than `frame_history_len`, 191 | is acceptable. 192 | 193 | Parameters 194 | ---------- 195 | size: int 196 | Max number of transitions to store in the buffer. When the buffer 197 | overflows the old memories are dropped. 198 | frame_history_len: int 199 | Number of memories to be retried for each observation. 200 | """ 201 | self.size = size 202 | self.frame_history_len = frame_history_len 203 | 204 | self.next_idx = 0 205 | self.num_in_buffer = 0 206 | 207 | self.obs = None 208 | self.action = None 209 | self.reward = None 210 | self.done = None 211 | 212 | def can_sample(self, batch_size): 213 | """Returns true if `batch_size` different transitions can be sampled from the buffer.""" 214 | return batch_size + 1 <= self.num_in_buffer 215 | 216 | def _encode_sample(self, idxes): 217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0) 218 | act_batch = self.action[idxes] 219 | rew_batch = self.reward[idxes] 220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0) 221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32) 222 | 223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask 224 | 225 | 226 | def sample(self, batch_size): 227 | """Sample `batch_size` different transitions. 
228 | 229 | i-th sample transition is the following: 230 | 231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken, 232 | after which reward `rew_batch[i]` was received and subsequent 233 | observation next_obs_batch[i] was observed, unless the epsiode 234 | was done which is represented by `done_mask[i]` which is equal 235 | to 1 if episode has ended as a result of that action. 236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample. 241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 281 | if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundry of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has potential to saves about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 
316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of action taken after obeserving frame stored 332 | at index idx. The reason `store_frame` and `store_effect` is broken 333 | up into two functions is so that once can call `encode_recent_observation` 334 | in between. 335 | 336 | Paramters 337 | --------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the actions was performed. 344 | done: bool 345 | True if episode was finished after performing that action. 346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /sp17_hw/hw3/dqn_utils.py: -------------------------------------------------------------------------------- 1 | """This file includes a collection of utility functions that are useful for 2 | implementing DQN.""" 3 | import gym 4 | import tensorflow as tf 5 | import numpy as np 6 | import random 7 | 8 | def huber_loss(x, delta=1.0): 9 | # https://en.wikipedia.org/wiki/Huber_loss 10 | return tf.select( 11 | tf.abs(x) < delta, 12 | tf.square(x) * 0.5, 13 | delta * (tf.abs(x) - 0.5 * delta) 14 | ) 15 | 16 | def sample_n_unique(sampling_f, n): 17 | """Helper function. Given a function `sampling_f` that returns 18 | comparable objects, sample n such unique objects. 19 | """ 20 | res = [] 21 | while len(res) < n: 22 | candidate = sampling_f() 23 | if candidate not in res: 24 | res.append(candidate) 25 | return res 26 | 27 | class Schedule(object): 28 | def value(self, t): 29 | """Value of the schedule at time t""" 30 | raise NotImplementedError() 31 | 32 | class ConstantSchedule(object): 33 | def __init__(self, value): 34 | """Value remains constant over time. 35 | Parameters 36 | ---------- 37 | value: float 38 | Constant value of the schedule 39 | """ 40 | self._v = value 41 | 42 | def value(self, t): 43 | """See Schedule.value""" 44 | return self._v 45 | 46 | def linear_interpolation(l, r, alpha): 47 | return l + alpha * (r - l) 48 | 49 | class PiecewiseSchedule(object): 50 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 51 | """Piecewise schedule. 52 | endpoints: [(int, int)] 53 | list of pairs `(time, value)` meanining that schedule should output 54 | `value` when `t==time`. All the values for time must be sorted in 55 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 56 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 57 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 58 | time passed between `time_a` and `time_b` for time `t`. 59 | interpolation: lambda float, float, float: float 60 | a function that takes value to the left and to the right of t according 61 | to the `endpoints`. 
Alpha is the fraction of distance from left endpoint to 62 | right endpoint that t has covered. See linear_interpolation for example. 63 | outside_value: float 64 | if the value is requested outside of all the intervals sepecified in 65 | `endpoints` this value is returned. If None then AssertionError is 66 | raised when outside value is requested. 67 | """ 68 | idxes = [e[0] for e in endpoints] 69 | assert idxes == sorted(idxes) 70 | self._interpolation = interpolation 71 | self._outside_value = outside_value 72 | self._endpoints = endpoints 73 | 74 | def value(self, t): 75 | """See Schedule.value""" 76 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 77 | if l_t <= t and t < r_t: 78 | alpha = float(t - l_t) / (r_t - l_t) 79 | return self._interpolation(l, r, alpha) 80 | 81 | # t does not belong to any of the pieces, so doom. 82 | assert self._outside_value is not None 83 | return self._outside_value 84 | 85 | class LinearSchedule(object): 86 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 87 | """Linear interpolation between initial_p and final_p over 88 | schedule_timesteps. After this many timesteps pass final_p is 89 | returned. 90 | Parameters 91 | ---------- 92 | schedule_timesteps: int 93 | Number of timesteps for which to linearly anneal initial_p 94 | to final_p 95 | initial_p: float 96 | initial output value 97 | final_p: float 98 | final output value 99 | """ 100 | self.schedule_timesteps = schedule_timesteps 101 | self.final_p = final_p 102 | self.initial_p = initial_p 103 | 104 | def value(self, t): 105 | """See Schedule.value""" 106 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 107 | return self.initial_p + fraction * (self.final_p - self.initial_p) 108 | 109 | def compute_exponential_averages(variables, decay): 110 | """Given a list of tensorflow scalar variables 111 | create ops corresponding to their exponential 112 | averages 113 | Parameters 114 | ---------- 115 | variables: [tf.Tensor] 116 | List of scalar tensors. 117 | Returns 118 | ------- 119 | averages: [tf.Tensor] 120 | List of scalar tensors corresponding to averages 121 | of al the `variables` (in order) 122 | apply_op: tf.runnable 123 | Op to be run to update the averages with current value 124 | of variables. 125 | """ 126 | averager = tf.train.ExponentialMovingAverage(decay=decay) 127 | apply_op = averager.apply(variables) 128 | return [averager.average(v) for v in variables], apply_op 129 | 130 | def minimize_and_clip(optimizer, objective, var_list, clip_val=10): 131 | """Minimized `objective` using `optimizer` w.r.t. variables in 132 | `var_list` while ensure the norm of the gradients for each 133 | variable is clipped to `clip_val` 134 | """ 135 | gradients = optimizer.compute_gradients(objective, var_list=var_list) 136 | for i, (grad, var) in enumerate(gradients): 137 | if grad is not None: 138 | gradients[i] = (tf.clip_by_norm(grad, clip_val), var) 139 | return optimizer.apply_gradients(gradients) 140 | 141 | def initialize_interdependent_variables(session, vars_list, feed_dict): 142 | """Initialize a list of variables one at a time, which is useful if 143 | initialization of some variables depends on initialization of the others. 144 | """ 145 | vars_left = vars_list 146 | while len(vars_left) > 0: 147 | new_vars_left = [] 148 | for v in vars_left: 149 | try: 150 | # If using an older version of TensorFlow, uncomment the line 151 | # below and comment out the line after it. 
152 | #session.run(tf.initialize_variables([v]), feed_dict)
153 | session.run(tf.variables_initializer([v]), feed_dict)
154 | except tf.errors.FailedPreconditionError:
155 | new_vars_left.append(v)
156 | if len(new_vars_left) >= len(vars_left):
157 | # This can happen if the variables all depend on each other, or more likely if there's
158 | # another variable outside of the list that still needs to be initialized. This could be
159 | # detected here, but life's finite.
160 | raise Exception("Cycle in variable dependencies, or external precondition unsatisfied.")
161 | else:
162 | vars_left = new_vars_left
163 | 
164 | def get_wrapper_by_name(env, classname):
165 | currentenv = env
166 | while True:
167 | if classname in currentenv.__class__.__name__:
168 | return currentenv
169 | elif isinstance(currentenv, gym.Wrapper):
170 | currentenv = currentenv.env
171 | else:
172 | raise ValueError("Couldn't find wrapper named %s"%classname)
173 | 
174 | class ReplayBuffer(object):
175 | def __init__(self, size, frame_history_len):
176 | """This is a memory-efficient implementation of the replay buffer.
177 | 
178 | The specific memory optimizations used here are:
179 | - only store each frame once rather than k times
180 | even if every observation normally consists of the k last frames
181 | - store frames as np.uint8 (it is actually most time-efficient
182 | to cast them back to float32 on the GPU, to minimize memory transfer
183 | time)
184 | - store frame_t and frame_(t+1) in the same buffer.
185 | 
186 | For the typical Atari deep RL use case of a buffer with 1M frames, the total
187 | memory footprint of this buffer is 10^6 * 84 * 84 bytes ~= 7 gigabytes
188 | 
189 | Warning! Assumes that returning a frame of zeros at the beginning
190 | of the episode, when there are fewer frames than `frame_history_len`,
191 | is acceptable.
192 | 
193 | Parameters
194 | ----------
195 | size: int
196 | Max number of transitions to store in the buffer. When the buffer
197 | overflows the old memories are dropped.
198 | frame_history_len: int
199 | Number of memories to be retrieved for each observation.
200 | """
201 | self.size = size
202 | self.frame_history_len = frame_history_len
203 | 
204 | self.next_idx = 0
205 | self.num_in_buffer = 0
206 | 
207 | self.obs = None
208 | self.action = None
209 | self.reward = None
210 | self.done = None
211 | 
212 | def can_sample(self, batch_size):
213 | """Returns true if `batch_size` different transitions can be sampled from the buffer."""
214 | return batch_size + 1 <= self.num_in_buffer
215 | 
216 | def _encode_sample(self, idxes):
217 | obs_batch = np.concatenate([self._encode_observation(idx)[None] for idx in idxes], 0)
218 | act_batch = self.action[idxes]
219 | rew_batch = self.reward[idxes]
220 | next_obs_batch = np.concatenate([self._encode_observation(idx + 1)[None] for idx in idxes], 0)
221 | done_mask = np.array([1.0 if self.done[idx] else 0.0 for idx in idxes], dtype=np.float32)
222 | 
223 | return obs_batch, act_batch, rew_batch, next_obs_batch, done_mask
224 | 
225 | 
226 | def sample(self, batch_size):
227 | """Sample `batch_size` different transitions.
228 | 
229 | The i-th sampled transition is the following:
230 | 
231 | when observing `obs_batch[i]`, action `act_batch[i]` was taken,
232 | after which reward `rew_batch[i]` was received and subsequent
233 | observation `next_obs_batch[i]` was observed, unless the episode
234 | was done, which is represented by `done_mask[i]`, equal
235 | to 1 if the episode ended as a result of that action.
236 | 237 | Parameters 238 | ---------- 239 | batch_size: int 240 | How many transitions to sample. 241 | 242 | Returns 243 | ------- 244 | obs_batch: np.array 245 | Array of shape 246 | (batch_size, img_h, img_w, img_c * frame_history_len) 247 | and dtype np.uint8 248 | act_batch: np.array 249 | Array of shape (batch_size,) and dtype np.int32 250 | rew_batch: np.array 251 | Array of shape (batch_size,) and dtype np.float32 252 | next_obs_batch: np.array 253 | Array of shape 254 | (batch_size, img_h, img_w, img_c * frame_history_len) 255 | and dtype np.uint8 256 | done_mask: np.array 257 | Array of shape (batch_size,) and dtype np.float32 258 | """ 259 | assert self.can_sample(batch_size) 260 | idxes = sample_n_unique(lambda: random.randint(0, self.num_in_buffer - 2), batch_size) 261 | return self._encode_sample(idxes) 262 | 263 | def encode_recent_observation(self): 264 | """Return the most recent `frame_history_len` frames. 265 | 266 | Returns 267 | ------- 268 | observation: np.array 269 | Array of shape (img_h, img_w, img_c * frame_history_len) 270 | and dtype np.uint8, where observation[:, :, i*img_c:(i+1)*img_c] 271 | encodes frame at time `t - frame_history_len + i` 272 | """ 273 | assert self.num_in_buffer > 0 274 | return self._encode_observation((self.next_idx - 1) % self.size) 275 | 276 | def _encode_observation(self, idx): 277 | end_idx = idx + 1 # make noninclusive 278 | start_idx = end_idx - self.frame_history_len 279 | # this checks if we are using low-dimensional observations, such as RAM 280 | # state, in which case we just directly return the latest RAM. 281 | if len(self.obs.shape) == 2: 282 | return self.obs[end_idx-1] 283 | # if there weren't enough frames ever in the buffer for context 284 | if start_idx < 0 and self.num_in_buffer != self.size: 285 | start_idx = 0 286 | for idx in range(start_idx, end_idx - 1): 287 | if self.done[idx % self.size]: 288 | start_idx = idx + 1 289 | missing_context = self.frame_history_len - (end_idx - start_idx) 290 | # if zero padding is needed for missing context 291 | # or we are on the boundry of the buffer 292 | if start_idx < 0 or missing_context > 0: 293 | frames = [np.zeros_like(self.obs[0]) for _ in range(missing_context)] 294 | for idx in range(start_idx, end_idx): 295 | frames.append(self.obs[idx % self.size]) 296 | return np.concatenate(frames, 2) 297 | else: 298 | # this optimization has potential to saves about 30% compute time \o/ 299 | img_h, img_w = self.obs.shape[1], self.obs.shape[2] 300 | return self.obs[start_idx:end_idx].transpose(1, 2, 0, 3).reshape(img_h, img_w, -1) 301 | 302 | def store_frame(self, frame): 303 | """Store a single frame in the buffer at the next available index, overwriting 304 | old frames if necessary. 305 | 306 | Parameters 307 | ---------- 308 | frame: np.array 309 | Array of shape (img_h, img_w, img_c) and dtype np.uint8 310 | the frame to be stored 311 | 312 | Returns 313 | ------- 314 | idx: int 315 | Index at which the frame is stored. To be used for `store_effect` later. 
316 | """ 317 | if self.obs is None: 318 | self.obs = np.empty([self.size] + list(frame.shape), dtype=np.uint8) 319 | self.action = np.empty([self.size], dtype=np.int32) 320 | self.reward = np.empty([self.size], dtype=np.float32) 321 | self.done = np.empty([self.size], dtype=np.bool) 322 | self.obs[self.next_idx] = frame 323 | 324 | ret = self.next_idx 325 | self.next_idx = (self.next_idx + 1) % self.size 326 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 327 | 328 | return ret 329 | 330 | def store_effect(self, idx, action, reward, done): 331 | """Store effects of action taken after obeserving frame stored 332 | at index idx. The reason `store_frame` and `store_effect` is broken 333 | up into two functions is so that once can call `encode_recent_observation` 334 | in between. 335 | 336 | Paramters 337 | --------- 338 | idx: int 339 | Index in buffer of recently observed frame (returned by `store_frame`). 340 | action: int 341 | Action that was performed upon observing this frame. 342 | reward: float 343 | Reward that was received when the actions was performed. 344 | done: bool 345 | True if episode was finished after performing that action. 346 | """ 347 | self.action[idx] = action 348 | self.reward[idx] = reward 349 | self.done[idx] = done 350 | 351 | -------------------------------------------------------------------------------- /hw3/dqn.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gym.spaces 3 | import itertools 4 | import numpy as np 5 | import random 6 | import tensorflow as tf 7 | import tensorflow.contrib.layers as layers 8 | from collections import namedtuple 9 | from dqn_utils import * 10 | 11 | OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"]) 12 | 13 | def learn(env, 14 | q_func, 15 | optimizer_spec, 16 | session, 17 | exploration=LinearSchedule(1000000, 0.1), 18 | stopping_criterion=None, 19 | replay_buffer_size=1000000, 20 | batch_size=32, 21 | gamma=0.99, 22 | learning_starts=50000, 23 | learning_freq=4, 24 | frame_history_len=4, 25 | target_update_freq=10000, 26 | grad_norm_clipping=10): 27 | """Run Deep Q-learning algorithm. 28 | 29 | You can specify your own convnet using q_func. 30 | 31 | All schedules are w.r.t. total number of steps taken in the environment. 32 | 33 | Parameters 34 | ---------- 35 | env: gym.Env 36 | gym environment to train on. 37 | q_func: function 38 | Model to use for computing the q function. It should accept the 39 | following named arguments: 40 | img_in: tf.Tensor 41 | tensorflow tensor representing the input image 42 | num_actions: int 43 | number of actions 44 | scope: str 45 | scope in which all the model related variables 46 | should be created 47 | reuse: bool 48 | whether previously created variables should be reused. 49 | optimizer_spec: OptimizerSpec 50 | Specifying the constructor and kwargs, as well as learning rate schedule 51 | for the optimizer 52 | session: tf.Session 53 | tensorflow session to use. 54 | exploration: rl_algs.deepq.utils.schedules.Schedule 55 | schedule for probability of chosing random action. 56 | stopping_criterion: (env, t) -> bool 57 | should return true when it's ok for the RL algorithm to stop. 58 | takes in env and the number of steps executed so far. 59 | replay_buffer_size: int 60 | How many memories to store in the replay buffer. 61 | batch_size: int 62 | How many transitions to sample each time experience is replayed. 
63 | gamma: float 64 | Discount Factor 65 | learning_starts: int 66 | After how many environment steps to start replaying experiences 67 | learning_freq: int 68 | How many steps of environment to take between every experience replay 69 | frame_history_len: int 70 | How many past frames to include as input to the model. 71 | target_update_freq: int 72 | How many experience replay rounds (not steps!) to perform between 73 | each update to the target Q network 74 | grad_norm_clipping: float or None 75 | If not None gradients' norms are clipped to this value. 76 | """ 77 | assert type(env.observation_space) == gym.spaces.Box 78 | assert type(env.action_space) == gym.spaces.Discrete 79 | 80 | ############### 81 | # BUILD MODEL # 82 | ############### 83 | 84 | if len(env.observation_space.shape) == 1: 85 | # This means we are running on low-dimensional observations (e.g. RAM) 86 | input_shape = env.observation_space.shape 87 | else: 88 | img_h, img_w, img_c = env.observation_space.shape 89 | input_shape = (img_h, img_w, frame_history_len * img_c) 90 | num_actions = env.action_space.n 91 | 92 | # set up placeholders 93 | # placeholder for current observation (or state) 94 | obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 95 | # placeholder for current action 96 | act_t_ph = tf.placeholder(tf.int32, [None]) 97 | # placeholder for current reward 98 | rew_t_ph = tf.placeholder(tf.float32, [None]) 99 | # placeholder for next observation (or state) 100 | obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) 101 | # placeholder for end of episode mask 102 | # this value is 1 if the next state corresponds to the end of an episode, 103 | # in which case there is no Q-value at the next state; at the end of an 104 | # episode, only the current state reward contributes to the target, not the 105 | # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) 106 | done_mask_ph = tf.placeholder(tf.float32, [None]) 107 | 108 | # casting to float on GPU ensures lower data transfer times. 109 | obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 110 | obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 111 | 112 | # Here, you should fill in your own code to compute the Bellman error. This requires 113 | # evaluating the current and next Q-values and constructing the corresponding error. 114 | # TensorFlow will differentiate this error for you, you just need to pass it to the 115 | # optimizer. See assignment text for details. 116 | # Your code should produce one scalar-valued tensor: total_error 117 | # This will be passed to the optimizer in the provided code below. 118 | # Your code should also produce two collections of variables: 119 | # q_func_vars 120 | # target_q_func_vars 121 | # These should hold all of the variables of the Q-function network and target network, 122 | # respectively. A convenient way to get these is to make use of TF's "scope" feature. 
123 | # For example, you can create your Q-function network with the scope "q_func" like this: 124 | # = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) 125 | # And then you can obtain the variables like this: 126 | # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 127 | # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" 128 | ###### 129 | 130 | q_t = q_func(obs_t_float, num_actions, scope = "q_func", reuse = False) 131 | q_t1 = q_func(obs_tp1_float, num_actions, scope = "target_q_func", reuse = False) 132 | best_action = tf.argmax(q_t, axis = 1) 133 | max_q = tf.reduce_max(q_t1, axis = 1) 134 | y = rew_t_ph + gamma * tf.multiply((1.0 - done_mask_ph), max_q) 135 | q_t_taken = tf.reduce_sum(tf.multiply(q_t, tf.one_hot(act_t_ph, num_actions)), axis = 1) 136 | 137 | total_error = tf.losses.mean_squared_error(y, q_t_taken) 138 | 139 | q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') 140 | target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func') 141 | 142 | ###### 143 | 144 | # construct optimization op (with gradient clipping) 145 | learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") 146 | optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) 147 | train_fn = minimize_and_clip(optimizer, total_error, 148 | var_list=q_func_vars, clip_val=grad_norm_clipping) 149 | 150 | # update_target_fn will be called periodically to copy Q network to target Q network 151 | update_target_fn = [] 152 | for var, var_target in zip(sorted(q_func_vars, key=lambda v: v.name), 153 | sorted(target_q_func_vars, key=lambda v: v.name)): 154 | update_target_fn.append(var_target.assign(var)) 155 | update_target_fn = tf.group(*update_target_fn) 156 | 157 | # construct the replay buffer 158 | replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) 159 | 160 | ############### 161 | # RUN ENV # 162 | ############### 163 | model_initialized = False 164 | num_param_updates = 0 165 | mean_episode_reward = -float('nan') 166 | best_mean_episode_reward = -float('inf') 167 | last_obs = env.reset() 168 | LOG_EVERY_N_STEPS = 1000 #10000 169 | 170 | for t in itertools.count(): 171 | ### 1. Check stopping criterion 172 | if stopping_criterion is not None and stopping_criterion(env, t): 173 | break 174 | 175 | ### 2. Step the env and store the transition 176 | # At this point, "last_obs" contains the latest observation that was 177 | # recorded from the simulator. Here, your code needs to store this 178 | # observation and its outcome (reward, next observation, etc.) into 179 | # the replay buffer while stepping the simulator forward one step. 180 | # At the end of this block of code, the simulator should have been 181 | # advanced one step, and the replay buffer should contain one more 182 | # transition. 183 | # Specifically, last_obs must point to the new latest observation. 184 | # Useful functions you'll need to call: 185 | # obs, reward, done, info = env.step(action) 186 | # this steps the environment forward one step 187 | # obs = env.reset() 188 | # this resets the environment if you reached an episode boundary. 189 | # Don't forget to call env.reset() to get a new observation if done 190 | # is true!! 191 | # Note that you cannot use "last_obs" directly as input 192 | # into your network, since it needs to be processed to include context 193 | # from previous frames. 
You should check out the replay buffer 194 | # implementation in dqn_utils.py to see what functionality the replay 195 | # buffer exposes. The replay buffer has a function called 196 | # encode_recent_observation that will take the latest observation 197 | # that you pushed into the buffer and compute the corresponding 198 | # input that should be given to a Q network by appending some 199 | # previous frames. 200 | # Don't forget to include epsilon greedy exploration! 201 | # And remember that the first time you enter this loop, the model 202 | # may not yet have been initialized (but of course, the first step 203 | # might as well be random, since you haven't trained your net...) 204 | 205 | ##### 206 | 207 | idx = replay_buffer.store_frame(last_obs) 208 | 209 | if not model_initialized: 210 | action = random.randint(0, num_actions - 1) 211 | else: 212 | obs = replay_buffer.encode_recent_observation() 213 | action = session.run(best_action, feed_dict = {obs_t_ph: [obs]}) 214 | if random.random() < exploration.value(t) * num_actions / (num_actions - 1): 215 | action = random.randint(0, num_actions - 1) 216 | 217 | next_obs, reward, done, _ = env.step(action) 218 | replay_buffer.store_effect(idx, action, reward, done) 219 | if done: 220 | last_obs = env.reset() 221 | else: 222 | last_obs = next_obs 223 | 224 | ##### 225 | 226 | # at this point, the environment should have been advanced one step (and 227 | # reset if done was true), and last_obs should point to the new latest 228 | # observation 229 | 230 | ### 3. Perform experience replay and train the network. 231 | # note that this is only done if the replay buffer contains enough samples 232 | # for us to learn something useful -- until then, the model will not be 233 | # initialized and random actions should be taken 234 | if (t > learning_starts and 235 | t % learning_freq == 0 and 236 | replay_buffer.can_sample(batch_size)): 237 | # Here, you should perform training. Training consists of four steps: 238 | # 3.a: use the replay buffer to sample a batch of transitions (see the 239 | # replay buffer code for function definition, each batch that you sample 240 | # should consist of current observations, current actions, rewards, 241 | # next observations, and done indicator). 242 | # 3.b: initialize the model if it has not been initialized yet; to do 243 | # that, call 244 | # initialize_interdependent_variables(session, tf.global_variables(), { 245 | # obs_t_ph: obs_t_batch, 246 | # obs_tp1_ph: obs_tp1_batch, 247 | # }) 248 | # where obs_t_batch and obs_tp1_batch are the batches of observations at 249 | # the current and next time step. The boolean variable model_initialized 250 | # indicates whether or not the model has been initialized. 251 | # Remember that you have to update the target network too (see 3.d)! 252 | # 3.c: train the model. To do this, you'll need to use the train_fn and 253 | # total_error ops that were created earlier: total_error is what you 254 | # created to compute the total Bellman error in a batch, and train_fn 255 | # will actually perform a gradient step and update the network parameters 256 | # to reduce total_error. 
When calling session.run on these you'll need to 257 | # populate the following placeholders: 258 | # obs_t_ph 259 | # act_t_ph 260 | # rew_t_ph 261 | # obs_tp1_ph 262 | # done_mask_ph 263 | # (this is needed for computing total_error) 264 | # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) 265 | # (this is needed by the optimizer to choose the learning rate) 266 | # 3.d: periodically update the target network by calling 267 | # session.run(update_target_fn) 268 | # you should update every target_update_freq steps, and you may find the 269 | # variable num_param_updates useful for this (it was initialized to 0) 270 | ##### 271 | obs_batch, act_batch, rew_batch, next_obs_batch, done_batch = replay_buffer.sample(batch_size) 272 | 273 | if not model_initialized: 274 | initialize_interdependent_variables(session, tf.global_variables(), { 275 | obs_t_ph: obs_batch, 276 | obs_tp1_ph: next_obs_batch, 277 | }) 278 | model_initialized = True 279 | 280 | session.run(train_fn, { 281 | obs_t_ph: obs_batch, 282 | act_t_ph: act_batch, 283 | rew_t_ph: rew_batch, 284 | obs_tp1_ph: next_obs_batch, 285 | done_mask_ph: done_batch, 286 | learning_rate: optimizer_spec.lr_schedule.value(t) 287 | }) 288 | 289 | num_param_updates += 1 290 | if num_param_updates % target_update_freq == 0: 291 | session.run(update_target_fn) 292 | 293 | ##### 294 | 295 | ### 4. Log progress 296 | episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards() 297 | if len(episode_rewards) > 0: 298 | mean_episode_reward = np.mean(episode_rewards[-100:]) 299 | if len(episode_rewards) > 100: 300 | best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) 301 | if t % LOG_EVERY_N_STEPS == 0 and model_initialized: 302 | print("Timestep %d" % (t,)) 303 | print("mean reward (100 episodes) %f" % mean_episode_reward) 304 | print("best mean reward %f" % best_mean_episode_reward) 305 | print("episodes %d" % len(episode_rewards)) 306 | print("exploration %f" % exploration.value(t)) 307 | print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) 308 | sys.stdout.flush() 309 | -------------------------------------------------------------------------------- /hw1/tf_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf # pylint: ignore-module 3 | #import builtins 4 | import functools 5 | import copy 6 | import os 7 | import collections 8 | 9 | # ================================================================ 10 | # Import all names into common namespace 11 | # ================================================================ 12 | 13 | clip = tf.clip_by_value 14 | 15 | # Make consistent with numpy 16 | # ---------------------------------------- 17 | 18 | def sum(x, axis=None, keepdims=False): 19 | return tf.reduce_sum(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 20 | def mean(x, axis=None, keepdims=False): 21 | return tf.reduce_mean(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 22 | def var(x, axis=None, keepdims=False): 23 | meanx = mean(x, axis=axis, keepdims=keepdims) 24 | return mean(tf.square(x - meanx), axis=axis, keepdims=keepdims) 25 | def std(x, axis=None, keepdims=False): 26 | return tf.sqrt(var(x, axis=axis, keepdims=keepdims)) 27 | def max(x, axis=None, keepdims=False): 28 | return tf.reduce_max(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 29 | def min(x, axis=None, keepdims=False): 30 | return 
tf.reduce_min(x, reduction_indices=None if axis is None else [axis], keep_dims = keepdims) 31 | def concatenate(arrs, axis=0): 32 | return tf.concat(axis, arrs) 33 | def argmax(x, axis=None): 34 | return tf.argmax(x, dimension=axis) 35 | 36 | def switch(condition, then_expression, else_expression): 37 | '''Switches between two operations depending on a scalar value (int or bool). 38 | Note that both `then_expression` and `else_expression` 39 | should be symbolic tensors of the *same shape*. 40 | 41 | # Arguments 42 | condition: scalar tensor. 43 | then_expression: TensorFlow operation. 44 | else_expression: TensorFlow operation. 45 | ''' 46 | x_shape = copy.copy(then_expression.get_shape()) 47 | x = tf.cond(tf.cast(condition, 'bool'), 48 | lambda: then_expression, 49 | lambda: else_expression) 50 | x.set_shape(x_shape) 51 | return x 52 | 53 | # Extras 54 | # ---------------------------------------- 55 | def l2loss(params): 56 | if len(params) == 0: 57 | return tf.constant(0.0) 58 | else: 59 | return tf.add_n([sum(tf.square(p)) for p in params]) 60 | def lrelu(x, leak=0.2): 61 | f1 = 0.5 * (1 + leak) 62 | f2 = 0.5 * (1 - leak) 63 | return f1 * x + f2 * abs(x) 64 | def categorical_sample_logits(X): 65 | # https://github.com/tensorflow/tensorflow/issues/456 66 | U = tf.random_uniform(tf.shape(X)) 67 | return argmax(X - tf.log(-tf.log(U)), axis=1) 68 | 69 | # ================================================================ 70 | # Global session 71 | # ================================================================ 72 | 73 | def get_session(): 74 | return tf.get_default_session() 75 | 76 | def single_threaded_session(): 77 | tf_config = tf.ConfigProto( 78 | inter_op_parallelism_threads=1, 79 | intra_op_parallelism_threads=1) 80 | return tf.Session(config=tf_config) 81 | 82 | def make_session(num_cpu): 83 | tf_config = tf.ConfigProto( 84 | inter_op_parallelism_threads=num_cpu, 85 | intra_op_parallelism_threads=num_cpu) 86 | return tf.Session(config=tf_config) 87 | 88 | 89 | ALREADY_INITIALIZED = set() 90 | def initialize(): 91 | new_variables = set(tf.all_variables()) - ALREADY_INITIALIZED 92 | get_session().run(tf.initialize_variables(new_variables)) 93 | ALREADY_INITIALIZED.update(new_variables) 94 | 95 | 96 | def eval(expr, feed_dict=None): 97 | if feed_dict is None: feed_dict = {} 98 | return get_session().run(expr, feed_dict=feed_dict) 99 | 100 | def set_value(v, val): 101 | get_session().run(v.assign(val)) 102 | 103 | def load_state(fname): 104 | saver = tf.train.Saver() 105 | saver.restore(get_session(), fname) 106 | 107 | def save_state(fname): 108 | os.makedirs(os.path.dirname(fname), exist_ok=True) 109 | saver = tf.train.Saver() 110 | saver.save(get_session(), fname) 111 | 112 | # ================================================================ 113 | # Model components 114 | # ================================================================ 115 | 116 | 117 | def normc_initializer(std=1.0): 118 | def _initializer(shape, dtype=None, partition_info=None): #pylint: disable=W0613 119 | out = np.random.randn(*shape).astype(np.float32) 120 | out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True)) 121 | return tf.constant(out) 122 | return _initializer 123 | 124 | 125 | def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME", dtype=tf.float32, collections=None, 126 | summary_tag=None): 127 | with tf.variable_scope(name): 128 | stride_shape = [1, stride[0], stride[1], 1] 129 | filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]), num_filters] 130 
| 131 | # there are "num input feature maps * filter height * filter width" 132 | # inputs to each hidden unit 133 | fan_in = intprod(filter_shape[:3]) 134 | # each unit in the lower layer receives a gradient from: 135 | # "num output feature maps * filter height * filter width" / 136 | # pooling size 137 | fan_out = intprod(filter_shape[:2]) * num_filters 138 | # initialize weights with random weights 139 | w_bound = np.sqrt(6. / (fan_in + fan_out)) 140 | 141 | w = tf.get_variable("W", filter_shape, dtype, tf.random_uniform_initializer(-w_bound, w_bound), 142 | collections=collections) 143 | b = tf.get_variable("b", [1, 1, 1, num_filters], initializer=tf.zeros_initializer, 144 | collections=collections) 145 | 146 | if summary_tag is not None: 147 | tf.image_summary(summary_tag, 148 | tf.transpose(tf.reshape(w, [filter_size[0], filter_size[1], -1, 1]), 149 | [2, 0, 1, 3]), 150 | max_images=10) 151 | 152 | return tf.nn.conv2d(x, w, stride_shape, pad) + b 153 | 154 | 155 | def dense(x, size, name, weight_init=None, bias=True): 156 | w = tf.get_variable(name + "/w", [x.get_shape()[1], size], initializer=weight_init) 157 | ret = tf.matmul(x, w) 158 | if bias: 159 | b = tf.get_variable(name + "/b", [size], initializer=tf.zeros_initializer) 160 | return ret + b 161 | else: 162 | return ret 163 | 164 | def wndense(x, size, name, init_scale=1.0): 165 | v = tf.get_variable(name + "/V", [int(x.get_shape()[1]), size], 166 | initializer=tf.random_normal_initializer(0, 0.05)) 167 | g = tf.get_variable(name + "/g", [size], initializer=tf.constant_initializer(init_scale)) 168 | b = tf.get_variable(name + "/b", [size], initializer=tf.constant_initializer(0.0)) 169 | 170 | # use weight normalization (Salimans & Kingma, 2016) 171 | x = tf.matmul(x, v) 172 | scaler = g / tf.sqrt(sum(tf.square(v), axis=0, keepdims=True)) 173 | return tf.reshape(scaler, [1, size]) * x + tf.reshape(b, [1, size]) 174 | 175 | def densenobias(x, size, name, weight_init=None): 176 | return dense(x, size, name, weight_init=weight_init, bias=False) 177 | 178 | def dropout(x, pkeep, phase=None, mask=None): 179 | mask = tf.floor(pkeep + tf.random_uniform(tf.shape(x))) if mask is None else mask 180 | if phase is None: 181 | return mask * x 182 | else: 183 | return switch(phase, mask*x, pkeep*x) 184 | 185 | def batchnorm(x, name, phase, updates, gamma=0.96): 186 | k = x.get_shape()[1] 187 | runningmean = tf.get_variable(name+"/mean", shape=[1, k], initializer=tf.constant_initializer(0.0), trainable=False) 188 | runningvar = tf.get_variable(name+"/var", shape=[1, k], initializer=tf.constant_initializer(1e-4), trainable=False) 189 | testy = (x - runningmean) / tf.sqrt(runningvar) 190 | 191 | mean_ = mean(x, axis=0, keepdims=True) 192 | var_ = mean(tf.square(x), axis=0, keepdims=True) 193 | std = tf.sqrt(var_) 194 | trainy = (x - mean_) / std 195 | 196 | updates.extend([ 197 | tf.assign(runningmean, runningmean * gamma + mean_ * (1 - gamma)), 198 | tf.assign(runningvar, runningvar * gamma + var_ * (1 - gamma)) 199 | ]) 200 | 201 | y = switch(phase, trainy, testy) 202 | 203 | out = y * tf.get_variable(name+"/scaling", shape=[1, k], initializer=tf.constant_initializer(1.0), trainable=True)\ 204 | + tf.get_variable(name+"/translation", shape=[1,k], initializer=tf.constant_initializer(0.0), trainable=True) 205 | return out 206 | 207 | 208 | 209 | # ================================================================ 210 | # Basic Stuff 211 | # ================================================================ 212 | 213 | def function(inputs, outputs, 
updates=None, givens=None): 214 | if isinstance(outputs, list): 215 | return _Function(inputs, outputs, updates, givens=givens) 216 | elif isinstance(outputs, (dict, collections.OrderedDict)): 217 | f = _Function(inputs, outputs.values(), updates, givens=givens) 218 | return lambda *inputs : type(outputs)(zip(outputs.keys(), f(*inputs))) 219 | else: 220 | f = _Function(inputs, [outputs], updates, givens=givens) 221 | return lambda *inputs : f(*inputs)[0] 222 | 223 | class _Function(object): 224 | def __init__(self, inputs, outputs, updates, givens, check_nan=False): 225 | assert all(len(i.op.inputs)==0 for i in inputs), "inputs should all be placeholders" 226 | self.inputs = inputs 227 | updates = updates or [] 228 | self.update_group = tf.group(*updates) 229 | self.outputs_update = list(outputs) + [self.update_group] 230 | self.givens = {} if givens is None else givens 231 | self.check_nan = check_nan 232 | def __call__(self, *inputvals): 233 | assert len(inputvals) == len(self.inputs) 234 | feed_dict = dict(zip(self.inputs, inputvals)) 235 | feed_dict.update(self.givens) 236 | results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1] 237 | if self.check_nan: 238 | if any(np.isnan(r).any() for r in results): 239 | raise RuntimeError("Nan detected") 240 | return results 241 | 242 | def mem_friendly_function(nondata_inputs, data_inputs, outputs, batch_size): 243 | if isinstance(outputs, list): 244 | return _MemFriendlyFunction(nondata_inputs, data_inputs, outputs, batch_size) 245 | else: 246 | f = _MemFriendlyFunction(nondata_inputs, data_inputs, [outputs], batch_size) 247 | return lambda *inputs : f(*inputs)[0] 248 | 249 | class _MemFriendlyFunction(object): 250 | def __init__(self, nondata_inputs, data_inputs, outputs, batch_size): 251 | self.nondata_inputs = nondata_inputs 252 | self.data_inputs = data_inputs 253 | self.outputs = list(outputs) 254 | self.batch_size = batch_size 255 | def __call__(self, *inputvals): 256 | assert len(inputvals) == len(self.nondata_inputs) + len(self.data_inputs) 257 | nondata_vals = inputvals[0:len(self.nondata_inputs)] 258 | data_vals = inputvals[len(self.nondata_inputs):] 259 | feed_dict = dict(zip(self.nondata_inputs, nondata_vals)) 260 | n = data_vals[0].shape[0] 261 | for v in data_vals[1:]: 262 | assert v.shape[0] == n 263 | for i_start in range(0, n, self.batch_size): 264 | slice_vals = [v[i_start:min(i_start+self.batch_size, n)] for v in data_vals] 265 | for (var,val) in zip(self.data_inputs, slice_vals): 266 | feed_dict[var]=val 267 | results = tf.get_default_session().run(self.outputs, feed_dict=feed_dict) 268 | if i_start==0: 269 | sum_results = results 270 | else: 271 | for i in range(len(results)): 272 | sum_results[i] = sum_results[i] + results[i] 273 | for i in range(len(results)): 274 | sum_results[i] = sum_results[i] / n 275 | return sum_results 276 | 277 | # ================================================================ 278 | # Modules 279 | # ================================================================ 280 | 281 | class Module(object): 282 | def __init__(self, name): 283 | self.name = name 284 | self.first_time = True 285 | self.scope = None 286 | self.cache = {} 287 | def __call__(self, *args): 288 | if args in self.cache: 289 | print("(%s) retrieving value from cache"%self.name) 290 | return self.cache[args] 291 | with tf.variable_scope(self.name, reuse=not self.first_time): 292 | scope = tf.get_variable_scope().name 293 | if self.first_time: 294 | self.scope = scope 295 | print("(%s) running function for the 
first time"%self.name) 296 | else: 297 | assert self.scope == scope, "Tried calling function with a different scope" 298 | print("(%s) running function on new inputs"%self.name) 299 | self.first_time = False 300 | out = self._call(*args) 301 | self.cache[args] = out 302 | return out 303 | def _call(self, *args): 304 | raise NotImplementedError 305 | 306 | @property 307 | def trainable_variables(self): 308 | assert self.scope is not None, "need to call module once before getting variables" 309 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 310 | 311 | @property 312 | def variables(self): 313 | assert self.scope is not None, "need to call module once before getting variables" 314 | return tf.get_collection(tf.GraphKeys.VARIABLES, self.scope) 315 | 316 | 317 | def module(name): 318 | @functools.wraps 319 | def wrapper(f): 320 | class WrapperModule(Module): 321 | def _call(self, *args): 322 | return f(*args) 323 | return WrapperModule(name) 324 | return wrapper 325 | 326 | # ================================================================ 327 | # Graph traversal 328 | # ================================================================ 329 | 330 | VARIABLES = {} 331 | 332 | 333 | def get_parents(node): 334 | return node.op.inputs 335 | 336 | def topsorted(outputs): 337 | """ 338 | Topological sort via non-recursive depth-first search 339 | """ 340 | assert isinstance(outputs, (list,tuple)) 341 | marks = {} 342 | out = [] 343 | stack = [] #pylint: disable=W0621 344 | # i: node 345 | # jidx = number of children visited so far from that node 346 | # marks: state of each node, which is one of 347 | # 0: haven't visited 348 | # 1: have visited, but not done visiting children 349 | # 2: done visiting children 350 | for x in outputs: 351 | stack.append((x,0)) 352 | while stack: 353 | (i,jidx) = stack.pop() 354 | if jidx == 0: 355 | m = marks.get(i,0) 356 | if m == 0: 357 | marks[i] = 1 358 | elif m == 1: 359 | raise ValueError("not a dag") 360 | else: 361 | continue 362 | ps = get_parents(i) 363 | if jidx == len(ps): 364 | marks[i] = 2 365 | out.append(i) 366 | else: 367 | stack.append((i,jidx+1)) 368 | j = ps[jidx] 369 | stack.append((j,0)) 370 | return out 371 | 372 | 373 | # ================================================================ 374 | # Flat vectors 375 | # ================================================================ 376 | 377 | def var_shape(x): 378 | out = [k.value for k in x.get_shape()] 379 | assert all(isinstance(a, int) for a in out), \ 380 | "shape function assumes that shape is fully known" 381 | return out 382 | 383 | def numel(x): 384 | return intprod(var_shape(x)) 385 | 386 | def intprod(x): 387 | return int(np.prod(x)) 388 | 389 | def flatgrad(loss, var_list): 390 | grads = tf.gradients(loss, var_list) 391 | return tf.concat(0, [tf.reshape(grad, [numel(v)]) 392 | for (v, grad) in zip(var_list, grads)]) 393 | 394 | class SetFromFlat(object): 395 | def __init__(self, var_list, dtype=tf.float32): 396 | assigns = [] 397 | shapes = list(map(var_shape, var_list)) 398 | total_size = np.sum([intprod(shape) for shape in shapes]) 399 | 400 | self.theta = theta = tf.placeholder(dtype,[total_size]) 401 | start=0 402 | assigns = [] 403 | for (shape,v) in zip(shapes,var_list): 404 | size = intprod(shape) 405 | assigns.append(tf.assign(v, tf.reshape(theta[start:start+size],shape))) 406 | start+=size 407 | self.op = tf.group(*assigns) 408 | def __call__(self, theta): 409 | get_session().run(self.op, feed_dict={self.theta:theta}) 410 | 411 | class GetFlat(object): 
412 | def __init__(self, var_list): 413 | self.op = tf.concat(0, [tf.reshape(v, [numel(v)]) for v in var_list]) 414 | def __call__(self): 415 | return get_session().run(self.op) 416 | 417 | # ================================================================ 418 | # Misc 419 | # ================================================================ 420 | 421 | 422 | def fancy_slice_2d(X, inds0, inds1): 423 | """ 424 | like numpy X[inds0, inds1] 425 | XXX this implementation is bad 426 | """ 427 | inds0 = tf.cast(inds0, tf.int64) 428 | inds1 = tf.cast(inds1, tf.int64) 429 | shape = tf.cast(tf.shape(X), tf.int64) 430 | ncols = shape[1] 431 | Xflat = tf.reshape(X, [-1]) 432 | return tf.gather(Xflat, inds0 * ncols + inds1) 433 | 434 | 435 | def scope_vars(scope, trainable_only): 436 | """ 437 | Get variables inside a scope 438 | The scope can be specified as a string 439 | """ 440 | return tf.get_collection( 441 | tf.GraphKeys.TRAINABLE_VARIABLES if trainable_only else tf.GraphKeys.VARIABLES, 442 | scope=scope if isinstance(scope, str) else scope.name 443 | ) 444 | 445 | def lengths_to_mask(lengths_b, max_length): 446 | """ 447 | Turns a vector of lengths into a boolean mask 448 | 449 | Args: 450 | lengths_b: an integer vector of lengths 451 | max_length: maximum length to fill the mask 452 | 453 | Returns: 454 | a boolean array of shape (batch_size, max_length) 455 | row[i] consists of True repeated lengths_b[i] times, followed by False 456 | """ 457 | lengths_b = tf.convert_to_tensor(lengths_b) 458 | assert lengths_b.get_shape().ndims == 1 459 | mask_bt = tf.expand_dims(tf.range(max_length), 0) < tf.expand_dims(lengths_b, 1) 460 | return mask_bt 461 | 462 | 463 | def in_session(f): 464 | @functools.wraps(f) 465 | def newfunc(*args, **kwargs): 466 | with tf.Session(): 467 | f(*args, **kwargs) 468 | return newfunc 469 | 470 | 471 | _PLACEHOLDER_CACHE = {} # name -> (placeholder, dtype, shape) 472 | def get_placeholder(name, dtype, shape): 473 | print("calling get_placeholder", name) 474 | if name in _PLACEHOLDER_CACHE: 475 | out, dtype1, shape1 = _PLACEHOLDER_CACHE[name] 476 | assert dtype1==dtype and shape1==shape 477 | return out 478 | else: 479 | out = tf.placeholder(dtype=dtype, shape=shape, name=name) 480 | _PLACEHOLDER_CACHE[name] = (out,dtype,shape) 481 | return out 482 | def get_placeholder_cached(name): 483 | return _PLACEHOLDER_CACHE[name][0] 484 | 485 | def flattenallbut0(x): 486 | return tf.reshape(x, [-1, intprod(x.get_shape().as_list()[1:])]) 487 | 488 | def reset(): 489 | global _PLACEHOLDER_CACHE 490 | global VARIABLES 491 | _PLACEHOLDER_CACHE = {} 492 | VARIABLES = {} 493 | tf.reset_default_graph() 494 | --------------------------------------------------------------------------------
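The docstrings in dqn_utils.py above spell out the intended calling pattern for the replay buffer (`store_frame`, then `encode_recent_observation`, then `store_effect`) and for the exploration schedules, but the repo ships no standalone usage example. The sketch below is illustrative only and is not a file in this repository: the environment id, buffer size, and step count are arbitrary assumptions, the old four-tuple `env.step` Gym API targeted by this codebase is assumed, and a random policy stands in for the Q-network that dqn.py builds.

# Illustrative sketch only (not part of the assignment code): how the ReplayBuffer
# and schedule utilities are meant to be driven from an environment loop.
import gym
from dqn_utils import ReplayBuffer, PiecewiseSchedule

env = gym.make("Pong-v0")            # assumption: any image-observation Atari env works here
buf = ReplayBuffer(size=1000, frame_history_len=4)
# anneal epsilon from 1.0 to 0.1 over the first 100k steps, then hold it at 0.1
exploration = PiecewiseSchedule([(0, 1.0), (100000, 0.1)], outside_value=0.1)

last_obs = env.reset()
for t in range(1000):
    # store the raw frame first, so encode_recent_observation can stack the last
    # frame_history_len frames (zero-padded right after a reset)
    idx = buf.store_frame(last_obs)
    stacked = buf.encode_recent_observation()   # shape (img_h, img_w, img_c * 4); Q-network input

    # a trained agent would act greedily w.r.t. Q(stacked, .) with probability
    # 1 - exploration.value(t); this sketch has no network, so it acts randomly
    action = env.action_space.sample()

    next_obs, reward, done, _ = env.step(action)   # old 4-tuple Gym API used by this code
    buf.store_effect(idx, action, reward, done)    # pairs the outcome with the stored frame idx
    last_obs = env.reset() if done else next_obs

    if buf.can_sample(32):
        obs_b, act_b, rew_b, next_obs_b, done_b = buf.sample(32)
        # these batches feed the Bellman-error and train ops constructed in dqn.py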