├── .gitignore ├── README.md ├── hw1 ├── README.md ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── base_agent.py │ │ └── bc_agent.py │ ├── expert_data │ │ ├── expert_data_Ant-v2.pkl │ │ ├── expert_data_HalfCheetah-v2.pkl │ │ ├── expert_data_Hopper-v2.pkl │ │ ├── expert_data_Humanoid-v2.pkl │ │ └── expert_data_Walker2d-v2.pkl │ ├── infrastructure │ │ ├── __init__.py │ │ ├── colab_utils.py │ │ ├── logger.py │ │ ├── pytorch_util.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ ├── base_policy.py │ │ ├── experts │ │ │ ├── Ant.pkl │ │ │ ├── HalfCheetah.pkl │ │ │ ├── Hopper.pkl │ │ │ ├── Humanoid.pkl │ │ │ └── Walker2d.pkl │ │ └── loaded_gaussian_policy.py │ └── scripts │ │ ├── run_hw1.ipynb │ │ └── run_hw1.py ├── cs285_hw1.pdf ├── installation.md ├── requirements.txt ├── requirements_colab.txt └── setup.py ├── hw2 ├── README.md ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── base_agent.py │ │ └── pg_agent.py │ ├── infrastructure │ │ ├── __init__.py │ │ ├── colab_utils.py │ │ ├── logger.py │ │ ├── pytorch_util.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ └── base_policy.py │ └── scripts │ │ ├── read_results.py │ │ ├── run_hw2.ipynb │ │ └── run_hw2.py ├── cs285_hw2.pdf ├── requirements.txt ├── requirements_colab.txt └── setup.py ├── hw3 ├── README.md ├── cs285 │ ├── agents │ │ ├── ac_agent.py │ │ ├── base_agent.py │ │ └── dqn_agent.py │ ├── critics │ │ ├── __init__.py │ │ ├── base_critic.py │ │ ├── bootstrapped_continuous_critic.py │ │ └── dqn_critic.py │ ├── envs │ │ ├── __init__.py │ │ └── box2d │ │ │ ├── __init__.py │ │ │ └── lunar_lander.py │ ├── infrastructure │ │ ├── atari_wrappers.py │ │ ├── colab_utils.py │ │ ├── dqn_utils.py │ │ ├── logger.py │ │ ├── pytorch_util.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── argmax_policy.py │ │ └── base_policy.py │ └── scripts │ │ ├── read_results.py │ │ ├── run_hw3_actor_critic.ipynb │ │ ├── run_hw3_actor_critic.py │ │ ├── run_hw3_dqn.ipynb │ │ └── run_hw3_dqn.py ├── cs285_hw3.pdf ├── requirements.txt ├── requirements_colab.txt └── setup.py ├── hw4 ├── README.md ├── cs285 │ ├── agents │ │ ├── base_agent.py │ │ └── mb_agent.py │ ├── envs │ │ ├── __init__.py │ │ ├── cheetah │ │ │ ├── __init__.py │ │ │ └── cheetah.py │ │ ├── obstacles │ │ │ ├── __init__.py │ │ │ └── obstacles_env.py │ │ └── reacher │ │ │ ├── __init__.py │ │ │ ├── assets │ │ │ └── sawyer.xml │ │ │ └── reacher_env.py │ ├── infrastructure │ │ ├── colab_utils.py │ │ ├── logger.py │ │ ├── pytorch_util.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── models │ │ ├── base_model.py │ │ └── ff_model.py │ ├── policies │ │ ├── MPC_policy.py │ │ └── base_policy.py │ └── scripts │ │ ├── filter_events.py │ │ ├── read_results.py │ │ ├── run_hw4_mb.ipynb │ │ └── run_hw4_mb.py ├── cs285_hw4.pdf ├── requirements.txt ├── requirements_colab.txt └── setup.py └── hw5 ├── README.md ├── cs285 ├── agents │ ├── ac_agent.py │ ├── base_agent.py │ ├── dqn_agent.py │ └── explore_or_exploit_agent.py ├── critics │ ├── __init__.py │ ├── base_critic.py │ ├── bootstrapped_continuous_critic.py │ ├── cql_critic.py │ └── dqn_critic.py ├── envs │ ├── __init__.py │ ├── ant │ │ ├── __init__.py │ │ └── ant.py │ ├── box2d │ │ ├── __init__.py │ │ └── lunar_lander.py │ ├── cheetah │ │ ├── __init__.py │ │ └── cheetah.py │ ├── obstacles │ │ ├── __init__.py │ │ └── obstacles_env.py │ 
├── pointmass │ │ └── pointmass.py │ └── reacher │ │ ├── __init__.py │ │ ├── assets │ │ └── sawyer.xml │ │ └── reacher_env.py ├── exploration │ ├── __init__.py │ ├── base_exploration_model.py │ └── rnd_model.py ├── infrastructure │ ├── atari_wrappers.py │ ├── colab_utils.py │ ├── dqn_utils.py │ ├── logger.py │ ├── pytorch_util.py │ ├── replay_buffer.py │ ├── rl_trainer.py │ └── utils.py ├── policies │ ├── MLP_policy.py │ ├── argmax_policy.py │ └── base_policy.py └── scripts │ ├── read_results.py │ └── run_hw5_expl.py ├── hw5.pdf ├── requirements.txt ├── requirements_colab.txt ├── run_hw5_expl.ipynb └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | data/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Assignments for [Berkeley CS 285: Deep Reinforcement Learning, Decision Making, and Control](http://rail.eecs.berkeley.edu/deeprlcourse/). 2 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](installation.md) for instructions. 6 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below: 7 | 8 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw1/cs285/scripts/run_hw1.ipynb) 9 | 10 | ## Complete the code 11 | 12 | Fill in sections marked with `TODO`. In particular, see 13 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 14 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 15 | - [infrastructure/replay_buffer.py](cs285/infrastructure/replay_buffer.py) 16 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 17 | - [infrastructure/pytorch_util.py](cs285/infrastructure/pytorch_util.py) 18 | 19 | Look for sections marked with `HW1` to see how the edits you make will be used. 20 | Some other files that you may find relevant: 21 | - [scripts/run_hw1.py](cs285/scripts/run_hw1.py) (if running locally) or [scripts/run_hw1.ipynb](cs285/scripts/run_hw1.ipynb) (if running on Colab) 22 | - [agents/bc_agent.py](cs285/agents/bc_agent.py) 23 | 24 | See the homework PDF for more details. 25 | 26 | ## Run the code 27 | 28 | Tip: While debugging, you probably want to keep the flag `--video_log_freq -1`, which will disable video logging and speed up the experiment. However, feel free to remove it to save videos of your awesome policy! 29 | 30 | If running on Colab, adjust the `#@params` in the `Args` class according to the command line arguments above. 31 | 32 | ### Section 1 (Behavior Cloning) 33 | Command for problem 1: 34 | 35 | ``` 36 | python cs285/scripts/run_hw1.py \ 37 | --expert_policy_file cs285/policies/experts/Ant.pkl \ 38 | --env_name Ant-v2 --exp_name bc_ant --n_iter 1 \ 39 | --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ 40 | --video_log_freq -1 41 | ``` 42 | 43 | Make sure to also try another environment. 44 | See the homework PDF for more details on what else you need to run.
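If you want to sanity-check an expert dataset before training, a few lines of Python are enough. The snippet below is illustrative only (it is not part of the starter code) and assumes the `.pkl` file stores a pickled list of rollout dictionaries with the keys produced by `Path()` in `cs285/infrastructure/utils.py`:

```python
# Hypothetical sanity check for an expert dataset (not part of the assignment code).
# Assumes the .pkl file stores a list of rollout dicts with keys such as
# "observation", "action", and "reward".
import pickle

with open("cs285/expert_data/expert_data_Ant-v2.pkl", "rb") as f:
    paths = pickle.load(f)

print(len(paths), "expert rollouts")
# Print the array shape stored under each key of the first rollout.
print({key: value.shape for key, value in paths[0].items() if hasattr(value, "shape")})
```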
45 | To generate videos of the policy, remove the `--video_log_freq -1` flag. 46 | 47 | ### Section 2 (DAgger) 48 | Command for section 2: 49 | (Note the `--do_dagger` flag, and the higher value for `n_iter`) 50 | 51 | ``` 52 | python cs285/scripts/run_hw1.py \ 53 | --expert_policy_file cs285/policies/experts/Ant.pkl \ 54 | --env_name Ant-v2 --exp_name dagger_ant --n_iter 10 \ 55 | --do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ 56 | --video_log_freq -1 57 | ``` 58 | 59 | Make sure to also try another environment. 60 | See the homework PDF for more details on what else you need to run. 61 | 62 | ## Visualizing the saved tensorboard event file 63 | 64 | You can visualize your runs using tensorboard: 65 | ``` 66 | tensorboard --logdir data 67 | ``` 68 | 69 | You will see scalar summaries as well as videos of your trained policies (in the 'images' tab). 70 | 71 | You can choose to visualize specific runs with a comma-separated list: 72 | ``` 73 | tensorboard --logdir data/run1,data/run2,data/run3... 74 | ``` 75 | 76 | If running on Colab, you will be using the `%tensorboard` [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) to do the same thing; see the [notebook](cs285/scripts/run_hw1.ipynb) for more details. 77 | 78 | -------------------------------------------------------------------------------- /hw1/cs285/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/agents/__init__.py -------------------------------------------------------------------------------- /hw1/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | 2 | class BaseAgent(object): 3 | def __init__(self, **kwargs): 4 | super(BaseAgent, self).__init__(**kwargs) 5 | 6 | def train(self) -> dict: 7 | """Return a dictionary of logging information.""" 8 | raise NotImplementedError 9 | 10 | def add_to_replay_buffer(self, paths): 11 | raise NotImplementedError 12 | 13 | def sample(self, batch_size): 14 | raise NotImplementedError 15 | 16 | def save(self, path): 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /hw1/cs285/agents/bc_agent.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.replay_buffer import ReplayBuffer 2 | from cs285.policies.MLP_policy import MLPPolicySL 3 | from .base_agent import BaseAgent 4 | 5 | 6 | class BCAgent(BaseAgent): 7 | def __init__(self, env, agent_params): 8 | super(BCAgent, self).__init__() 9 | 10 | # init vars 11 | self.env = env 12 | self.agent_params = agent_params 13 | 14 | # actor/policy 15 | self.actor = MLPPolicySL( 16 | self.agent_params['ac_dim'], 17 | self.agent_params['ob_dim'], 18 | self.agent_params['n_layers'], 19 | self.agent_params['size'], 20 | discrete=self.agent_params['discrete'], 21 | learning_rate=self.agent_params['learning_rate'], 22 | ) 23 | 24 | # replay buffer 25 | self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size']) 26 | 27 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 28 | # training a BC agent refers to updating its actor using 29 | # the given observations and corresponding action labels 30 | log = self.actor.update(ob_no, ac_na) # HW1: you will modify this 31 | return log 32 | 33 | def add_to_replay_buffer(self, paths):
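# Paths collected by the RL trainer (the loaded expert rollouts for behavior cloning, or the relabeled on-policy rollouts for DAgger) are appended to the replay buffer here; `sample` below then draws training batches from it.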
34 | self.replay_buffer.add_rollouts(paths) 35 | 36 | def sample(self, batch_size): 37 | return self.replay_buffer.sample_random_data(batch_size) # HW1: you will modify this 38 | 39 | def save(self, path): 40 | return self.actor.save(path) -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Ant-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Ant-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Hopper-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Hopper-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/infrastructure/__init__.py -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 | ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/logger.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | ) -> nn.Module: 28 | """ 29 | Builds a feedforward neural network 30 | 31 | arguments: 32 | n_layers: number of hidden layers 33 | size: dimension of each hidden layer 34 | activation: activation of each hidden layer 35 | 36 | input_size: size of the input layer 37 | output_size: size of the output layer 38 | output_activation: activation of the output layer 39 | 40 | returns: 41 | MLP (nn.Module) 42 | """ 43 | if isinstance(activation, str): 44 | activation = _str_to_activation[activation] 45 | if isinstance(output_activation, str): 46 | output_activation = _str_to_activation[output_activation] 47 | 48 | # TODO: return a MLP. This should be an instance of nn.Module 49 | # Note: nn.Sequential is an instance of nn.Module. 50 | raise NotImplementedError 51 | 52 | 53 | device = None 54 | 55 | 56 | def init_gpu(use_gpu=True, gpu_id=0): 57 | global device 58 | if torch.cuda.is_available() and use_gpu: 59 | device = torch.device("cuda:" + str(gpu_id)) 60 | print("Using GPU id {}".format(gpu_id)) 61 | else: 62 | device = torch.device("cpu") 63 | print("GPU not detected. 
Defaulting to CPU.") 64 | 65 | 66 | def set_device(gpu_id): 67 | torch.cuda.set_device(gpu_id) 68 | 69 | 70 | def from_numpy(*args, **kwargs): 71 | return torch.from_numpy(*args, **kwargs).float().to(device) 72 | 73 | 74 | def to_numpy(tensor): 75 | return tensor.to('cpu').detach().numpy() 76 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | 10 | # store each rollout 11 | self.paths = [] 12 | 13 | # store (concatenated) component arrays from each rollout 14 | self.obs = None 15 | self.acs = None 16 | self.rews = None 17 | self.next_obs = None 18 | self.terminals = None 19 | 20 | def __len__(self): 21 | if self.obs: 22 | return self.obs.shape[0] 23 | else: 24 | return 0 25 | 26 | def add_rollouts(self, paths, concat_rew=True): 27 | 28 | # add new rollouts into our list of rollouts 29 | for path in paths: 30 | self.paths.append(path) 31 | 32 | # convert new rollouts into their component arrays, and append them onto 33 | # our arrays 34 | observations, actions, rewards, next_observations, terminals = ( 35 | convert_listofrollouts(paths, concat_rew)) 36 | 37 | if self.obs is None: 38 | self.obs = observations[-self.max_size:] 39 | self.acs = actions[-self.max_size:] 40 | self.rews = rewards[-self.max_size:] 41 | self.next_obs = next_observations[-self.max_size:] 42 | self.terminals = terminals[-self.max_size:] 43 | else: 44 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 45 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 46 | if concat_rew: 47 | self.rews = np.concatenate( 48 | [self.rews, rewards] 49 | )[-self.max_size:] 50 | else: 51 | if isinstance(rewards, list): 52 | self.rews += rewards 53 | else: 54 | self.rews.append(rewards) 55 | self.rews = self.rews[-self.max_size:] 56 | self.next_obs = np.concatenate( 57 | [self.next_obs, next_observations] 58 | )[-self.max_size:] 59 | self.terminals = np.concatenate( 60 | [self.terminals, terminals] 61 | )[-self.max_size:] 62 | 63 | ######################################## 64 | ######################################## 65 | 66 | def sample_random_data(self, batch_size): 67 | assert ( 68 | self.obs.shape[0] 69 | == self.acs.shape[0] 70 | == self.rews.shape[0] 71 | == self.next_obs.shape[0] 72 | == self.terminals.shape[0] 73 | ) 74 | 75 | ## TODO return batch_size number of random entries from each of the 5 component arrays above 76 | ## HINT 1: use np.random.permutation to sample random indices 77 | ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array) 78 | ## HINT 3: look at the sample_recent_data function below 79 | 80 | return TODO, TODO, TODO, TODO, TODO 81 | 82 | def sample_recent_data(self, batch_size=1): 83 | return ( 84 | self.obs[-batch_size:], 85 | self.acs[-batch_size:], 86 | self.rews[-batch_size:], 87 | self.next_obs[-batch_size:], 88 | self.terminals[-batch_size:], 89 | ) 90 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | ############################################ 5 | ############################################ 6 | 7 | def 
sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 8 | 9 | # initialize env for the beginning of a new rollout 10 | ob = TODO # HINT: should be the output of resetting the env 11 | 12 | # init vars 13 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 14 | steps = 0 15 | while True: 16 | 17 | # render image of the simulated env 18 | if render: 19 | if 'rgb_array' in render_mode: 20 | if hasattr(env, 'sim'): 21 | image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) 22 | else: 23 | image_obs.append(env.render(mode=render_mode)) 24 | if 'human' in render_mode: 25 | env.render(mode=render_mode) 26 | time.sleep(env.model.opt.timestep) 27 | 28 | # use the most recent ob to decide what to do 29 | obs.append(ob) 30 | ac = TODO # HINT: query the policy's get_action function 31 | ac = ac[0] 32 | acs.append(ac) 33 | 34 | # take that action and record results 35 | ob, rew, done, _ = env.step(ac) 36 | 37 | # record result of taking that action 38 | steps += 1 39 | next_obs.append(ob) 40 | rewards.append(rew) 41 | 42 | # TODO end the rollout if the rollout ended 43 | # HINT: rollout can end due to done, or due to max_path_length 44 | rollout_done = TODO # HINT: this is either 0 or 1 45 | terminals.append(rollout_done) 46 | 47 | if rollout_done: 48 | break 49 | 50 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 51 | 52 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 53 | """ 54 | Collect rollouts until we have collected min_timesteps_per_batch steps. 55 | 56 | TODO implement this function 57 | Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths 58 | Hint2: use get_pathlength to count the timesteps collected in each path 59 | """ 60 | timesteps_this_batch = 0 61 | paths = [] 62 | while timesteps_this_batch < min_timesteps_per_batch: 63 | 64 | TODO 65 | 66 | return paths, timesteps_this_batch 67 | 68 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 69 | """ 70 | Collect ntraj rollouts. 71 | 72 | TODO implement this function 73 | Hint1: use sample_trajectory to get each path (i.e. 
rollout) that goes into paths 74 | """ 75 | paths = [] 76 | 77 | TODO 78 | 79 | return paths 80 | 81 | ############################################ 82 | ############################################ 83 | 84 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 85 | """ 86 | Take info (separate arrays) from a single rollout 87 | and return it in a single dictionary 88 | """ 89 | if image_obs != []: 90 | image_obs = np.stack(image_obs, axis=0) 91 | return {"observation" : np.array(obs, dtype=np.float32), 92 | "image_obs" : np.array(image_obs, dtype=np.uint8), 93 | "reward" : np.array(rewards, dtype=np.float32), 94 | "action" : np.array(acs, dtype=np.float32), 95 | "next_observation": np.array(next_obs, dtype=np.float32), 96 | "terminal": np.array(terminals, dtype=np.float32)} 97 | 98 | 99 | def convert_listofrollouts(paths, concat_rew=True): 100 | """ 101 | Take a list of rollout dictionaries 102 | and return separate arrays, 103 | where each array is a concatenation of that array from across the rollouts 104 | """ 105 | observations = np.concatenate([path["observation"] for path in paths]) 106 | actions = np.concatenate([path["action"] for path in paths]) 107 | if concat_rew: 108 | rewards = np.concatenate([path["reward"] for path in paths]) 109 | else: 110 | rewards = [path["reward"] for path in paths] 111 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 112 | terminals = np.concatenate([path["terminal"] for path in paths]) 113 | return observations, actions, rewards, next_observations, terminals 114 | 115 | ############################################ 116 | ############################################ 117 | 118 | def get_pathlength(path): 119 | return len(path["reward"]) -------------------------------------------------------------------------------- /hw1/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import itertools 3 | from typing import Any 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from torch import optim 7 | 8 | import numpy as np 9 | import torch 10 | from torch import distributions 11 | 12 | from cs285.infrastructure import pytorch_util as ptu 13 | from cs285.policies.base_policy import BasePolicy 14 | 15 | 16 | class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): 17 | 18 | def __init__(self, 19 | ac_dim, 20 | ob_dim, 21 | n_layers, 22 | size, 23 | discrete=False, 24 | learning_rate=1e-4, 25 | training=True, 26 | nn_baseline=False, 27 | **kwargs 28 | ): 29 | super().__init__(**kwargs) 30 | 31 | # init vars 32 | self.ac_dim = ac_dim 33 | self.ob_dim = ob_dim 34 | self.n_layers = n_layers 35 | self.discrete = discrete 36 | self.size = size 37 | self.learning_rate = learning_rate 38 | self.training = training 39 | self.nn_baseline = nn_baseline 40 | 41 | if self.discrete: 42 | self.logits_na = ptu.build_mlp( 43 | input_size=self.ob_dim, 44 | output_size=self.ac_dim, 45 | n_layers=self.n_layers, 46 | size=self.size, 47 | ) 48 | self.logits_na.to(ptu.device) 49 | self.mean_net = None 50 | self.logstd = None 51 | self.optimizer = optim.Adam(self.logits_na.parameters(), 52 | self.learning_rate) 53 | else: 54 | self.logits_na = None 55 | self.mean_net = ptu.build_mlp( 56 | input_size=self.ob_dim, 57 | output_size=self.ac_dim, 58 | n_layers=self.n_layers, size=self.size, 59 | ) 60 | self.mean_net.to(ptu.device) 61 | self.logstd = nn.Parameter( 62 | torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) 63 | ) 64 | 
self.logstd.to(ptu.device) 65 | self.optimizer = optim.Adam( 66 | itertools.chain([self.logstd], self.mean_net.parameters()), 67 | self.learning_rate 68 | ) 69 | 70 | ################################## 71 | 72 | def save(self, filepath): 73 | torch.save(self.state_dict(), filepath) 74 | 75 | ################################## 76 | 77 | def get_action(self, obs: np.ndarray) -> np.ndarray: 78 | if len(obs.shape) > 1: 79 | observation = obs 80 | else: 81 | observation = obs[None] 82 | 83 | # TODO return the action that the policy prescribes 84 | raise NotImplementedError 85 | 86 | # update/train this policy 87 | def update(self, observations, actions, **kwargs): 88 | raise NotImplementedError 89 | 90 | # This function defines the forward pass of the network. 91 | # You can return anything you want, but you should be able to differentiate 92 | # through it. For example, you can return a torch.FloatTensor. You can also 93 | # return more flexible objects, such as a 94 | # `torch.distributions.Distribution` object. It's up to you! 95 | def forward(self, observation: torch.FloatTensor) -> Any: 96 | raise NotImplementedError 97 | 98 | 99 | ##################################################### 100 | ##################################################### 101 | 102 | class MLPPolicySL(MLPPolicy): 103 | def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs): 104 | super().__init__(ac_dim, ob_dim, n_layers, size, **kwargs) 105 | self.loss = nn.MSELoss() 106 | 107 | def update( 108 | self, observations, actions, 109 | adv_n=None, acs_labels_na=None, qvals=None 110 | ): 111 | # TODO: update the policy and return the loss 112 | loss = TODO 113 | return { 114 | # You can add extra logging information here, but keep this line 115 | 'Training Loss': ptu.to_numpy(loss), 116 | } 117 | -------------------------------------------------------------------------------- /hw1/cs285/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/__init__.py -------------------------------------------------------------------------------- /hw1/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Ant.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Ant.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/HalfCheetah.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/HalfCheetah.pkl 
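Referring back to the `TODO`s in `cs285/policies/MLP_policy.py` above (`forward`, `get_action`, and `MLPPolicySL.update`): the following is only a hedged sketch of one way those methods could be filled in, relying on the attributes already defined in `__init__` (`logits_na`, `mean_net`, `logstd`, `self.loss`, `self.optimizer`) and on the imports at the top of that file. It is not the official solution, and other designs (for example, returning a plain tensor from `forward` instead of a distribution) are equally valid.

```python
# Illustrative sketch only -- one possible completion of the MLPPolicy TODOs.

def forward(self, observation: torch.FloatTensor) -> Any:
    if self.discrete:
        # Categorical over discrete actions, parameterized by network logits.
        return distributions.Categorical(logits=self.logits_na(observation))
    else:
        # Diagonal Gaussian: state-dependent mean, learned state-independent std.
        mean = self.mean_net(observation)
        return distributions.Normal(mean, torch.exp(self.logstd))

def get_action(self, obs: np.ndarray) -> np.ndarray:
    observation = obs if len(obs.shape) > 1 else obs[None]
    dist = self(ptu.from_numpy(observation.astype(np.float32)))
    return ptu.to_numpy(dist.sample())

# MLPPolicySL.update: behavior cloning regresses predicted actions onto expert actions.
def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    predicted_actions = self.mean_net(observations)  # hw1 tasks are continuous control
    loss = self.loss(predicted_actions, actions)     # nn.MSELoss from __init__
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return {'Training Loss': ptu.to_numpy(loss)}
```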
-------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Hopper.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Hopper.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Humanoid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Humanoid.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Walker2d.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Walker2d.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/loaded_gaussian_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure import pytorch_util as ptu 4 | from .base_policy import BasePolicy 5 | from torch import nn 6 | import torch 7 | import pickle 8 | 9 | 10 | def create_linear_layer(W, b) -> nn.Linear: 11 | out_features, in_features = W.shape 12 | linear_layer = nn.Linear( 13 | in_features, 14 | out_features, 15 | ) 16 | linear_layer.weight.data = ptu.from_numpy(W.T) 17 | linear_layer.bias.data = ptu.from_numpy(b[0]) 18 | return linear_layer 19 | 20 | 21 | def read_layer(l): 22 | assert list(l.keys()) == ['AffineLayer'] 23 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 24 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer'][ 25 | 'b'].astype(np.float32) 26 | 27 | 28 | class LoadedGaussianPolicy(BasePolicy, nn.Module): 29 | def __init__(self, filename, **kwargs): 30 | super().__init__(**kwargs) 31 | 32 | with open(filename, 'rb') as f: 33 | data = pickle.loads(f.read()) 34 | 35 | self.nonlin_type = data['nonlin_type'] 36 | if self.nonlin_type == 'lrelu': 37 | self.non_lin = nn.LeakyReLU(0.01) 38 | elif self.nonlin_type == 'tanh': 39 | self.non_lin = nn.Tanh() 40 | else: 41 | raise NotImplementedError() 42 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 43 | 44 | assert policy_type == 'GaussianPolicy', ( 45 | 'Policy type {} not supported'.format(policy_type) 46 | ) 47 | self.policy_params = data[policy_type] 48 | 49 | assert set(self.policy_params.keys()) == { 50 | 'logstdevs_1_Da', 'hidden', 'obsnorm', 'out' 51 | } 52 | 53 | # Build the policy. First, observation normalization. 
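# The expert checkpoint's Standardizer stores the observation mean and mean-square; the code below recovers the standard deviation as sqrt(max(0, E[x^2] - E[x]^2)) and registers both statistics as module parameters.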
54 | assert list(self.policy_params['obsnorm'].keys()) == ['Standardizer'] 55 | obsnorm_mean = self.policy_params['obsnorm']['Standardizer']['mean_1_D'] 56 | obsnorm_meansq = self.policy_params['obsnorm']['Standardizer'][ 57 | 'meansq_1_D'] 58 | obsnorm_stdev = np.sqrt( 59 | np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 60 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 61 | 62 | self.obs_norm_mean = nn.Parameter(ptu.from_numpy(obsnorm_mean)) 63 | self.obs_norm_std = nn.Parameter(ptu.from_numpy(obsnorm_stdev)) 64 | self.hidden_layers = nn.ModuleList() 65 | 66 | # Hidden layers next 67 | assert list(self.policy_params['hidden'].keys()) == ['FeedforwardNet'] 68 | layer_params = self.policy_params['hidden']['FeedforwardNet'] 69 | for layer_name in sorted(layer_params.keys()): 70 | l = layer_params[layer_name] 71 | W, b = read_layer(l) 72 | linear_layer = create_linear_layer(W, b) 73 | self.hidden_layers.append(linear_layer) 74 | 75 | # Output layer 76 | W, b = read_layer(self.policy_params['out']) 77 | self.output_layer = create_linear_layer(W, b) 78 | 79 | def forward(self, obs): 80 | normed_obs = (obs - self.obs_norm_mean) / (self.obs_norm_std + 1e-6) 81 | h = normed_obs 82 | for layer in self.hidden_layers: 83 | h = layer(h) 84 | h = self.non_lin(h) 85 | return self.output_layer(h) 86 | 87 | ################################## 88 | 89 | def update(self, obs_no, acs_na, adv_n=None, acs_labels_na=None): 90 | raise NotImplementedError(""" 91 | This policy class simply loads in a particular type of policy and 92 | queries it. Do not try to train it. 93 | """) 94 | 95 | def get_action(self, obs): 96 | if len(obs.shape) > 1: 97 | observation = obs 98 | else: 99 | observation = obs[None, :] 100 | observation = ptu.from_numpy(observation.astype(np.float32)) 101 | action = self(observation) 102 | return ptu.to_numpy(action) 103 | 104 | def save(self, filepath): 105 | torch.save(self.state_dict(), filepath) 106 | -------------------------------------------------------------------------------- /hw1/cs285_hw1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285_hw1.pdf -------------------------------------------------------------------------------- /hw1/installation.md: -------------------------------------------------------------------------------- 1 | ## Install mujoco: 2 | ``` 3 | mkdir ~/.mujoco 4 | cd ~/.mujoco 5 | wget https://www.roboti.us/download/mujoco200_linux.zip 6 | unzip mujoco200_linux.zip 7 | mv mujoco200_linux mujoco200 8 | rm mujoco200_linux.zip 9 | cp . 10 | ``` 11 | The above instructions download MuJoCo for Linux. If you are on Mac or Windows, you will need to change the `wget` address to either 12 | `https://www.roboti.us/download/mujoco200_macos.zip` or `https://www.roboti.us/download/mujoco200_win64.zip`. 13 | 14 | Finally, add the following to bottom of your bashrc: 15 | ``` 16 | export LD_LIBRARY_PATH=~/.mujoco/mujoco200/bin/ 17 | ``` 18 | 19 | ## Install other dependencies 20 | 21 | 22 | There are two options: 23 | 24 | A. (Recommended) Install with conda: 25 | 26 | 1. Install conda, if you don't already have it, by following the instructions at [this link](https://docs.conda.io/projects/conda/en/latest/user-guide/install/) 27 | 28 | ``` 29 | 30 | This install will modify the `PATH` variable in your bashrc. 
31 | You need to open a new terminal for that path change to take place (to be able to find 'conda' in the next step). 32 | 33 | 2. Create a conda environment that will contain python 3: 34 | ``` 35 | conda create -n cs285 python=3.6 36 | ``` 37 | 38 | 3. activate the environment (do this every time you open a new terminal and want to run code): 39 | ``` 40 | source activate cs285 41 | ``` 42 | 43 | 4. Install the requirements into this conda environment 44 | ``` 45 | pip install --user -r requirements.txt 46 | ``` 47 | 48 | 5. Allow your code to be able to see 'cs285' 49 | ``` 50 | cd 51 | $ pip install -e . 52 | ``` 53 | 54 | This conda environment requires activating it every time you open a new terminal (in order to run code), but the benefit is that the required dependencies for this codebase will not affect existing/other versions of things on your computer. This stand-alone environment will have everything that is necessary. 55 | 56 | 57 | B. Install on system Python: 58 | ``` 59 | pip install -r requirements.txt 60 | ``` -------------------------------------------------------------------------------- /hw1/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.17.2 2 | mujoco-py==2.0.2.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | torch==1.6.0 10 | opencv-python==4.4.0.42 11 | ipdb==0.13.3 12 | box2d-py 13 | -------------------------------------------------------------------------------- /hw1/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | gym==0.17.2 2 | tensorboard==2.3.0 3 | tensorboardX==1.8 4 | matplotlib==2.2.2 5 | ipython==6.4.0 6 | moviepy==1.0.0 7 | pyvirtualdisplay==1.3.2 8 | torch==1.6.0 9 | opencv-python==4.4.0.42 10 | ipdb==0.13.3 11 | box2d-py 12 | -------------------------------------------------------------------------------- /hw1/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw2/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. If you completed this installation for homework 1, you do not need to repeat it. 6 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below: 7 | 8 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw2/cs285/scripts/run_hw2.ipynb) 9 | 10 | ## Complete the code 11 | 12 | The following files have blanks to be filled with your solutions from homework 1. The relevant sections are marked with "TODO: get this from hw1". 
13 | 14 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 15 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 16 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 17 | 18 | You will then need to complete the following new files for homework 2. The relevant sections are marked with "TODO". 19 | - [agents/pg_agent.py](cs285/agents/pg_agent.py) 20 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 21 | 22 | You will also want to look through [scripts/run_hw2.py](cs285/scripts/run_hw2.py) (if running locally) or [scripts/run_hw2.ipynb](cs285/scripts/run_hw2.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook. 23 | 24 | You will be running your policy gradients implementation in four experiments total, investigating the effects of design decisions like reward-to-go estimators, neural network baselines for variance reduction, and advantage normalization. See the [assignment PDF](cs285_hw2.pdf) for more details. 25 | 26 | ## Plotting your results 27 | 28 | We have provided a snippet that may be used for reading your Tensorboard eventfiles in [scripts/read_results.py](cs285/scripts/read_results.py). Reading these eventfiles and plotting them with [matplotlib](https://matplotlib.org/) or [seaborn](https://seaborn.pydata.org/) will produce the cleanest results for your submission. For debugging purposes, we recommend visualizing the Tensorboard logs using `tensorboard --logdir data`. 29 | -------------------------------------------------------------------------------- /hw2/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_agent import BaseAgent 2 | from .pg_agent import PGAgent 3 | 4 | -------------------------------------------------------------------------------- /hw2/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | class BaseAgent(object): 2 | def __init__(self, **kwargs): 3 | super(BaseAgent, self).__init__(**kwargs) 4 | 5 | def train(self) -> dict: 6 | """Return a dictionary of logging information.""" 7 | raise NotImplementedError 8 | 9 | def add_to_replay_buffer(self, paths): 10 | raise NotImplementedError 11 | 12 | def sample(self, batch_size): 13 | raise NotImplementedError 14 | 15 | def save(self, path): 16 | raise NotImplementedError -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw2/cs285/infrastructure/__init__.py -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 |
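# Display the recorded rollout inline: the base64-encoded mp4 is embedded in an HTML5 video tag and rendered in the notebook output.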
ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | ): 28 | """ 29 | Builds a feedforward neural network 30 | 31 | arguments: 32 | input_placeholder: placeholder variable for the state (batch_size, input_size) 33 | scope: variable scope of the network 34 | 35 | n_layers: number of hidden layers 36 | size: dimension of each hidden layer 37 | activation: activation of each hidden layer 38 | 39 | input_size: size of the input layer 40 | output_size: size of the output layer 41 | output_activation: activation of the output layer 42 | 43 | returns: 44 | output_placeholder: the result of a forward pass through the hidden layers + the output layer 45 | """ 46 | if isinstance(activation, str): 47 | activation = _str_to_activation[activation] 48 | if isinstance(output_activation, str): 49 | output_activation = _str_to_activation[output_activation] 50 | layers = [] 51 | in_size = input_size 52 | for _ in range(n_layers): 53 | layers.append(nn.Linear(in_size, size)) 54 | layers.append(activation) 55 | in_size = size 56 | layers.append(nn.Linear(in_size, output_size)) 57 | layers.append(output_activation) 58 | return nn.Sequential(*layers) 59 | 60 | 61 | device = None 62 | 63 | 64 | def init_gpu(use_gpu=True, gpu_id=0): 65 | global device 66 | if torch.cuda.is_available() and use_gpu: 67 | device = torch.device("cuda:" + str(gpu_id)) 68 | print("Using GPU id {}".format(gpu_id)) 69 | else: 70 | device = torch.device("cpu") 71 | print("GPU not detected. 
Defaulting to CPU.") 72 | 73 | 74 | def set_device(gpu_id): 75 | torch.cuda.set_device(gpu_id) 76 | 77 | 78 | def from_numpy(*args, **kwargs): 79 | return torch.from_numpy(*args, **kwargs).float().to(device) 80 | 81 | 82 | def to_numpy(tensor): 83 | return tensor.to('cpu').detach().numpy() 84 | -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | self.paths = [] 10 | self.obs = None 11 | self.acs = None 12 | self.concatenated_rews = None 13 | self.unconcatenated_rews = None 14 | self.next_obs = None 15 | self.terminals = None 16 | 17 | def add_rollouts(self, paths, noised=False): 18 | 19 | # add new rollouts into our list of rollouts 20 | for path in paths: 21 | self.paths.append(path) 22 | 23 | # convert new rollouts into their component arrays, and append them onto our arrays 24 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 25 | 26 | if noised: 27 | observations = add_noise(observations) 28 | next_observations = add_noise(next_observations) 29 | 30 | if self.obs is None: 31 | self.obs = observations[-self.max_size:] 32 | self.acs = actions[-self.max_size:] 33 | self.next_obs = next_observations[-self.max_size:] 34 | self.terminals = terminals[-self.max_size:] 35 | self.concatenated_rews = concatenated_rews[-self.max_size:] 36 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 37 | else: 38 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 39 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 40 | self.next_obs = np.concatenate( 41 | [self.next_obs, next_observations] 42 | )[-self.max_size:] 43 | self.terminals = np.concatenate( 44 | [self.terminals, terminals] 45 | )[-self.max_size:] 46 | self.concatenated_rews = np.concatenate( 47 | [self.concatenated_rews, concatenated_rews] 48 | )[-self.max_size:] 49 | if isinstance(unconcatenated_rews, list): 50 | self.unconcatenated_rews += unconcatenated_rews # TODO keep only latest max_size around 51 | else: 52 | self.unconcatenated_rews.append(unconcatenated_rews) # TODO keep only latest max_size around 53 | 54 | ######################################## 55 | ######################################## 56 | 57 | def sample_random_rollouts(self, num_rollouts): 58 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 59 | return self.paths[rand_indices] 60 | 61 | def sample_recent_rollouts(self, num_rollouts=1): 62 | return self.paths[-num_rollouts:] 63 | 64 | ######################################## 65 | ######################################## 66 | 67 | def sample_random_data(self, batch_size): 68 | 69 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 70 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 71 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 72 | 73 | def sample_recent_data(self, batch_size=1, concat_rew=True): 74 | 75 | if concat_rew: 76 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], 
self.terminals[-batch_size:] 77 | else: 78 | num_recent_rollouts_to_return = 0 79 | num_datapoints_so_far = 0 80 | index = -1 81 | while num_datapoints_so_far < batch_size: 82 | recent_rollout = self.paths[index] 83 | index -=1 84 | num_recent_rollouts_to_return +=1 85 | num_datapoints_so_far += get_pathlength(recent_rollout) 86 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 87 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 88 | return observations, actions, unconcatenated_rews, next_observations, terminals -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import copy 4 | 5 | ############################################ 6 | ############################################ 7 | 8 | def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): 9 | 10 | model = models[0] 11 | 12 | # true 13 | true_states = perform_actions(env, action_sequence)['observation'] 14 | 15 | # predicted 16 | ob = np.expand_dims(true_states[0],0) 17 | pred_states = [] 18 | for ac in action_sequence: 19 | pred_states.append(ob) 20 | action = np.expand_dims(ac,0) 21 | ob = model.get_prediction(ob, action, data_statistics) 22 | pred_states = np.squeeze(pred_states) 23 | 24 | # mpe 25 | mpe = mean_squared_error(pred_states, true_states) 26 | 27 | return mpe, true_states, pred_states 28 | 29 | def perform_actions(env, actions): 30 | ob = env.reset() 31 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 32 | steps = 0 33 | for ac in actions: 34 | obs.append(ob) 35 | acs.append(ac) 36 | ob, rew, done, _ = env.step(ac) 37 | # add the observation after taking a step to next_obs 38 | next_obs.append(ob) 39 | rewards.append(rew) 40 | steps += 1 41 | # If the episode ended, the corresponding terminal value is 1 42 | # otherwise, it is 0 43 | if done: 44 | terminals.append(1) 45 | break 46 | else: 47 | terminals.append(0) 48 | 49 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 50 | 51 | def mean_squared_error(a, b): 52 | return np.mean((a-b)**2) 53 | 54 | ############################################ 55 | ############################################ 56 | 57 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 58 | # TODO: get this from hw1 59 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 60 | 61 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 62 | # TODO: get this from hw1 63 | return paths, timesteps_this_batch 64 | 65 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 66 | # TODO: get this from hw1 67 | return paths 68 | 69 | ############################################ 70 | ############################################ 71 | 72 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 73 | """ 74 | Take info (separate arrays) from a single rollout 75 | and return it in a single dictionary 76 | """ 77 | if image_obs != []: 78 | image_obs = np.stack(image_obs, axis=0) 79 | return {"observation" : np.array(obs, dtype=np.float32), 80 | "image_obs" : np.array(image_obs, dtype=np.uint8), 81 | "reward" : np.array(rewards, dtype=np.float32), 82 | "action" : np.array(acs, 
dtype=np.float32), 83 | "next_observation": np.array(next_obs, dtype=np.float32), 84 | "terminal": np.array(terminals, dtype=np.float32)} 85 | 86 | 87 | def convert_listofrollouts(paths): 88 | """ 89 | Take a list of rollout dictionaries 90 | and return separate arrays, 91 | where each array is a concatenation of that array from across the rollouts 92 | """ 93 | observations = np.concatenate([path["observation"] for path in paths]) 94 | actions = np.concatenate([path["action"] for path in paths]) 95 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 96 | terminals = np.concatenate([path["terminal"] for path in paths]) 97 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 98 | unconcatenated_rewards = [path["reward"] for path in paths] 99 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 100 | 101 | ############################################ 102 | ############################################ 103 | 104 | def get_pathlength(path): 105 | return len(path["reward"]) 106 | 107 | def normalize(data, mean, std, eps=1e-8): 108 | return (data-mean)/(std+eps) 109 | 110 | def unnormalize(data, mean, std): 111 | return data*std+mean 112 | 113 | def add_noise(data_inp, noiseToSignal=0.01): 114 | 115 | data = copy.deepcopy(data_inp) #(num data points, dim) 116 | 117 | #mean of data 118 | mean_data = np.mean(data, axis=0) 119 | 120 | #if mean is 0, 121 | #make it 0.001 to avoid 0 issues later for dividing by std 122 | mean_data[mean_data == 0] = 0.000001 123 | 124 | #width of normal distribution to sample noise from 125 | #larger magnitude number = could have larger magnitude noise 126 | std_of_noise = mean_data * noiseToSignal 127 | for j in range(mean_data.shape[0]): 128 | data[:, j] = np.copy(data[:, j] + np.random.normal( 129 | 0, np.absolute(std_of_noise[j]), (data.shape[0],))) 130 | 131 | return data -------------------------------------------------------------------------------- /hw2/cs285/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw2/cs285/policies/__init__.py -------------------------------------------------------------------------------- /hw2/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw2/cs285/scripts/read_results.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import tensorflow as tf 3 | 4 | def get_section_results(file): 5 | """ 6 | requires tensorflow==1.12.0 7 | """ 8 | X = [] 9 | Y = [] 10 | for e in tf.train.summary_iterator(file): 11 | for v in e.summary.value: 12 | if v.tag == 'Train_EnvstepsSoFar': 13 | X.append(v.simple_value) 14 | elif v.tag == 'Eval_AverageReturn': 15 | Y.append(v.simple_value) 16 | return X, Y 17 | 18 | if __name__ == '__main__': 19 | import glob 20 | 21 | 
logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' 22 | eventfile = glob.glob(logdir)[0] 23 | 24 | X, Y = get_section_results(eventfile) 25 | for i, (x, y) in enumerate(zip(X, Y)): 26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) -------------------------------------------------------------------------------- /hw2/cs285/scripts/run_hw2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.infrastructure.rl_trainer import RL_Trainer 5 | from cs285.agents.pg_agent import PGAgent 6 | 7 | class PG_Trainer(object): 8 | 9 | def __init__(self, params): 10 | 11 | ##################### 12 | ## SET AGENT PARAMS 13 | ##################### 14 | 15 | computation_graph_args = { 16 | 'n_layers': params['n_layers'], 17 | 'size': params['size'], 18 | 'learning_rate': params['learning_rate'], 19 | } 20 | 21 | estimate_advantage_args = { 22 | 'gamma': params['discount'], 23 | 'standardize_advantages': not(params['dont_standardize_advantages']), 24 | 'reward_to_go': params['reward_to_go'], 25 | 'nn_baseline': params['nn_baseline'], 26 | } 27 | 28 | train_args = { 29 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 30 | } 31 | 32 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} 33 | 34 | self.params = params 35 | self.params['agent_class'] = PGAgent 36 | self.params['agent_params'] = agent_params 37 | self.params['batch_size_initial'] = self.params['batch_size'] 38 | 39 | ################ 40 | ## RL TRAINER 41 | ################ 42 | 43 | self.rl_trainer = RL_Trainer(self.params) 44 | 45 | def run_training_loop(self): 46 | 47 | self.rl_trainer.run_training_loop( 48 | self.params['n_iter'], 49 | collect_policy = self.rl_trainer.agent.actor, 50 | eval_policy = self.rl_trainer.agent.actor, 51 | ) 52 | 53 | 54 | def main(): 55 | 56 | import argparse 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument('--env_name', type=str) 59 | parser.add_argument('--exp_name', type=str, default='todo') 60 | parser.add_argument('--n_iter', '-n', type=int, default=200) 61 | 62 | parser.add_argument('--reward_to_go', '-rtg', action='store_true') 63 | parser.add_argument('--nn_baseline', action='store_true') 64 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true') 65 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration 66 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 67 | 68 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 69 | parser.add_argument('--discount', type=float, default=1.0) 70 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) 71 | parser.add_argument('--n_layers', '-l', type=int, default=2) 72 | parser.add_argument('--size', '-s', type=int, default=64) 73 | 74 | parser.add_argument('--ep_len', type=int) #students shouldn't change this away from env's default 75 | parser.add_argument('--seed', type=int, default=1) 76 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 77 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 78 | parser.add_argument('--video_log_freq', type=int, default=-1) 79 | parser.add_argument('--scalar_log_freq', type=int, default=1) 80 | 81 | parser.add_argument('--save_params', action='store_true') 82 | 83 | args = parser.parse_args() 84 | 85 | # convert to dictionary 86 | params 
= vars(args) 87 | 88 | ## ensure compatibility with hw1 code 89 | params['train_batch_size'] = params['batch_size'] 90 | 91 | ################################## 92 | ### CREATE DIRECTORY FOR LOGGING 93 | ################################## 94 | 95 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') 96 | 97 | if not (os.path.exists(data_path)): 98 | os.makedirs(data_path) 99 | 100 | logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 101 | logdir = os.path.join(data_path, logdir) 102 | params['logdir'] = logdir 103 | if not(os.path.exists(logdir)): 104 | os.makedirs(logdir) 105 | 106 | ################### 107 | ### RUN TRAINING 108 | ################### 109 | 110 | trainer = PG_Trainer(params) 111 | trainer.run_training_loop() 112 | 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /hw2/cs285_hw2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw2/cs285_hw2.pdf -------------------------------------------------------------------------------- /hw2/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.5.1 2 | gym==0.17.2 3 | mujoco-py==2.0.2.2 4 | tensorboard==2.3.0 5 | tensorboardX==1.8 6 | matplotlib==2.2.2 7 | ipython==6.4.0 8 | moviepy==1.0.0 9 | pyvirtualdisplay==1.3.2 10 | ipdb==0.13.3 11 | box2d-py 12 | tensorflow==1.12.0 -------------------------------------------------------------------------------- /hw2/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | torch==1.5.1+cu101 2 | gym==0.17.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | ipdb==0.13.3 10 | box2d-py 11 | tensorflow==2.3.0 -------------------------------------------------------------------------------- /hw2/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw3/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally. 6 | 7 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. 
You can try out the Colab option by clicking the badges below: 8 | 9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw3/cs285/scripts/run_hw3_dqn.ipynb) **Part I (Q-learning)** 10 | 11 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw3/cs285/scripts/run_hw3_actor_critic.ipynb) **Part II (Actor-critic)** 12 | 13 | ## Complete the code 14 | 15 | The following files have blanks to be filled with your solutions from homeworks 1 and 2. The relevant sections are marked with `TODO: get this from hw1 or hw2`. 16 | 17 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 18 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 19 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 20 | 21 | You will then need to implement new routines in the following files for homework 3 part 1 (Q-learning): 22 | - [agents/dqn_agent.py](cs285/agents/dqn_agent.py) 23 | - [critics/dqn_critic.py](cs285/critics/dqn_critic.py) 24 | - [policies/argmax_policy.py](cs285/policies/argmax_policy.py) 25 | 26 | and in the following files for part 2 (actor-critic): 27 | - [agents/ac_agent.py](cs285/agents/ac_agent.py) 28 | - [critics/bootstrapped_continuous_critic.py](cs285/critics/bootstrapped_continuous_critic.py) 29 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 30 | 31 | The relevant sections are marked with `TODO`. 32 | 33 | You may also want to look through [run_hw3_dqn.py](cs285/scripts/run_hw3_dqn.py) and [run_hw3_actor_critic.py](cs285/scripts/run_hw3_actor_critic.py) (if running locally) or [run_hw3_dqn.ipynb](cs285/scripts/run_hw3_dqn.ipynb) and [run_hw3_actor_critic.ipynb](cs285/scripts/run_hw3_actor_critic.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook. 34 | 35 | See the [assignment PDF](cs285_hw3.pdf) for more details on what files to edit.
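As a quick orientation for Part II, the quantity that `agents/ac_agent.py` asks you to estimate is the critic-based advantage A(s, a) = r(s, a) + gamma * V(s') - V(s), with the bootstrapped V(s') term cut off at terminal states. The snippet below is only an illustrative NumPy sketch of that arithmetic; the function name and arguments are placeholders, not the graded interface:

```python
import numpy as np

def bootstrapped_advantage(v_s, v_sp1, rewards, terminals, gamma, standardize=True):
    """A(s, a) = r(s, a) + gamma * V(s') * (1 - done) - V(s); illustrative only."""
    q_sa = rewards + gamma * v_sp1 * (1.0 - terminals)   # cut off V(s') at terminal states
    adv = q_sa - v_s
    if standardize:                                      # optional advantage standardization
        adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8)
    return adv
```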
36 | 37 | -------------------------------------------------------------------------------- /hw3/cs285/agents/ac_agent.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from cs285.critics.bootstrapped_continuous_critic import \ 4 | BootstrappedContinuousCritic 5 | from cs285.infrastructure.replay_buffer import ReplayBuffer 6 | from cs285.infrastructure.utils import * 7 | from cs285.policies.MLP_policy import MLPPolicyAC 8 | from .base_agent import BaseAgent 9 | 10 | 11 | class ACAgent(BaseAgent): 12 | def __init__(self, env, agent_params): 13 | super(ACAgent, self).__init__() 14 | 15 | self.env = env 16 | self.agent_params = agent_params 17 | 18 | self.gamma = self.agent_params['gamma'] 19 | self.standardize_advantages = self.agent_params['standardize_advantages'] 20 | 21 | self.actor = MLPPolicyAC( 22 | self.agent_params['ac_dim'], 23 | self.agent_params['ob_dim'], 24 | self.agent_params['n_layers'], 25 | self.agent_params['size'], 26 | self.agent_params['discrete'], 27 | self.agent_params['learning_rate'], 28 | ) 29 | self.critic = BootstrappedContinuousCritic(self.agent_params) 30 | 31 | self.replay_buffer = ReplayBuffer() 32 | 33 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 34 | # TODO Implement the following pseudocode: 35 | # for agent_params['num_critic_updates_per_agent_update'] steps, 36 | # update the critic 37 | 38 | # advantage = estimate_advantage(...) 39 | 40 | # for agent_params['num_actor_updates_per_agent_update'] steps, 41 | # update the actor 42 | 43 | loss = OrderedDict() 44 | loss['Critic_Loss'] = TODO 45 | loss['Actor_Loss'] = TODO 46 | 47 | return loss 48 | 49 | def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): 50 | # TODO Implement the following pseudocode: 51 | # 1) query the critic with ob_no, to get V(s) 52 | # 2) query the critic with next_ob_no, to get V(s') 53 | # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') 54 | # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) 55 | # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) 56 | adv_n = TODO 57 | 58 | if self.standardize_advantages: 59 | adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) 60 | return adv_n 61 | 62 | def add_to_replay_buffer(self, paths): 63 | self.replay_buffer.add_rollouts(paths) 64 | 65 | def sample(self, batch_size): 66 | return self.replay_buffer.sample_recent_data(batch_size) 67 | -------------------------------------------------------------------------------- /hw3/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | class BaseAgent(object): 2 | def __init__(self, **kwargs): 3 | super(BaseAgent, self).__init__(**kwargs) 4 | 5 | def train(self) -> dict: 6 | """Return a dictionary of logging information.""" 7 | raise NotImplementedError 8 | 9 | def add_to_replay_buffer(self, paths): 10 | raise NotImplementedError 11 | 12 | def sample(self, batch_size): 13 | raise NotImplementedError 14 | 15 | def save(self, path): 16 | raise NotImplementedError -------------------------------------------------------------------------------- /hw3/cs285/agents/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule 4 | from cs285.policies.argmax_policy import ArgMaxPolicy 5 | from cs285.critics.dqn_critic 
import DQNCritic 6 | 7 | 8 | class DQNAgent(object): 9 | def __init__(self, env, agent_params): 10 | 11 | self.env = env 12 | self.agent_params = agent_params 13 | self.batch_size = agent_params['batch_size'] 14 | # import ipdb; ipdb.set_trace() 15 | self.last_obs = self.env.reset() 16 | 17 | self.num_actions = agent_params['ac_dim'] 18 | self.learning_starts = agent_params['learning_starts'] 19 | self.learning_freq = agent_params['learning_freq'] 20 | self.target_update_freq = agent_params['target_update_freq'] 21 | 22 | self.replay_buffer_idx = None 23 | self.exploration = agent_params['exploration_schedule'] 24 | self.optimizer_spec = agent_params['optimizer_spec'] 25 | 26 | self.critic = DQNCritic(agent_params, self.optimizer_spec) 27 | self.actor = ArgMaxPolicy(self.critic) 28 | 29 | lander = agent_params['env_name'].startswith('LunarLander') 30 | self.replay_buffer = MemoryOptimizedReplayBuffer( 31 | agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander) 32 | self.t = 0 33 | self.num_param_updates = 0 34 | 35 | def add_to_replay_buffer(self, paths): 36 | pass 37 | 38 | def step_env(self): 39 | """ 40 | Step the env and store the transition 41 | At the end of this block of code, the simulator should have been 42 | advanced one step, and the replay buffer should contain one more transition. 43 | Note that self.last_obs must always point to the new latest observation. 44 | """ 45 | 46 | # TODO store the latest observation ("frame") into the replay buffer 47 | # HINT: the replay buffer used here is `MemoryOptimizedReplayBuffer` 48 | # in dqn_utils.py 49 | self.replay_buffer_idx = TODO 50 | 51 | eps = self.exploration.value(self.t) 52 | 53 | # TODO use epsilon greedy exploration when selecting action 54 | perform_random_action = TODO 55 | if perform_random_action: 56 | # HINT: take random action 57 | # with probability eps (see np.random.random()) 58 | # OR if your current step number (see self.t) is less that self.learning_starts 59 | action = TODO 60 | else: 61 | # HINT: Your actor will take in multiple previous observations ("frames") in order 62 | # to deal with the partial observability of the environment. Get the most recent 63 | # `frame_history_len` observations using functionality from the replay buffer, 64 | # and then use those observations as input to your actor. 
65 | action = TODO 66 | 67 | # TODO take a step in the environment using the action from the policy 68 | # HINT1: remember that self.last_obs must always point to the newest/latest observation 69 | # HINT2: remember the following useful function that you've seen before: 70 | #obs, reward, done, info = env.step(action) 71 | TODO 72 | 73 | # TODO store the result of taking this action into the replay buffer 74 | # HINT1: see your replay buffer's `store_effect` function 75 | # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above 76 | TODO 77 | 78 | # TODO if taking this step resulted in done, reset the env (and the latest observation) 79 | TODO 80 | 81 | def sample(self, batch_size): 82 | if self.replay_buffer.can_sample(self.batch_size): 83 | return self.replay_buffer.sample(batch_size) 84 | else: 85 | return [],[],[],[],[] 86 | 87 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 88 | log = {} 89 | if (self.t > self.learning_starts 90 | and self.t % self.learning_freq == 0 91 | and self.replay_buffer.can_sample(self.batch_size) 92 | ): 93 | 94 | # TODO fill in the call to the update function using the appropriate tensors 95 | log = self.critic.update( 96 | TODO 97 | ) 98 | 99 | # TODO update the target network periodically 100 | # HINT: your critic already has this functionality implemented 101 | if self.num_param_updates % self.target_update_freq == 0: 102 | TODO 103 | 104 | self.num_param_updates += 1 105 | 106 | self.t += 1 107 | return log 108 | -------------------------------------------------------------------------------- /hw3/cs285/critics/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hw3/cs285/critics/base_critic.py: -------------------------------------------------------------------------------- 1 | class BaseCritic(object): 2 | def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n): 3 | raise NotImplementedError 4 | -------------------------------------------------------------------------------- /hw3/cs285/critics/bootstrapped_continuous_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | from torch import nn 3 | from torch import optim 4 | 5 | from cs285.infrastructure import pytorch_util as ptu 6 | 7 | 8 | class BootstrappedContinuousCritic(nn.Module, BaseCritic): 9 | """ 10 | Notes on notation: 11 | 12 | Prefixes and suffixes: 13 | ob - observation 14 | ac - action 15 | _no - this tensor should have shape (batch self.size /n/, observation dim) 16 | _na - this tensor should have shape (batch self.size /n/, action dim) 17 | _n - this tensor should have shape (batch self.size /n/) 18 | 19 | Note: batch self.size /n/ is defined at runtime. 
20 | is None 21 | """ 22 | def __init__(self, hparams): 23 | super().__init__() 24 | self.ob_dim = hparams['ob_dim'] 25 | self.ac_dim = hparams['ac_dim'] 26 | self.discrete = hparams['discrete'] 27 | self.size = hparams['size'] 28 | self.n_layers = hparams['n_layers'] 29 | self.learning_rate = hparams['learning_rate'] 30 | 31 | # critic parameters 32 | self.num_target_updates = hparams['num_target_updates'] 33 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] 34 | self.gamma = hparams['gamma'] 35 | self.critic_network = ptu.build_mlp( 36 | self.ob_dim, 37 | 1, 38 | n_layers=self.n_layers, 39 | size=self.size, 40 | ) 41 | self.critic_network.to(ptu.device) 42 | self.loss = nn.MSELoss() 43 | self.optimizer = optim.Adam( 44 | self.critic_network.parameters(), 45 | self.learning_rate, 46 | ) 47 | 48 | def forward(self, obs): 49 | return self.critic_network(obs).squeeze(1) 50 | 51 | def forward_np(self, obs): 52 | obs = ptu.from_numpy(obs) 53 | predictions = self(obs) 54 | return ptu.to_numpy(predictions) 55 | 56 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 57 | """ 58 | Update the parameters of the critic. 59 | 60 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 61 | Agent.sample_trajectories 62 | let num_paths be the number of paths sampled from Agent.sample_trajectories 63 | 64 | arguments: 65 | ob_no: shape: (sum_of_path_lengths, ob_dim) 66 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 67 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 68 | the reward for each timestep 69 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 70 | at that timestep of 0 if the episode did not end 71 | 72 | returns: 73 | training loss 74 | """ 75 | # TODO: Implement the pseudocode below: do the following ( 76 | # self.num_grad_steps_per_target_update * self.num_target_updates) 77 | # times: 78 | # every self.num_grad_steps_per_target_update steps (which includes the 79 | # first step), recompute the target values by 80 | # a) calculating V(s') by querying the critic with next_ob_no 81 | # b) and computing the target values as r(s, a) + gamma * V(s') 82 | # every time, update this critic using the observations and targets 83 | # 84 | # HINT: don't forget to use terminal_n to cut off the V(s') (ie set it 85 | # to 0) when a terminal state is reached 86 | # HINT: make sure to squeeze the output of the critic_network to ensure 87 | # that its dimensions match the reward 88 | 89 | return loss.item() 90 | -------------------------------------------------------------------------------- /hw3/cs285/critics/dqn_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | import torch 3 | import torch.optim as optim 4 | from torch.nn import utils 5 | from torch import nn 6 | 7 | from cs285.infrastructure import pytorch_util as ptu 8 | 9 | 10 | class DQNCritic(BaseCritic): 11 | 12 | def __init__(self, hparams, optimizer_spec, **kwargs): 13 | super().__init__(**kwargs) 14 | self.env_name = hparams['env_name'] 15 | self.ob_dim = hparams['ob_dim'] 16 | 17 | if isinstance(self.ob_dim, int): 18 | self.input_shape = (self.ob_dim,) 19 | else: 20 | self.input_shape = hparams['input_shape'] 21 | 22 | self.ac_dim = hparams['ac_dim'] 23 | self.double_q = hparams['double_q'] 24 | self.grad_norm_clipping = hparams['grad_norm_clipping'] 
25 | self.gamma = hparams['gamma'] 26 | 27 | self.optimizer_spec = optimizer_spec 28 | network_initializer = hparams['q_func'] 29 | self.q_net = network_initializer(self.ob_dim, self.ac_dim) 30 | self.q_net_target = network_initializer(self.ob_dim, self.ac_dim) 31 | self.optimizer = self.optimizer_spec.constructor( 32 | self.q_net.parameters(), 33 | **self.optimizer_spec.optim_kwargs 34 | ) 35 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( 36 | self.optimizer, 37 | self.optimizer_spec.learning_rate_schedule, 38 | ) 39 | self.loss = nn.SmoothL1Loss() # AKA Huber loss 40 | self.q_net.to(ptu.device) 41 | self.q_net_target.to(ptu.device) 42 | 43 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 44 | """ 45 | Update the parameters of the critic. 46 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 47 | Agent.sample_trajectories 48 | let num_paths be the number of paths sampled from Agent.sample_trajectories 49 | arguments: 50 | ob_no: shape: (sum_of_path_lengths, ob_dim) 51 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 52 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 53 | the reward for each timestep 54 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 55 | at that timestep of 0 if the episode did not end 56 | returns: 57 | nothing 58 | """ 59 | ob_no = ptu.from_numpy(ob_no) 60 | ac_na = ptu.from_numpy(ac_na).to(torch.long) 61 | next_ob_no = ptu.from_numpy(next_ob_no) 62 | reward_n = ptu.from_numpy(reward_n) 63 | terminal_n = ptu.from_numpy(terminal_n) 64 | 65 | qa_t_values = self.q_net(ob_no) 66 | q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1) 67 | 68 | # TODO compute the Q-values from the target network 69 | qa_tp1_values = TODO 70 | 71 | if self.double_q: 72 | # You must fill this part for Q2 of the Q-learning portion of the homework. 73 | # In double Q-learning, the best action is selected using the Q-network that 74 | # is being updated, but the Q-value for this action is obtained from the 75 | # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details. 
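            # One possible shape for this branch, sketched as a comment rather than as the
            # reference solution (it assumes qa_tp1_values above holds the target network's
            # Q-values for next_ob_no):
            #   best_ac_tp1 = self.q_net(next_ob_no).argmax(dim=1)
            #   q_tp1 = torch.gather(qa_tp1_values, 1, best_ac_tp1.unsqueeze(1)).squeeze(1)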
76 | TODO 77 | else: 78 | q_tp1, _ = qa_tp1_values.max(dim=1) 79 | 80 | # TODO compute targets for minimizing Bellman error 81 | # HINT: as you saw in lecture, this would be: 82 | #currentReward + self.gamma * qValuesOfNextTimestep * (not terminal) 83 | target = TODO 84 | target = target.detach() 85 | 86 | assert q_t_values.shape == target.shape 87 | loss = self.loss(q_t_values, target) 88 | 89 | self.optimizer.zero_grad() 90 | loss.backward() 91 | utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping) 92 | self.optimizer.step() 93 | 94 | return { 95 | 'Training Loss': ptu.to_numpy(loss), 96 | } 97 | 98 | def update_target_network(self): 99 | for target_param, param in zip( 100 | self.q_net_target.parameters(), self.q_net.parameters() 101 | ): 102 | target_param.data.copy_(param.data) 103 | 104 | def qa_values(self, obs): 105 | obs = ptu.from_numpy(obs) 106 | qa_values = self.q_net(obs) 107 | return ptu.to_numpy(qa_values) 108 | -------------------------------------------------------------------------------- /hw3/cs285/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw3/cs285/envs/__init__.py -------------------------------------------------------------------------------- /hw3/cs285/envs/box2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw3/cs285/envs/box2d/__init__.py -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 | ipythondisplay.display(HTML(data='''<video alt="test" autoplay 17 | loop controls style="height: 400px;"> 18 | <source src="data:video/mp4;base64,{0}" type="video/mp4" /> 19 | </video>'''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env 27 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21
| def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0]<max_length: 44 | padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1)) 45 | videos[i] = np.concatenate([videos[i], padding], 0) 46 | 47 | # log videos to tensorboard event file 48 | videos = np.stack(videos[:max_videos_to_save], 0) 49 | self.log_video(videos, video_title, step, fps=fps) 50 | 51 | def log_figures(self, figure, name, step, phase): 52 | """figure: matplotlib.pyplot figure handle""" 53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!" 54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | ): 28 | """ 29 | Builds a feedforward neural network 30 | arguments: 31 | input_placeholder: placeholder variable for the state (batch_size, input_size) 32 | scope: variable scope of the network 33 | n_layers: number of hidden layers 34 | size: dimension of each hidden layer 35 | activation: activation of each hidden layer 36 | input_size: size of the input layer 37 | output_size: size of the output layer 38 | output_activation: activation of the output layer 39 | returns: 40 | output_placeholder: the result of a forward pass through the hidden layers + the output layer 41 | """ 42 | if isinstance(activation, str): 43 | activation = _str_to_activation[activation] 44 | if isinstance(output_activation, str): 45 | output_activation = _str_to_activation[output_activation] 46 | layers = [] 47 | in_size = input_size 48 | for _ in range(n_layers): 49 | layers.append(nn.Linear(in_size, size)) 50 | layers.append(activation) 51 | in_size = size 52 | layers.append(nn.Linear(in_size,
output_size)) 53 | layers.append(output_activation) 54 | return nn.Sequential(*layers) 55 | 56 | 57 | device = None 58 | 59 | 60 | def init_gpu(use_gpu=True, gpu_id=0): 61 | global device 62 | if torch.cuda.is_available() and use_gpu: 63 | device = torch.device("cuda:" + str(gpu_id)) 64 | print("Using GPU id {}".format(gpu_id)) 65 | else: 66 | device = torch.device("cpu") 67 | print("GPU not detected. Defaulting to CPU.") 68 | 69 | 70 | def set_device(gpu_id): 71 | torch.cuda.set_device(gpu_id) 72 | 73 | 74 | def from_numpy(*args, **kwargs): 75 | return torch.from_numpy(*args, **kwargs).float().to(device) 76 | 77 | 78 | def to_numpy(tensor): 79 | return tensor.to('cpu').detach().numpy() 80 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | self.paths = [] 10 | self.obs = None 11 | self.acs = None 12 | self.concatenated_rews = None 13 | self.next_obs = None 14 | self.terminals = None 15 | 16 | def add_rollouts(self, paths, noised=False): 17 | 18 | # add new rollouts into our list of rollouts 19 | for path in paths: 20 | self.paths.append(path) 21 | 22 | # convert new rollouts into their component arrays, and append them onto our arrays 23 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 24 | 25 | if noised: 26 | observations = add_noise(observations) 27 | next_observations = add_noise(next_observations) 28 | 29 | if self.obs is None: 30 | self.obs = observations[-self.max_size:] 31 | self.acs = actions[-self.max_size:] 32 | self.next_obs = next_observations[-self.max_size:] 33 | self.terminals = terminals[-self.max_size:] 34 | self.concatenated_rews = concatenated_rews[-self.max_size:] 35 | else: 36 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 37 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 38 | self.next_obs = np.concatenate( 39 | [self.next_obs, next_observations] 40 | )[-self.max_size:] 41 | self.terminals = np.concatenate( 42 | [self.terminals, terminals] 43 | )[-self.max_size:] 44 | self.concatenated_rews = np.concatenate( 45 | [self.concatenated_rews, concatenated_rews] 46 | )[-self.max_size:] 47 | 48 | ######################################## 49 | ######################################## 50 | 51 | def sample_random_rollouts(self, num_rollouts): 52 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 53 | return self.paths[rand_indices] 54 | 55 | def sample_recent_rollouts(self, num_rollouts=1): 56 | return self.paths[-num_rollouts:] 57 | 58 | ######################################## 59 | ######################################## 60 | 61 | def sample_random_data(self, batch_size): 62 | 63 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 64 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 65 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 66 | 67 | def sample_recent_data(self, batch_size=1, concat_rew=True): 68 | 69 | if concat_rew: 70 | return self.obs[-batch_size:], self.acs[-batch_size:], 
self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 71 | else: 72 | num_recent_rollouts_to_return = 0 73 | num_datapoints_so_far = 0 74 | index = -1 75 | while num_datapoints_so_far < batch_size: 76 | recent_rollout = self.paths[index] 77 | index -=1 78 | num_recent_rollouts_to_return +=1 79 | num_datapoints_so_far += get_pathlength(recent_rollout) 80 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 81 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 82 | return observations, actions, unconcatenated_rews, next_observations, terminals 83 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import copy 4 | 5 | ############################################ 6 | ############################################ 7 | 8 | def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): 9 | 10 | model = models[0] 11 | 12 | # true 13 | true_states = perform_actions(env, action_sequence)['observation'] 14 | 15 | # predicted 16 | ob = np.expand_dims(true_states[0],0) 17 | pred_states = [] 18 | for ac in action_sequence: 19 | pred_states.append(ob) 20 | action = np.expand_dims(ac,0) 21 | ob = model.get_prediction(ob, action, data_statistics) 22 | pred_states = np.squeeze(pred_states) 23 | 24 | # mpe 25 | mpe = mean_squared_error(pred_states, true_states) 26 | 27 | return mpe, true_states, pred_states 28 | 29 | def perform_actions(env, actions): 30 | ob = env.reset() 31 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 32 | steps = 0 33 | for ac in actions: 34 | obs.append(ob) 35 | acs.append(ac) 36 | ob, rew, done, _ = env.step(ac) 37 | # add the observation after taking a step to next_obs 38 | next_obs.append(ob) 39 | rewards.append(rew) 40 | steps += 1 41 | # If the episode ended, the corresponding terminal value is 1 42 | # otherwise, it is 0 43 | if done: 44 | terminals.append(1) 45 | break 46 | else: 47 | terminals.append(0) 48 | 49 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 50 | 51 | def mean_squared_error(a, b): 52 | return np.mean((a-b)**2) 53 | 54 | ############################################ 55 | ############################################ 56 | 57 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 58 | # TODO: get this from Piazza 59 | 60 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 61 | """ 62 | Collect rollouts using policy 63 | until we have collected min_timesteps_per_batch steps 64 | """ 65 | # TODO: get this from Piazza 66 | 67 | return paths, timesteps_this_batch 68 | 69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 70 | """ 71 | Collect ntraj rollouts using policy 72 | """ 73 | # TODO: get this from Piazza 74 | 75 | return paths 76 | 77 | ############################################ 78 | ############################################ 79 | 80 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 81 | """ 82 | Take info (separate arrays) from a single rollout 83 | and return it in a single dictionary 84 | """ 85 | if image_obs != []: 86 | image_obs = np.stack(image_obs, axis=0) 87 | return 
{"observation" : np.array(obs, dtype=np.float32), 88 | "image_obs" : np.array(image_obs, dtype=np.uint8), 89 | "reward" : np.array(rewards, dtype=np.float32), 90 | "action" : np.array(acs, dtype=np.float32), 91 | "next_observation": np.array(next_obs, dtype=np.float32), 92 | "terminal": np.array(terminals, dtype=np.float32)} 93 | 94 | 95 | def convert_listofrollouts(paths): 96 | """ 97 | Take a list of rollout dictionaries 98 | and return separate arrays, 99 | where each array is a concatenation of that array from across the rollouts 100 | """ 101 | observations = np.concatenate([path["observation"] for path in paths]) 102 | actions = np.concatenate([path["action"] for path in paths]) 103 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 104 | terminals = np.concatenate([path["terminal"] for path in paths]) 105 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 106 | unconcatenated_rewards = [path["reward"] for path in paths] 107 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 108 | 109 | ############################################ 110 | ############################################ 111 | 112 | def get_pathlength(path): 113 | return len(path["reward"]) 114 | 115 | def normalize(data, mean, std, eps=1e-8): 116 | return (data-mean)/(std+eps) 117 | 118 | def unnormalize(data, mean, std): 119 | return data*std+mean 120 | 121 | def add_noise(data_inp, noiseToSignal=0.01): 122 | 123 | data = copy.deepcopy(data_inp) #(num data points, dim) 124 | 125 | #mean of data 126 | mean_data = np.mean(data, axis=0) 127 | 128 | #if mean is 0, 129 | #make it 0.001 to avoid 0 issues later for dividing by std 130 | mean_data[mean_data == 0] = 0.000001 131 | 132 | #width of normal distribution to sample noise from 133 | #larger magnitude number = could have larger magnitude noise 134 | std_of_noise = mean_data * noiseToSignal 135 | for j in range(mean_data.shape[0]): 136 | data[:, j] = np.copy(data[:, j] + np.random.normal( 137 | 0, np.absolute(std_of_noise[j]), (data.shape[0],))) 138 | 139 | return data 140 | -------------------------------------------------------------------------------- /hw3/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import itertools 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from torch import optim 6 | 7 | import numpy as np 8 | import torch 9 | from torch import distributions 10 | 11 | from cs285.infrastructure import pytorch_util as ptu 12 | from cs285.policies.base_policy import BasePolicy 13 | 14 | 15 | class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): 16 | 17 | def __init__(self, 18 | ac_dim, 19 | ob_dim, 20 | n_layers, 21 | size, 22 | discrete=False, 23 | learning_rate=1e-4, 24 | training=True, 25 | nn_baseline=False, 26 | **kwargs 27 | ): 28 | super().__init__(**kwargs) 29 | 30 | # init vars 31 | self.ac_dim = ac_dim 32 | self.ob_dim = ob_dim 33 | self.n_layers = n_layers 34 | self.discrete = discrete 35 | self.size = size 36 | self.learning_rate = learning_rate 37 | self.training = training 38 | self.nn_baseline = nn_baseline 39 | 40 | if self.discrete: 41 | self.logits_na = ptu.build_mlp(input_size=self.ob_dim, 42 | output_size=self.ac_dim, 43 | n_layers=self.n_layers, 44 | size=self.size) 45 | self.logits_na.to(ptu.device) 46 | self.mean_net = None 47 | self.logstd = None 48 | self.optimizer = optim.Adam(self.logits_na.parameters(), 49 | 
self.learning_rate) 50 | else: 51 | self.logits_na = None 52 | self.mean_net = ptu.build_mlp(input_size=self.ob_dim, 53 | output_size=self.ac_dim, 54 | n_layers=self.n_layers, size=self.size) 55 | self.logstd = nn.Parameter( 56 | torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) 57 | ) 58 | self.mean_net.to(ptu.device) 59 | self.logstd.to(ptu.device) 60 | self.optimizer = optim.Adam( 61 | itertools.chain([self.logstd], self.mean_net.parameters()), 62 | self.learning_rate 63 | ) 64 | 65 | if nn_baseline: 66 | self.baseline = ptu.build_mlp( 67 | input_size=self.ob_dim, 68 | output_size=1, 69 | n_layers=self.n_layers, 70 | size=self.size, 71 | ) 72 | self.baseline.to(ptu.device) 73 | self.baseline_optimizer = optim.Adam( 74 | self.baseline.parameters(), 75 | self.learning_rate, 76 | ) 77 | else: 78 | self.baseline = None 79 | 80 | ################################## 81 | 82 | def save(self, filepath): 83 | torch.save(self.state_dict(), filepath) 84 | 85 | ################################## 86 | 87 | # query the policy with observation(s) to get selected action(s) 88 | def get_action(self, obs: np.ndarray) -> np.ndarray: 89 | # TODO: get this from Piazza 90 | return action 91 | 92 | # update/train this policy 93 | def update(self, observations, actions, **kwargs): 94 | raise NotImplementedError 95 | 96 | # This function defines the forward pass of the network. 97 | # You can return anything you want, but you should be able to differentiate 98 | # through it. For example, you can return a torch.FloatTensor. You can also 99 | # return more flexible objects, such as a 100 | # `torch.distributions.Distribution` object. It's up to you! 101 | def forward(self, observation: torch.FloatTensor): 102 | # TODO: get this from Piazza 103 | return action_distribution 104 | 105 | 106 | ##################################################### 107 | ##################################################### 108 | 109 | 110 | class MLPPolicyAC(MLPPolicy): 111 | def update(self, observations, actions, adv_n=None): 112 | # TODO: update the policy and return the loss 113 | loss = TODO 114 | return loss.item() 115 | -------------------------------------------------------------------------------- /hw3/cs285/policies/argmax_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ArgMaxPolicy(object): 5 | 6 | def __init__(self, critic): 7 | self.critic = critic 8 | 9 | def get_action(self, obs): 10 | if len(obs.shape) > 3: 11 | observation = obs 12 | else: 13 | observation = obs[None] 14 | 15 | ## TODO return the action that maxinmizes the Q-value 16 | # at the current observation as the output 17 | actions = TODO 18 | 19 | return action.squeeze() -------------------------------------------------------------------------------- /hw3/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw3/cs285/scripts/read_results.py: 
-------------------------------------------------------------------------------- 1 | import glob 2 | import tensorflow as tf 3 | 4 | def get_section_results(file): 5 | """ 6 | requires tensorflow==1.12.0 7 | """ 8 | X = [] 9 | Y = [] 10 | for e in tf.train.summary_iterator(file): 11 | for v in e.summary.value: 12 | if v.tag == 'Train_EnvstepsSoFar': 13 | X.append(v.simple_value) 14 | elif v.tag == 'Eval_AverageReturn': 15 | Y.append(v.simple_value) 16 | return X, Y 17 | 18 | if __name__ == '__main__': 19 | import glob 20 | 21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' 22 | eventfile = glob.glob(logdir)[0] 23 | 24 | X, Y = get_section_results(eventfile) 25 | for i, (x, y) in enumerate(zip(X, Y)): 26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) -------------------------------------------------------------------------------- /hw3/cs285/scripts/run_hw3_actor_critic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.agents.ac_agent import ACAgent 5 | from cs285.infrastructure.rl_trainer import RL_Trainer 6 | 7 | 8 | class AC_Trainer(object): 9 | 10 | def __init__(self, params): 11 | 12 | ##################### 13 | ## SET AGENT PARAMS 14 | ##################### 15 | 16 | computation_graph_args = { 17 | 'n_layers': params['n_layers'], 18 | 'size': params['size'], 19 | 'learning_rate': params['learning_rate'], 20 | 'num_target_updates': params['num_target_updates'], 21 | 'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'], 22 | } 23 | 24 | estimate_advantage_args = { 25 | 'gamma': params['discount'], 26 | 'standardize_advantages': not(params['dont_standardize_advantages']), 27 | } 28 | 29 | train_args = { 30 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 31 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 32 | 'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'], 33 | } 34 | 35 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} 36 | 37 | self.params = params 38 | self.params['agent_class'] = ACAgent 39 | self.params['agent_params'] = agent_params 40 | self.params['batch_size_initial'] = self.params['batch_size'] 41 | 42 | ################ 43 | ## RL TRAINER 44 | ################ 45 | 46 | self.rl_trainer = RL_Trainer(self.params) 47 | 48 | def run_training_loop(self): 49 | 50 | self.rl_trainer.run_training_loop( 51 | self.params['n_iter'], 52 | collect_policy = self.rl_trainer.agent.actor, 53 | eval_policy = self.rl_trainer.agent.actor, 54 | ) 55 | 56 | 57 | def main(): 58 | 59 | import argparse 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--env_name', type=str, default='CartPole-v0') 62 | parser.add_argument('--ep_len', type=int, default=200) 63 | parser.add_argument('--exp_name', type=str, default='todo') 64 | parser.add_argument('--n_iter', '-n', type=int, default=200) 65 | 66 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 67 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) 68 | parser.add_argument('--num_actor_updates_per_agent_update', type=int, default=1) 69 | 70 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration 71 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 72 | 
parser.add_argument('--train_batch_size', '-tb', type=int, default=1000) ##steps used per gradient step 73 | 74 | parser.add_argument('--discount', type=float, default=1.0) 75 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) 76 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true') 77 | parser.add_argument('--num_target_updates', '-ntu', type=int, default=10) 78 | parser.add_argument('--num_grad_steps_per_target_update', '-ngsptu', type=int, default=10) 79 | parser.add_argument('--n_layers', '-l', type=int, default=2) 80 | parser.add_argument('--size', '-s', type=int, default=64) 81 | 82 | parser.add_argument('--seed', type=int, default=1) 83 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 84 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 85 | parser.add_argument('--video_log_freq', type=int, default=-1) 86 | parser.add_argument('--scalar_log_freq', type=int, default=10) 87 | 88 | parser.add_argument('--save_params', action='store_true') 89 | 90 | args = parser.parse_args() 91 | 92 | # convert to dictionary 93 | params = vars(args) 94 | 95 | # for policy gradient, we made a design decision 96 | # to force batch_size = train_batch_size 97 | # note that, to avoid confusion, you don't even have a train_batch_size argument anymore (above) 98 | params['train_batch_size'] = params['batch_size'] 99 | 100 | ################################## 101 | ### CREATE DIRECTORY FOR LOGGING 102 | ################################## 103 | 104 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 105 | 106 | if not (os.path.exists(data_path)): 107 | os.makedirs(data_path) 108 | 109 | logdir = 'hw3_ ' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 110 | logdir = os.path.join(data_path, logdir) 111 | params['logdir'] = logdir 112 | if not(os.path.exists(logdir)): 113 | os.makedirs(logdir) 114 | 115 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 116 | 117 | ################### 118 | ### RUN TRAINING 119 | ################### 120 | 121 | trainer = AC_Trainer(params) 122 | trainer.run_training_loop() 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /hw3/cs285/scripts/run_hw3_dqn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.infrastructure.rl_trainer import RL_Trainer 5 | from cs285.agents.dqn_agent import DQNAgent 6 | from cs285.infrastructure.dqn_utils import get_env_kwargs 7 | 8 | 9 | class Q_Trainer(object): 10 | 11 | def __init__(self, params): 12 | self.params = params 13 | 14 | train_args = { 15 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 16 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 17 | 'train_batch_size': params['batch_size'], 18 | 'double_q': params['double_q'], 19 | } 20 | 21 | env_args = get_env_kwargs(params['env_name']) 22 | 23 | self.agent_params = {**train_args, **env_args, **params} 24 | 25 | self.params['agent_class'] = DQNAgent 26 | self.params['agent_params'] = self.agent_params 27 | self.params['train_batch_size'] = params['batch_size'] 28 | self.params['env_wrappers'] = self.agent_params['env_wrappers'] 29 | 30 | self.rl_trainer = RL_Trainer(self.params) 31 | 32 | def run_training_loop(self): 33 | self.rl_trainer.run_training_loop( 34 | self.agent_params['num_timesteps'], 35 | collect_policy = 
self.rl_trainer.agent.actor, 36 | eval_policy = self.rl_trainer.agent.actor, 37 | ) 38 | 39 | def main(): 40 | 41 | import argparse 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | '--env_name', 45 | default='MsPacman-v0', 46 | choices=('PongNoFrameskip-v4', 'LunarLander-v3', 'MsPacman-v0') 47 | ) 48 | 49 | parser.add_argument('--ep_len', type=int, default=200) 50 | parser.add_argument('--exp_name', type=str, default='todo') 51 | 52 | parser.add_argument('--eval_batch_size', type=int, default=1000) 53 | 54 | parser.add_argument('--batch_size', type=int, default=32) 55 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 56 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) 57 | parser.add_argument('--double_q', action='store_true') 58 | 59 | parser.add_argument('--seed', type=int, default=1) 60 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 61 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 62 | parser.add_argument('--scalar_log_freq', type=int, default=int(1e4)) 63 | parser.add_argument('--video_log_freq', type=int, default=-1) 64 | 65 | parser.add_argument('--save_params', action='store_true') 66 | 67 | args = parser.parse_args() 68 | 69 | # convert to dictionary 70 | params = vars(args) 71 | params['video_log_freq'] = -1 # This param is not used for DQN 72 | ################################## 73 | ### CREATE DIRECTORY FOR LOGGING 74 | ################################## 75 | 76 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 77 | 78 | if not (os.path.exists(data_path)): 79 | os.makedirs(data_path) 80 | 81 | logdir = 'hw3_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 82 | logdir = os.path.join(data_path, logdir) 83 | params['logdir'] = logdir 84 | if not(os.path.exists(logdir)): 85 | os.makedirs(logdir) 86 | 87 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 88 | 89 | trainer = Q_Trainer(params) 90 | trainer.run_training_loop() 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /hw3/cs285_hw3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw3/cs285_hw3.pdf -------------------------------------------------------------------------------- /hw3/requirements.txt: -------------------------------------------------------------------------------- 1 | gym[atari]==0.17.2 2 | mujoco-py==2.0.2.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | torch==1.5.1 10 | opencv-python==4.4.0.42 11 | ipdb==0.13.3 12 | box2d-py 13 | -------------------------------------------------------------------------------- /hw3/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | gym[atari]==0.17.2 2 | tensorboard==2.3.0 3 | tensorboardX==1.8 4 | matplotlib==2.2.2 5 | ipython==6.4.0 6 | moviepy==1.0.0 7 | pyvirtualdisplay==1.3.2 8 | torch==1.5.1 9 | opencv-python==4.4.0.42 10 | ipdb==0.13.3 11 | box2d-py 12 | -------------------------------------------------------------------------------- /hw3/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | 
version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw4/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally. 6 | 7 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below: 8 | 9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw4/cs285/scripts/run_hw4_mb.ipynb) 10 | 11 | ## Complete the code 12 | 13 | The following files have blanks to be filled with your solutions from homework 1. The relevant sections are marked with `TODO: get this from Piazza`. 14 | 15 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 16 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 17 | 18 | You will then need to implement code in the following files: 19 | - [agents/mb_agent.py](cs285/agents/mb_agent.py) 20 | - [models/ff_model.py](cs285/models/ff_model.py) 21 | - [policies/MPC_policy.py](cs285/policies/MPC_policy.py) 22 | 23 | The relevant sections are marked with `TODO`. 24 | 25 | You may also want to look through [scripts/run_hw4_mb.py](cs285/scripts/run_hw4_mb.py) (if running locally) or [scripts/run_hw4_mb.ipynb](cs285/scripts/run_hw4_mb.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook. 26 | 27 | See the [assignment PDF](cs285_hw4.pdf) for more details on what files to edit.
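For orientation before diving into the TODOs: `mb_agent.py` fits an ensemble of dynamics models to observed transitions, and `MPC_policy.py` plans by sampling random action sequences, rolling them forward under each model, and executing the first action of the best-scoring sequence. The self-contained sketch below illustrates that random-shooting MPC idea on a toy one-dimensional system. It is only an editor's illustration, not the assignment's solution: `toy_dynamics`, `toy_reward`, `random_shooting_mpc`, the ensemble "biases", and all hyperparameter values here are invented for the example (the homework instead uses `FFModel.get_prediction` and `env.get_reward`).

```python
import numpy as np

# Toy stand-ins for a learned dynamics ensemble and the environment reward.
def toy_dynamics(obs, acs, bias):
    return obs + acs + bias                  # predicted next observation

def toy_reward(obs, acs):
    return -(obs ** 2) - 0.1 * (acs ** 2)    # prefer driving the state to zero cheaply

def random_shooting_mpc(ob, ensemble_biases, num_sequences=1000, horizon=10,
                        low=-1.0, high=1.0, seed=0):
    rng = np.random.default_rng(seed)
    # 1) Uniformly sample candidate action sequences, shape (N, H).
    actions = rng.uniform(low, high, size=(num_sequences, horizon))
    # 2) Score every sequence under every ensemble member, then average.
    returns = np.zeros(num_sequences)
    for bias in ensemble_biases:
        obs = np.full(num_sequences, ob, dtype=float)
        total = np.zeros(num_sequences)
        for t in range(horizon):
            total += toy_reward(obs, actions[:, t])
            obs = toy_dynamics(obs, actions[:, t], bias)
        returns += total / len(ensemble_biases)
    # 3) Execute only the first action of the highest-scoring sequence (replan next step).
    return actions[np.argmax(returns), 0]

print(random_shooting_mpc(ob=2.0, ensemble_biases=[-0.05, 0.0, 0.05]))
```

The assignment's version pushes whole batches of observations and actions through the learned model rather than looping in pure NumPy (see the hints in `MPC_policy.py`), but the control flow is the same.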
28 | 29 | -------------------------------------------------------------------------------- /hw4/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | class BaseAgent(object): 2 | def __init__(self, **kwargs): 3 | super(BaseAgent, self).__init__(**kwargs) 4 | 5 | def train(self) -> dict: 6 | """Return a dictionary of logging information.""" 7 | raise NotImplementedError 8 | 9 | def add_to_replay_buffer(self, paths): 10 | raise NotImplementedError 11 | 12 | def sample(self, batch_size): 13 | raise NotImplementedError 14 | 15 | def save(self, path): 16 | raise NotImplementedError -------------------------------------------------------------------------------- /hw4/cs285/agents/mb_agent.py: -------------------------------------------------------------------------------- 1 | from .base_agent import BaseAgent 2 | from cs285.models.ff_model import FFModel 3 | from cs285.policies.MPC_policy import MPCPolicy 4 | from cs285.infrastructure.replay_buffer import ReplayBuffer 5 | from cs285.infrastructure.utils import * 6 | 7 | 8 | class MBAgent(BaseAgent): 9 | def __init__(self, env, agent_params): 10 | super(MBAgent, self).__init__() 11 | 12 | self.env = env.unwrapped 13 | self.agent_params = agent_params 14 | self.ensemble_size = self.agent_params['ensemble_size'] 15 | 16 | self.dyn_models = [] 17 | for i in range(self.ensemble_size): 18 | model = FFModel( 19 | self.agent_params['ac_dim'], 20 | self.agent_params['ob_dim'], 21 | self.agent_params['n_layers'], 22 | self.agent_params['size'], 23 | self.agent_params['learning_rate'], 24 | ) 25 | self.dyn_models.append(model) 26 | 27 | self.actor = MPCPolicy( 28 | self.env, 29 | ac_dim=self.agent_params['ac_dim'], 30 | dyn_models=self.dyn_models, 31 | horizon=self.agent_params['mpc_horizon'], 32 | N=self.agent_params['mpc_num_action_sequences'], 33 | ) 34 | 35 | self.replay_buffer = ReplayBuffer() 36 | 37 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 38 | 39 | # training a MB agent refers to updating the predictive model using observed state transitions 40 | # NOTE: each model in the ensemble is trained on a different random batch of size batch_size 41 | losses = [] 42 | num_data = ob_no.shape[0] 43 | num_data_per_ens = int(num_data / self.ensemble_size) 44 | 45 | for i in range(self.ensemble_size): 46 | 47 | # select which datapoints to use for this model of the ensemble 48 | # you might find the num_data_per_env variable defined above useful 49 | 50 | observations = # TODO(Q1) 51 | actions = # TODO(Q1) 52 | next_observations = # TODO(Q1) 53 | 54 | # use datapoints to update one of the dyn_models 55 | model = # TODO(Q1) 56 | log = model.update(observations, actions, next_observations, 57 | self.data_statistics) 58 | loss = log['Training Loss'] 59 | losses.append(loss) 60 | 61 | avg_loss = np.mean(losses) 62 | return { 63 | 'Training Loss': avg_loss, 64 | } 65 | 66 | def add_to_replay_buffer(self, paths, add_sl_noise=False): 67 | 68 | # add data to replay buffer 69 | self.replay_buffer.add_rollouts(paths, noised=add_sl_noise) 70 | 71 | # get updated mean/std of the data in our replay buffer 72 | self.data_statistics = { 73 | 'obs_mean': np.mean(self.replay_buffer.obs, axis=0), 74 | 'obs_std': np.std(self.replay_buffer.obs, axis=0), 75 | 'acs_mean': np.mean(self.replay_buffer.acs, axis=0), 76 | 'acs_std': np.std(self.replay_buffer.acs, axis=0), 77 | 'delta_mean': np.mean( 78 | self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0), 79 | 'delta_std': np.std( 80 | 
self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0), 81 | } 82 | 83 | # update the actor's data_statistics too, so actor.get_action can be calculated correctly 84 | self.actor.data_statistics = self.data_statistics 85 | 86 | def sample(self, batch_size): 87 | # NOTE: sampling batch_size * ensemble_size, 88 | # so each model in our ensemble can get trained on batch_size data 89 | return self.replay_buffer.sample_random_data( 90 | batch_size * self.ensemble_size) 91 | -------------------------------------------------------------------------------- /hw4/cs285/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | def register_envs(): 4 | register( 5 | id='cheetah-cs285-v0', 6 | entry_point='cs285.envs.cheetah:HalfCheetahEnv', 7 | max_episode_steps=1000, 8 | ) 9 | register( 10 | id='obstacles-cs285-v0', 11 | entry_point='cs285.envs.obstacles:Obstacles', 12 | max_episode_steps=500, 13 | ) 14 | register( 15 | id='reacher-cs285-v0', 16 | entry_point='cs285.envs.reacher:Reacher7DOFEnv', 17 | max_episode_steps=500, 18 | ) 19 | -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs.cheetah.cheetah import HalfCheetahEnv 2 | -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import mujoco_py 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | 8 | def __init__(self): 9 | 10 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 11 | utils.EzPickle.__init__(self) 12 | 13 | self.skip = self.frame_skip 14 | 15 | self.action_dim = self.ac_dim = self.action_space.shape[0] 16 | self.observation_dim = self.obs_dim = self.observation_space.shape[0] 17 | 18 | def get_reward(self, observations, actions): 19 | 20 | """get reward/s of given (observations, actions) datapoint or datapoints 21 | 22 | Args: 23 | observations: (batchsize, obs_dim) or (obs_dim,) 24 | actions: (batchsize, ac_dim) or (ac_dim,) 25 | 26 | Return: 27 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 28 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 29 | """ 30 | 31 | #initialize and reshape as needed, for batch mode 32 | self.reward_dict = {} 33 | if(len(observations.shape)==1): 34 | observations = np.expand_dims(observations, axis = 0) 35 | actions = np.expand_dims(actions, axis = 0) 36 | batch_mode = False 37 | else: 38 | batch_mode = True 39 | 40 | #get vars 41 | xvel = observations[:, 9].copy() 42 | body_angle = observations[:, 2].copy() 43 | front_leg = observations[:, 6].copy() 44 | front_shin = observations[:, 7].copy() 45 | front_foot = observations[:, 8].copy() 46 | zeros = np.zeros((observations.shape[0],)).copy() 47 | 48 | # ranges 49 | leg_range = 0.2 50 | shin_range = 0 51 | foot_range = 0 52 | penalty_factor = 10 53 | 54 | #calc rew 55 | self.reward_dict['run'] = xvel 56 | 57 | front_leg_rew = zeros.copy() 58 | front_leg_rew[front_leg>leg_range] = -penalty_factor 59 | self.reward_dict['leg'] = front_leg_rew 60 | 61 | front_shin_rew = zeros.copy() 62 | front_shin_rew[front_shin>shin_range] = -penalty_factor 63 | self.reward_dict['shin'] = 
front_shin_rew 64 | 65 | front_foot_rew = zeros.copy() 66 | front_foot_rew[front_foot>foot_range] = -penalty_factor 67 | self.reward_dict['foot'] = front_foot_rew 68 | 69 | # total reward 70 | self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot'] 71 | 72 | #return 73 | dones = zeros.copy() 74 | if(not batch_mode): 75 | return self.reward_dict['r_total'][0], dones[0] 76 | return self.reward_dict['r_total'], dones 77 | 78 | 79 | def get_score(self, obs): 80 | xposafter = obs[0] 81 | return xposafter 82 | 83 | ############################################## 84 | 85 | def step(self, action): 86 | 87 | #step 88 | self.do_simulation(action, self.frame_skip) 89 | 90 | #obs/reward/done/score 91 | ob = self._get_obs() 92 | rew, done = self.get_reward(ob, action) 93 | score = self.get_score(ob) 94 | 95 | #return 96 | env_info = {'obs_dict': self.obs_dict, 97 | 'rewards': self.reward_dict, 98 | 'score': score} 99 | return ob, rew, done, env_info 100 | 101 | def _get_obs(self): 102 | 103 | self.obs_dict = {} 104 | self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy() 105 | self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy() 106 | self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy() 107 | 108 | return np.concatenate([ 109 | self.obs_dict['joints_pos'], #9 110 | self.obs_dict['joints_vel'], #9 111 | self.obs_dict['com_torso'], #3 112 | ]) 113 | 114 | ############################################## 115 | 116 | def reset_model(self, seed=None): 117 | 118 | # set reset pose/vel 119 | self.reset_pose = self.init_qpos + self.np_random.uniform( 120 | low=-.1, high=.1, size=self.model.nq) 121 | self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 122 | 123 | #reset the env to that pose/vel 124 | return self.do_reset(self.reset_pose.copy(), self.reset_vel.copy()) 125 | 126 | 127 | def do_reset(self, reset_pose, reset_vel, reset_goal=None): 128 | 129 | #reset 130 | self.set_state(reset_pose, reset_vel) 131 | 132 | #return 133 | return self._get_obs() 134 | -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs.obstacles.obstacles_env import Obstacles 2 | -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs.reacher.reacher_env import Reacher7DOFEnv 2 | -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/reacher_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | from mujoco_py import MjViewer 5 | import os 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | 10 | # placeholder 11 | self.hand_sid = -2 12 | self.target_sid = -1 13 | 14 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 15 | mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2) 16 | utils.EzPickle.__init__(self) 17 | self.observation_dim = 26 18 | self.action_dim = 7 19 | 20 | self.hand_sid = self.model.site_name2id("finger") 21 | self.target_sid = self.model.site_name2id("target") 22 | self.skip = self.frame_skip 23 | 24 | 25 | def _get_obs(self): 
26 | return np.concatenate([ 27 | self.data.qpos.flat, #[7] 28 | self.data.qvel.flatten() / 10., #[7] 29 | self.data.site_xpos[self.hand_sid], #[3] 30 | self.model.site_pos[self.target_sid], #[3] 31 | ]) 32 | 33 | def step(self, a): 34 | 35 | self.do_simulation(a, self.frame_skip) 36 | ob = self._get_obs() 37 | reward, done = self.get_reward(ob, a) 38 | 39 | score = self.get_score(ob) 40 | 41 | # finalize step 42 | env_info = {'ob': ob, 43 | 'rewards': self.reward_dict, 44 | 'score': score} 45 | 46 | return ob, reward, done, env_info 47 | 48 | def get_score(self, obs): 49 | hand_pos = obs[-6:-3] 50 | target_pos = obs[-3:] 51 | score = -1*np.abs(hand_pos-target_pos) 52 | return score 53 | 54 | def get_reward(self, observations, actions): 55 | 56 | """get reward/s of given (observations, actions) datapoint or datapoints 57 | 58 | Args: 59 | observations: (batchsize, obs_dim) or (obs_dim,) 60 | actions: (batchsize, ac_dim) or (ac_dim,) 61 | 62 | Return: 63 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 64 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 65 | """ 66 | 67 | #initialize and reshape as needed, for batch mode 68 | self.reward_dict = {} 69 | if(len(observations.shape)==1): 70 | observations = np.expand_dims(observations, axis = 0) 71 | actions = np.expand_dims(actions, axis = 0) 72 | batch_mode = False 73 | else: 74 | batch_mode = True 75 | 76 | #get vars 77 | hand_pos = observations[:, -6:-3] 78 | target_pos = observations[:, -3:] 79 | 80 | #calc rew 81 | dist = np.linalg.norm(hand_pos - target_pos, axis=1) 82 | self.reward_dict['r_total'] = -10*dist 83 | 84 | #done is always false for this env 85 | dones = np.zeros((observations.shape[0],)) 86 | 87 | #return 88 | if(not batch_mode): 89 | return self.reward_dict['r_total'][0], dones[0] 90 | return self.reward_dict['r_total'], dones 91 | 92 | def reset(self): 93 | _ = self.reset_model() 94 | 95 | self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1] 96 | 97 | observation, _reward, done, _info = self.step(np.zeros(7)) 98 | ob = self._get_obs() 99 | 100 | return ob 101 | 102 | def reset_model(self, seed=None): 103 | if seed is not None: 104 | self.seed(seed) 105 | 106 | self.reset_pose = self.init_qpos.copy() 107 | self.reset_vel = self.init_qvel.copy() 108 | 109 | self.reset_goal = np.zeros(3) 110 | self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3) 111 | self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2) 112 | self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25) 113 | 114 | return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal) 115 | 116 | def do_reset(self, reset_pose, reset_vel, reset_goal): 117 | 118 | self.set_state(reset_pose, reset_vel) 119 | 120 | #reset target 121 | self.reset_goal = reset_goal.copy() 122 | self.model.site_pos[self.target_sid] = self.reset_goal 123 | self.sim.forward() 124 | 125 | #return 126 | return self._get_obs() -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if 
len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 | ipythondisplay.display(HTML(data='''<video alt="test" autoplay 17 | loop controls style="height: 400px;"> 18 | <source src="data:video/mp4;base64,{0}" type="video/mp4" /> 19 | </video>'''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env 27 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0]<max_length: 44 | padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1)) 45 | videos[i] = np.concatenate([videos[i], padding], 0) 46 | 47 | # log videos to tensorboard event file 48 | videos = np.stack(videos[:max_videos_to_save], 0) 49 | self.log_video(videos, video_title, step, fps=fps) 50 | 51 | def log_figures(self, figure, name, step, phase): 52 | """figure: matplotlib.pyplot figure handle""" 53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | ): 28 | """ 29 | Builds a feedforward neural network 30 | arguments: 31 | input_placeholder: placeholder variable for the state (batch_size, input_size) 32 | scope: variable scope of the network 33 | n_layers: number of hidden layers 34 | size: dimension of each hidden layer 35 | activation: activation of each hidden layer 36 | input_size: size of the input layer 37 | output_size: size of the output layer 38 | output_activation: activation of the output layer 39 | returns: 40 | output_placeholder: the result of a forward pass through the hidden layers + the output layer 41 | """ 42 | if isinstance(activation, str): 43 | activation = _str_to_activation[activation] 44 | if isinstance(output_activation, str): 45 | output_activation = _str_to_activation[output_activation] 46 | layers = [] 47 | in_size = input_size 48 | for _ in range(n_layers): 49 | layers.append(nn.Linear(in_size, size)) 50 | layers.append(activation) 51 | in_size = size 52 | layers.append(nn.Linear(in_size, output_size)) 53 | layers.append(output_activation) 54 | return nn.Sequential(*layers) 55 | 56 | 57 | device = None 58 | 59 | 60 | def init_gpu(use_gpu=True, gpu_id=0): 61 | global device 62 | if torch.cuda.is_available() and use_gpu: 63 | device = torch.device("cuda:" + str(gpu_id)) 64 | print("Using GPU id {}".format(gpu_id)) 65 | else: 66 | device = torch.device("cpu") 67 | print("GPU not detected. 
Defaulting to CPU.") 68 | 69 | 70 | def set_device(gpu_id): 71 | torch.cuda.set_device(gpu_id) 72 | 73 | 74 | def from_numpy(*args, **kwargs): 75 | return torch.from_numpy(*args, **kwargs).float().to(device) 76 | 77 | 78 | def to_numpy(tensor): 79 | return tensor.to('cpu').detach().numpy() 80 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | self.paths = [] 10 | self.obs = None 11 | self.acs = None 12 | self.concatenated_rews = None 13 | self.next_obs = None 14 | self.terminals = None 15 | 16 | def add_rollouts(self, paths, noised=False): 17 | 18 | # add new rollouts into our list of rollouts 19 | for path in paths: 20 | self.paths.append(path) 21 | 22 | # convert new rollouts into their component arrays, and append them onto our arrays 23 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 24 | 25 | if noised: 26 | observations = add_noise(observations) 27 | next_observations = add_noise(next_observations) 28 | 29 | if self.obs is None: 30 | self.obs = observations[-self.max_size:] 31 | self.acs = actions[-self.max_size:] 32 | self.next_obs = next_observations[-self.max_size:] 33 | self.terminals = terminals[-self.max_size:] 34 | self.concatenated_rews = concatenated_rews[-self.max_size:] 35 | else: 36 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 37 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 38 | self.next_obs = np.concatenate( 39 | [self.next_obs, next_observations] 40 | )[-self.max_size:] 41 | self.terminals = np.concatenate( 42 | [self.terminals, terminals] 43 | )[-self.max_size:] 44 | self.concatenated_rews = np.concatenate( 45 | [self.concatenated_rews, concatenated_rews] 46 | )[-self.max_size:] 47 | 48 | ######################################## 49 | ######################################## 50 | 51 | def sample_random_rollouts(self, num_rollouts): 52 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 53 | return self.paths[rand_indices] 54 | 55 | def sample_recent_rollouts(self, num_rollouts=1): 56 | return self.paths[-num_rollouts:] 57 | 58 | ######################################## 59 | ######################################## 60 | 61 | def sample_random_data(self, batch_size): 62 | 63 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 64 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 65 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 66 | 67 | def sample_recent_data(self, batch_size=1, concat_rew=True): 68 | 69 | if concat_rew: 70 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 71 | else: 72 | num_recent_rollouts_to_return = 0 73 | num_datapoints_so_far = 0 74 | index = -1 75 | while num_datapoints_so_far < batch_size: 76 | recent_rollout = self.paths[index] 77 | index -=1 78 | num_recent_rollouts_to_return +=1 79 | num_datapoints_so_far += get_pathlength(recent_rollout) 80 | rollouts_to_return = 
self.paths[-num_recent_rollouts_to_return:] 81 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 82 | return observations, actions, unconcatenated_rews, next_observations, terminals 83 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import copy 4 | 5 | ############################################ 6 | ############################################ 7 | 8 | def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): 9 | 10 | model = models[0] 11 | 12 | # true 13 | true_states = perform_actions(env, action_sequence)['observation'] 14 | 15 | # predicted 16 | ob = np.expand_dims(true_states[0],0) 17 | pred_states = [] 18 | for ac in action_sequence: 19 | pred_states.append(ob) 20 | action = np.expand_dims(ac,0) 21 | ob = model.get_prediction(ob, action, data_statistics) 22 | pred_states = np.squeeze(pred_states) 23 | 24 | # mpe 25 | mpe = mean_squared_error(pred_states, true_states) 26 | 27 | return mpe, true_states, pred_states 28 | 29 | def perform_actions(env, actions): 30 | ob = env.reset() 31 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 32 | steps = 0 33 | for ac in actions: 34 | obs.append(ob) 35 | acs.append(ac) 36 | ob, rew, done, _ = env.step(ac) 37 | # add the observation after taking a step to next_obs 38 | next_obs.append(ob) 39 | rewards.append(rew) 40 | steps += 1 41 | # If the episode ended, the corresponding terminal value is 1 42 | # otherwise, it is 0 43 | if done: 44 | terminals.append(1) 45 | break 46 | else: 47 | terminals.append(0) 48 | 49 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 50 | 51 | def mean_squared_error(a, b): 52 | return np.mean((a-b)**2) 53 | 54 | ############################################ 55 | ############################################ 56 | 57 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 58 | # TODO: get this from Piazza 59 | 60 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 61 | """ 62 | Collect rollouts using policy 63 | until we have collected min_timesteps_per_batch steps 64 | """ 65 | # TODO: get this from Piazza 66 | 67 | return paths, timesteps_this_batch 68 | 69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 70 | """ 71 | Collect ntraj rollouts using policy 72 | """ 73 | # TODO: get this from Piazza 74 | 75 | return paths 76 | 77 | ############################################ 78 | ############################################ 79 | 80 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 81 | """ 82 | Take info (separate arrays) from a single rollout 83 | and return it in a single dictionary 84 | """ 85 | if image_obs != []: 86 | image_obs = np.stack(image_obs, axis=0) 87 | return {"observation" : np.array(obs, dtype=np.float32), 88 | "image_obs" : np.array(image_obs, dtype=np.uint8), 89 | "reward" : np.array(rewards, dtype=np.float32), 90 | "action" : np.array(acs, dtype=np.float32), 91 | "next_observation": np.array(next_obs, dtype=np.float32), 92 | "terminal": np.array(terminals, dtype=np.float32)} 93 | 94 | 95 | def convert_listofrollouts(paths): 96 | """ 97 | Take a list of rollout dictionaries 98 | 
and return separate arrays, 99 | where each array is a concatenation of that array from across the rollouts 100 | """ 101 | observations = np.concatenate([path["observation"] for path in paths]) 102 | actions = np.concatenate([path["action"] for path in paths]) 103 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 104 | terminals = np.concatenate([path["terminal"] for path in paths]) 105 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 106 | unconcatenated_rewards = [path["reward"] for path in paths] 107 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 108 | 109 | ############################################ 110 | ############################################ 111 | 112 | def get_pathlength(path): 113 | return len(path["reward"]) 114 | 115 | def normalize(data, mean, std, eps=1e-8): 116 | return (data-mean)/(std+eps) 117 | 118 | def unnormalize(data, mean, std): 119 | return data*std+mean 120 | 121 | def add_noise(data_inp, noiseToSignal=0.01): 122 | 123 | data = copy.deepcopy(data_inp) #(num data points, dim) 124 | 125 | #mean of data 126 | mean_data = np.mean(data, axis=0) 127 | 128 | #if mean is 0, 129 | #make it 0.001 to avoid 0 issues later for dividing by std 130 | mean_data[mean_data == 0] = 0.000001 131 | 132 | #width of normal distribution to sample noise from 133 | #larger magnitude number = could have larger magnitude noise 134 | std_of_noise = mean_data * noiseToSignal 135 | for j in range(mean_data.shape[0]): 136 | data[:, j] = np.copy(data[:, j] + np.random.normal( 137 | 0, np.absolute(std_of_noise[j]), (data.shape[0],))) 138 | 139 | return data 140 | -------------------------------------------------------------------------------- /hw4/cs285/models/base_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Any 3 | 4 | 5 | Prediction = Any 6 | 7 | 8 | class BaseModel(object): 9 | def update(self, ob_no, next_ob_no, re_n, terminal_n) -> dict: 10 | raise NotImplementedError 11 | 12 | def get_prediction(self, ob_no, ac_na, data_statistics) -> Prediction: 13 | raise NotImplementedError 14 | 15 | def convert_prediction_to_numpy(self, pred: Prediction) -> np.ndarray: 16 | """Allow caller to be pytorch-agnostic.""" 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /hw4/cs285/policies/MPC_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .base_policy import BasePolicy 4 | 5 | 6 | class MPCPolicy(BasePolicy): 7 | 8 | def __init__(self, 9 | env, 10 | ac_dim, 11 | dyn_models, 12 | horizon, 13 | N, 14 | **kwargs 15 | ): 16 | super().__init__(**kwargs) 17 | 18 | # init vars 19 | self.env = env 20 | self.dyn_models = dyn_models 21 | self.horizon = horizon 22 | self.N = N 23 | self.data_statistics = None # NOTE must be updated from elsewhere 24 | 25 | self.ob_dim = self.env.observation_space.shape[0] 26 | 27 | # action space 28 | self.ac_space = self.env.action_space 29 | self.ac_dim = ac_dim 30 | self.low = self.ac_space.low 31 | self.high = self.ac_space.high 32 | 33 | def sample_action_sequences(self, num_sequences, horizon): 34 | # TODO(Q1) uniformly sample trajectories and return an array of 35 | # dimensions (num_sequences, horizon, self.ac_dim) in the range 36 | # [self.low, self.high] 37 | return random_action_sequences 38 | 39 | def 
get_action(self, obs): 40 | 41 | if self.data_statistics is None: 42 | # print("WARNING: performing random actions.") 43 | return self.sample_action_sequences(num_sequences=1, horizon=1)[0] 44 | 45 | # sample random actions (N x horizon) 46 | candidate_action_sequences = self.sample_action_sequences( 47 | num_sequences=self.N, horizon=self.horizon) 48 | 49 | # for each model in ensemble: 50 | predicted_sum_of_rewards_per_model = [] 51 | for model in self.dyn_models: 52 | sum_of_rewards = self.calculate_sum_of_rewards( 53 | obs, candidate_action_sequences, model) 54 | predicted_sum_of_rewards_per_model.append(sum_of_rewards) 55 | 56 | # calculate mean_across_ensembles(predicted rewards) 57 | predicted_rewards = np.mean( 58 | predicted_sum_of_rewards_per_model, axis=0) # [ens, N] --> N 59 | 60 | # pick the action sequence and return the 1st element of that sequence 61 | best_action_sequence = None # TODO (Q2) 62 | action_to_take = None # TODO (Q2) 63 | return action_to_take[None] # Unsqueeze the first index 64 | 65 | def calculate_sum_of_rewards(self, obs, candidate_action_sequences, model): 66 | """ 67 | 68 | :param obs: numpy array with the current observation. Shape [D_obs] 69 | :param candidate_action_sequences: numpy array with the candidate action 70 | sequences. Shape [N, H, D_action] where 71 | - N is the number of action sequences considered 72 | - H is the horizon 73 | - D_action is the dimension of the action 74 | :param model: The current dynamics model. 75 | :return: numpy array with the sum of rewards for each action sequence. 76 | The array should have shape [N]. 77 | """ 78 | sum_of_rewards = None # TODO (Q2) 79 | # For each candidate action sequence, predict a sequence of 80 | # states for each dynamics model in your ensemble. 81 | # Once you have a sequence of predicted states from each model in 82 | # your ensemble, calculate the sum of rewards for each sequence 83 | # using `self.env.get_reward(predicted_obs)` 84 | # You should sum across `self.horizon` time steps. 85 | # Hint: you should use model.get_prediction and you shouldn't need 86 | # to import pytorch in this file. 87 | # Hint: Remember that the model can process observations and actions 88 | # in batch, which can be much faster than looping through each 89 | # action sequence. 90 | return sum_of_rewards 91 | -------------------------------------------------------------------------------- /hw4/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw4/cs285/scripts/filter_events.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Usage: 4 | 5 | Run the command 6 | ``` 7 | python filter_events.py --event PATH/TO/EVENT_FILE 8 | ``` 9 | 10 | and it will write the events to a directory named after the event file's 11 | directory with `_filtered` appended, with the video events removed. 
12 | """ 13 | from __future__ import print_function 14 | import os 15 | import sys 16 | import argparse 17 | import tqdm 18 | 19 | # Adapted from 20 | # https://gist.github.com/serycjon/c9ad58ecc3176d87c49b69b598f4d6c6 21 | 22 | import tensorflow as tf 23 | 24 | 25 | def parse_arguments(): 26 | parser = argparse.ArgumentParser(description='') 27 | parser.add_argument('--event', help='event file', required=True) 28 | 29 | return parser.parse_args() 30 | 31 | 32 | def main(args): 33 | out_path = os.path.dirname(args.event) + '_filtered' 34 | writer = tf.summary.FileWriter(out_path) 35 | 36 | total = None 37 | for event in tqdm.tqdm(tf.train.summary_iterator(args.event), total=total): 38 | event_type = event.WhichOneof('what') 39 | if event_type != 'summary': 40 | writer.add_event(event) 41 | else: 42 | wall_time = event.wall_time 43 | step = event.step 44 | filtered_values = [value for value in event.summary.value if 45 | 'rollouts' not in value.tag] 46 | summary = tf.Summary(value=filtered_values) 47 | 48 | filtered_event = tf.summary.Event(summary=summary, 49 | wall_time=wall_time, 50 | step=step) 51 | writer.add_event(filtered_event) 52 | writer.close() 53 | return 0 54 | 55 | 56 | if __name__ == '__main__': 57 | args = parse_arguments() 58 | sys.exit(main(args)) 59 | -------------------------------------------------------------------------------- /hw4/cs285/scripts/read_results.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import tensorflow as tf 3 | 4 | def get_section_results(file): 5 | """ 6 | requires tensorflow==1.12.0 7 | """ 8 | X = [] 9 | Y = [] 10 | for e in tf.train.summary_iterator(file): 11 | for v in e.summary.value: 12 | if v.tag == 'Train_EnvstepsSoFar': 13 | X.append(v.simple_value) 14 | elif v.tag == 'Eval_AverageReturn': 15 | Y.append(v.simple_value) 16 | return X, Y 17 | 18 | if __name__ == '__main__': 19 | import glob 20 | 21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' 22 | eventfile = glob.glob(logdir)[0] 23 | 24 | X, Y = get_section_results(eventfile) 25 | for i, (x, y) in enumerate(zip(X, Y)): 26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) -------------------------------------------------------------------------------- /hw4/cs285/scripts/run_hw4_mb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.infrastructure.rl_trainer import RL_Trainer 5 | from cs285.agents.mb_agent import MBAgent 6 | 7 | 8 | class MB_Trainer(object): 9 | 10 | def __init__(self, params): 11 | 12 | ##################### 13 | ## SET AGENT PARAMS 14 | ##################### 15 | 16 | computation_graph_args = { 17 | 'ensemble_size': params['ensemble_size'], 18 | 'n_layers': params['n_layers'], 19 | 'size': params['size'], 20 | 'learning_rate': params['learning_rate'], 21 | } 22 | 23 | train_args = { 24 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 25 | } 26 | 27 | controller_args = { 28 | 'mpc_horizon': params['mpc_horizon'], 29 | 'mpc_num_action_sequences': params['mpc_num_action_sequences'], 30 | } 31 | 32 | agent_params = {**computation_graph_args, **train_args, **controller_args} 33 | 34 | self.params = params 35 | self.params['agent_class'] = MBAgent 36 | self.params['agent_params'] = agent_params 37 | 38 | ################ 39 | ## RL TRAINER 40 | ################ 41 | 42 | self.rl_trainer = RL_Trainer(self.params) 43 | 44 | def run_training_loop(self): 45 | 
46 | self.rl_trainer.run_training_loop( 47 | self.params['n_iter'], 48 | collect_policy = self.rl_trainer.agent.actor, 49 | eval_policy = self.rl_trainer.agent.actor, 50 | ) 51 | 52 | 53 | def main(): 54 | 55 | import argparse 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('--env_name', type=str) #reacher-cs285-v0, ant-cs285-v0, cheetah-cs285-v0, obstacles-cs285-v0 58 | parser.add_argument('--ep_len', type=int, default=200) 59 | parser.add_argument('--exp_name', type=str, default='todo') 60 | parser.add_argument('--n_iter', '-n', type=int, default=20) 61 | 62 | parser.add_argument('--ensemble_size', '-e', type=int, default=3) 63 | parser.add_argument('--mpc_horizon', type=int, default=10) 64 | parser.add_argument('--mpc_num_action_sequences', type=int, default=1000) 65 | 66 | parser.add_argument('--add_sl_noise', '-noise', action='store_true') 67 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1000) 68 | parser.add_argument('--batch_size_initial', type=int, default=20000) #(random) steps collected on 1st iteration (put into replay buffer) 69 | parser.add_argument('--batch_size', '-b', type=int, default=8000) #steps collected per train iteration (put into replay buffer) 70 | parser.add_argument('--train_batch_size', '-tb', type=int, default=512) ##steps used per gradient step (used for training) 71 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 72 | 73 | parser.add_argument('--learning_rate', '-lr', type=float, default=0.001) 74 | parser.add_argument('--n_layers', '-l', type=int, default=2) 75 | parser.add_argument('--size', '-s', type=int, default=250) 76 | 77 | parser.add_argument('--seed', type=int, default=1) 78 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 79 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 80 | parser.add_argument('--video_log_freq', type=int, default=1) #-1 to disable 81 | parser.add_argument('--scalar_log_freq', type=int, default=1) #-1 to disable 82 | parser.add_argument('--save_params', action='store_true') 83 | args = parser.parse_args() 84 | 85 | # convert to dictionary 86 | params = vars(args) 87 | 88 | # HARDCODE EPISODE LENGTHS FOR THE ENVS USED IN THIS MB ASSIGNMENT 89 | if params['env_name']=='reacher-cs285-v0': 90 | params['ep_len']=200 91 | if params['env_name']=='cheetah-cs285-v0': 92 | params['ep_len']=500 93 | if params['env_name']=='obstacles-cs285-v0': 94 | params['ep_len']=100 95 | 96 | ################################## 97 | ### CREATE DIRECTORY FOR LOGGING 98 | ################################## 99 | 100 | logdir_prefix = 'hw4_' # keep for autograder 101 | 102 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') 103 | 104 | if not (os.path.exists(data_path)): 105 | os.makedirs(data_path) 106 | 107 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 108 | logdir = os.path.join(data_path, logdir) 109 | params['logdir'] = logdir 110 | if not(os.path.exists(logdir)): 111 | os.makedirs(logdir) 112 | 113 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 114 | 115 | ################### 116 | ### RUN TRAINING 117 | ################### 118 | 119 | trainer = MB_Trainer(params) 120 | trainer.run_training_loop() 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /hw4/cs285_hw4.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw4/cs285_hw4.pdf -------------------------------------------------------------------------------- /hw4/requirements.txt: -------------------------------------------------------------------------------- 1 | gym[atari]==0.17.2 2 | mujoco-py==2.0.2.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | torch==1.5.1 10 | opencv-python==4.4.0.42 11 | ipdb==0.13.3 12 | box2d-py 13 | -------------------------------------------------------------------------------- /hw4/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | gym[atari]==0.17.2 2 | tensorboard==2.3.0 3 | tensorboardX==1.8 4 | matplotlib==2.2.2 5 | ipython==6.4.0 6 | moviepy==1.0.0 7 | pyvirtualdisplay==1.3.2 8 | torch==1.5.1 9 | opencv-python==4.4.0.42 10 | ipdb==0.13.3 11 | box2d-py 12 | -------------------------------------------------------------------------------- /hw4/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw5/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally. 6 | 7 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badges below: 8 | 9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw5/run_hw5_expl.ipynb) 10 | 11 | ## Complete the code 12 | 13 | The following files have blanks to be filled with your solutions from homework 1 and 3. The relevant sections are marked with `TODO: get this from Piazza'. 14 | 15 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 16 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 17 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 18 | - [policies/argmax_policy.py](cs285/policies/argmax_policy.py) 19 | - [critics/dqn_critic.py](cs285/critics/dqn_critic.py) 20 | 21 | You will then need to implement code in the following files: 22 | - [exploration/rnd_model.py](cs285/exploration/rnd_model.py) 23 | - [agents/explore_or_exploit_agent.py](cs285/agents/explore_or_exploit_agent.py) 24 | - [critics/cql_critic.py](cs285/critics/cql_critic.py) 25 | 26 | The relevant sections are marked with `TODO`. 
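For the offline-RL portion in `critics/cql_critic.py`, the extra term added to the standard TD loss is, per batch, roughly `alpha * (logsumexp_a Q(s, a) - Q(s, a_data))`, as described in the conservative Q-learning (CQL) paper. The snippet below is an editor's minimal PyTorch sketch of just that penalty for a discrete-action Q-network; `cql_penalty` and its tensor names are invented for the example, the `0.1` alpha is an arbitrary placeholder (the assignment reads it from `hparams['cql_alpha']`), and this is not the assignment's code.

```python
import torch

def cql_penalty(qa_values, data_actions, cql_alpha=0.1):
    """Conservative Q-learning regularizer for a discrete-action Q-network.

    qa_values:    (batch, num_actions) tensor of Q(s, a) for every action
    data_actions: (batch,) long tensor of the actions actually taken in the data
    """
    # Soft maximum over all actions: pushes Q-values down everywhere...
    q_logsumexp = torch.logsumexp(qa_values, dim=1)
    # ...while the Q-values of in-distribution (dataset) actions are pushed back up.
    q_data = torch.gather(qa_values, 1, data_actions.unsqueeze(1)).squeeze(1)
    return cql_alpha * (q_logsumexp - q_data).mean()

# Tiny usage example: a batch of 4 states with 3 discrete actions.
qa = torch.randn(4, 3)
acs = torch.tensor([0, 2, 1, 0])
print(cql_penalty(qa, acs))
```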
27 | 28 | You may also want to look through [scripts/run_hw5_expl.py](cs285/scripts/run_hw5_expl.py) (if running locally) or [run_hw5_expl.ipynb](run_hw5_expl.ipynb) (if running on Colab), though you will not need to edit this files beyond changing runtime arguments in the Colab notebook. 29 | 30 | See the [assignment PDF](hw5.pdf) for more details on what files to edit. 31 | 32 | For this particular assignment, you will need to install networkx==2.5 33 | -------------------------------------------------------------------------------- /hw5/cs285/agents/ac_agent.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from cs285.critics.bootstrapped_continuous_critic import \ 4 | BootstrappedContinuousCritic 5 | from cs285.infrastructure.replay_buffer import ReplayBuffer 6 | from cs285.infrastructure.utils import * 7 | from cs285.policies.MLP_policy import MLPPolicyAC 8 | from .base_agent import BaseAgent 9 | 10 | 11 | class ACAgent(BaseAgent): 12 | def __init__(self, env, agent_params): 13 | super(ACAgent, self).__init__() 14 | 15 | self.env = env 16 | self.agent_params = agent_params 17 | 18 | self.gamma = self.agent_params['gamma'] 19 | self.standardize_advantages = self.agent_params['standardize_advantages'] 20 | 21 | self.actor = MLPPolicyAC( 22 | self.agent_params['ac_dim'], 23 | self.agent_params['ob_dim'], 24 | self.agent_params['n_layers'], 25 | self.agent_params['size'], 26 | self.agent_params['discrete'], 27 | self.agent_params['learning_rate'], 28 | ) 29 | self.critic = BootstrappedContinuousCritic(self.agent_params) 30 | 31 | self.replay_buffer = ReplayBuffer() 32 | 33 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 34 | raise NotImplementedError 35 | # Not needed for this homework 36 | 37 | #################################### 38 | #################################### 39 | 40 | def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): 41 | raise NotImplementedError 42 | # Not needed for this homework 43 | 44 | #################################### 45 | #################################### 46 | -------------------------------------------------------------------------------- /hw5/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | class BaseAgent(object): 2 | def __init__(self, **kwargs): 3 | super(BaseAgent, self).__init__(**kwargs) 4 | 5 | def train(self) -> dict: 6 | """Return a dictionary of logging information.""" 7 | raise NotImplementedError 8 | 9 | def add_to_replay_buffer(self, paths): 10 | raise NotImplementedError 11 | 12 | def sample(self, batch_size): 13 | raise NotImplementedError 14 | 15 | def save(self, path): 16 | raise NotImplementedError -------------------------------------------------------------------------------- /hw5/cs285/agents/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pdb 3 | 4 | from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule 5 | from cs285.policies.argmax_policy import ArgMaxPolicy 6 | from cs285.critics.dqn_critic import DQNCritic 7 | 8 | 9 | class DQNAgent(object): 10 | def __init__(self, env, agent_params): 11 | 12 | self.env = env 13 | self.agent_params = agent_params 14 | self.batch_size = agent_params['batch_size'] 15 | # import ipdb; ipdb.set_trace() 16 | self.last_obs = self.env.reset() 17 | 18 | self.num_actions = agent_params['ac_dim'] 19 | 
self.learning_starts = agent_params['learning_starts'] 20 | self.learning_freq = agent_params['learning_freq'] 21 | self.target_update_freq = agent_params['target_update_freq'] 22 | 23 | self.replay_buffer_idx = None 24 | self.exploration = agent_params['exploration_schedule'] 25 | self.optimizer_spec = agent_params['optimizer_spec'] 26 | 27 | self.critic = DQNCritic(agent_params, self.optimizer_spec) 28 | self.actor = ArgMaxPolicy(self.critic) 29 | 30 | lander = agent_params['env_name'].startswith('LunarLander') 31 | self.replay_buffer = MemoryOptimizedReplayBuffer( 32 | agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander) 33 | self.t = 0 34 | self.num_param_updates = 0 35 | 36 | def add_to_replay_buffer(self, paths): 37 | pass 38 | 39 | def step_env(self): 40 | """ 41 | Step the env and store the transition 42 | At the end of this block of code, the simulator should have been 43 | advanced one step, and the replay buffer should contain one more transition. 44 | Note that self.last_obs must always point to the new latest observation. 45 | """ 46 | raise NotImplementedError 47 | # Not needed for this homework 48 | 49 | #################################### 50 | #################################### 51 | 52 | def sample(self, batch_size): 53 | if self.replay_buffer.can_sample(self.batch_size): 54 | return self.replay_buffer.sample(batch_size) 55 | else: 56 | return [],[],[],[],[] 57 | 58 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 59 | raise NotImplementedError 60 | # Not needed for this homework 61 | 62 | #################################### 63 | #################################### -------------------------------------------------------------------------------- /hw5/cs285/critics/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hw5/cs285/critics/base_critic.py: -------------------------------------------------------------------------------- 1 | class BaseCritic(object): 2 | def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n): 3 | raise NotImplementedError 4 | -------------------------------------------------------------------------------- /hw5/cs285/critics/bootstrapped_continuous_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | from torch import nn 3 | from torch import optim 4 | import pdb 5 | 6 | from cs285.infrastructure import pytorch_util as ptu 7 | 8 | 9 | class BootstrappedContinuousCritic(nn.Module, BaseCritic): 10 | """ 11 | Notes on notation: 12 | 13 | Prefixes and suffixes: 14 | ob - observation 15 | ac - action 16 | _no - this tensor should have shape (batch self.size /n/, observation dim) 17 | _na - this tensor should have shape (batch self.size /n/, action dim) 18 | _n - this tensor should have shape (batch self.size /n/) 19 | 20 | Note: batch self.size /n/ is defined at runtime. 
21 | is None 22 | """ 23 | def __init__(self, hparams): 24 | super().__init__() 25 | self.ob_dim = hparams['ob_dim'] 26 | self.ac_dim = hparams['ac_dim'] 27 | self.discrete = hparams['discrete'] 28 | self.size = hparams['size'] 29 | self.n_layers = hparams['n_layers'] 30 | self.learning_rate = hparams['learning_rate'] 31 | 32 | # critic parameters 33 | self.num_target_updates = hparams['num_target_updates'] 34 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] 35 | self.gamma = hparams['gamma'] 36 | self.critic_network = ptu.build_mlp( 37 | self.ob_dim, 38 | 1, 39 | n_layers=self.n_layers, 40 | size=self.size, 41 | ) 42 | self.critic_network.to(ptu.device) 43 | self.loss = nn.MSELoss() 44 | self.optimizer = optim.Adam( 45 | self.critic_network.parameters(), 46 | self.learning_rate, 47 | ) 48 | 49 | def forward(self, obs): 50 | return self.critic_network(obs).squeeze(1) 51 | 52 | def forward_np(self, obs): 53 | obs = ptu.from_numpy(obs) 54 | predictions = self(obs) 55 | return ptu.to_numpy(predictions) 56 | 57 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 58 | """ 59 | Update the parameters of the critic. 60 | 61 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 62 | Agent.sample_trajectories 63 | let num_paths be the number of paths sampled from Agent.sample_trajectories 64 | 65 | arguments: 66 | ob_no: shape: (sum_of_path_lengths, ob_dim) 67 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 68 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 69 | the reward for each timestep 70 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 71 | at that timestep of 0 if the episode did not end 72 | 73 | returns: 74 | nothing 75 | """ 76 | raise NotImplementedError 77 | # Not needed for this homework 78 | 79 | #################################### 80 | #################################### 81 | -------------------------------------------------------------------------------- /hw5/cs285/critics/cql_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | import torch 3 | import torch.optim as optim 4 | from torch.nn import utils 5 | from torch import nn 6 | import pdb 7 | 8 | from cs285.infrastructure import pytorch_util as ptu 9 | 10 | 11 | class CQLCritic(BaseCritic): 12 | 13 | def __init__(self, hparams, optimizer_spec, **kwargs): 14 | super().__init__(**kwargs) 15 | self.env_name = hparams['env_name'] 16 | self.ob_dim = hparams['ob_dim'] 17 | 18 | if isinstance(self.ob_dim, int): 19 | self.input_shape = (self.ob_dim,) 20 | else: 21 | self.input_shape = hparams['input_shape'] 22 | 23 | self.ac_dim = hparams['ac_dim'] 24 | self.double_q = hparams['double_q'] 25 | self.grad_norm_clipping = hparams['grad_norm_clipping'] 26 | self.gamma = hparams['gamma'] 27 | 28 | self.optimizer_spec = optimizer_spec 29 | network_initializer = hparams['q_func'] 30 | self.q_net = network_initializer(self.ob_dim, self.ac_dim) 31 | self.q_net_target = network_initializer(self.ob_dim, self.ac_dim) 32 | self.optimizer = self.optimizer_spec.constructor( 33 | self.q_net.parameters(), 34 | **self.optimizer_spec.optim_kwargs 35 | ) 36 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( 37 | self.optimizer, 38 | self.optimizer_spec.learning_rate_schedule, 39 | ) 40 | self.loss = nn.MSELoss() 41 | self.q_net.to(ptu.device) 42 
| self.q_net_target.to(ptu.device) 43 | self.cql_alpha = hparams['cql_alpha'] 44 | 45 | def dqn_loss(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 46 | qa_t_values = self.q_net(ob_no) 47 | q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1) 48 | qa_tp1_values = self.q_net_target(next_ob_no) 49 | 50 | next_actions = self.q_net(next_ob_no).argmax(dim=1) 51 | q_tp1 = torch.gather(qa_tp1_values, 1, next_actions.unsqueeze(1)).squeeze(1) 52 | 53 | target = reward_n + self.gamma * q_tp1 * (1 - terminal_n) 54 | target = target.detach() 55 | loss = self.loss(q_t_values, target) 56 | 57 | return loss, qa_t_values, q_t_values 58 | 59 | 60 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 61 | """ 62 | Update the parameters of the critic. 63 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 64 | Agent.sample_trajectories 65 | let num_paths be the number of paths sampled from Agent.sample_trajectories 66 | arguments: 67 | ob_no: shape: (sum_of_path_lengths, ob_dim) 68 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 69 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 70 | the reward for each timestep 71 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 72 | at that timestep of 0 if the episode did not end 73 | returns: 74 | nothing 75 | """ 76 | ob_no = ptu.from_numpy(ob_no) 77 | ac_na = ptu.from_numpy(ac_na).to(torch.long) 78 | next_ob_no = ptu.from_numpy(next_ob_no) 79 | reward_n = ptu.from_numpy(reward_n) 80 | terminal_n = ptu.from_numpy(terminal_n) 81 | 82 | loss, qa_t_values, q_t_values = self.dqn_loss( 83 | ob_no, ac_na, next_ob_no, reward_n, terminal_n 84 | ) 85 | 86 | # CQL Implementation 87 | # TODO: Implement CQL as described in the pdf and paper 88 | # Hint: After calculating cql_loss, augment the loss appropriately 89 | cql_loss = None 90 | 91 | self.optimizer.zero_grad() 92 | loss.backward() 93 | self.optimizer.step() 94 | 95 | info = {'Training Loss': ptu.to_numpy(loss)} 96 | 97 | # TODO: Uncomment these lines after implementing CQL 98 | # info['CQL Loss'] = ptu.to_numpy(cql_loss) 99 | # info['Data q-values'] = ptu.to_numpy(q_t_values).mean() 100 | # info['OOD q-values'] = ptu.to_numpy(q_t_logsumexp).mean() 101 | 102 | return info 103 | 104 | 105 | def update_target_network(self): 106 | for target_param, param in zip( 107 | self.q_net_target.parameters(), self.q_net.parameters() 108 | ): 109 | target_param.data.copy_(param.data) 110 | 111 | def qa_values(self, obs): 112 | obs = ptu.from_numpy(obs) 113 | qa_values = self.q_net(obs) 114 | return ptu.to_numpy(qa_values) 115 | -------------------------------------------------------------------------------- /hw5/cs285/critics/dqn_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | import torch 3 | import torch.optim as optim 4 | from torch.nn import utils 5 | from torch import nn 6 | import pdb 7 | 8 | from cs285.infrastructure import pytorch_util as ptu 9 | 10 | 11 | class DQNCritic(BaseCritic): 12 | 13 | def __init__(self, hparams, optimizer_spec, **kwargs): 14 | super().__init__(**kwargs) 15 | self.env_name = hparams['env_name'] 16 | self.ob_dim = hparams['ob_dim'] 17 | 18 | if isinstance(self.ob_dim, int): 19 | self.input_shape = (self.ob_dim,) 20 | else: 21 | self.input_shape = hparams['input_shape'] 22 | 23 | self.ac_dim = hparams['ac_dim'] 
24 | self.double_q = hparams['double_q'] 25 | self.grad_norm_clipping = hparams['grad_norm_clipping'] 26 | self.gamma = hparams['gamma'] 27 | 28 | self.optimizer_spec = optimizer_spec 29 | network_initializer = hparams['q_func'] 30 | self.q_net = network_initializer(self.ob_dim, self.ac_dim) 31 | self.q_net_target = network_initializer(self.ob_dim, self.ac_dim) 32 | self.optimizer = self.optimizer_spec.constructor( 33 | self.q_net.parameters(), 34 | **self.optimizer_spec.optim_kwargs 35 | ) 36 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( 37 | self.optimizer, 38 | self.optimizer_spec.learning_rate_schedule, 39 | ) 40 | self.loss = nn.SmoothL1Loss() # AKA Huber loss 41 | self.q_net.to(ptu.device) 42 | self.q_net_target.to(ptu.device) 43 | 44 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 45 | """ 46 | Update the parameters of the critic. 47 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 48 | Agent.sample_trajectories 49 | let num_paths be the number of paths sampled from Agent.sample_trajectories 50 | arguments: 51 | ob_no: shape: (sum_of_path_lengths, ob_dim) 52 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 53 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 54 | the reward for each timestep 55 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 56 | at that timestep of 0 if the episode did not end 57 | returns: 58 | nothing 59 | """ 60 | raise NotImplementedError 61 | # TODO: Get this from homework 3 62 | 63 | #################################### 64 | #################################### 65 | 66 | def update_target_network(self): 67 | for target_param, param in zip( 68 | self.q_net_target.parameters(), self.q_net.parameters() 69 | ): 70 | target_param.data.copy_(param.data) 71 | 72 | def qa_values(self, obs): 73 | obs = ptu.from_numpy(obs) 74 | qa_values = self.q_net(obs) 75 | return ptu.to_numpy(qa_values) 76 | -------------------------------------------------------------------------------- /hw5/cs285/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs import ant 2 | from cs285.envs import cheetah 3 | from cs285.envs import obstacles 4 | from cs285.envs import reacher -------------------------------------------------------------------------------- /hw5/cs285/envs/ant/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='ant-cs285-v0', 5 | entry_point='cs285.envs.ant:AntEnv', 6 | max_episode_steps=1000, 7 | ) 8 | from cs285.envs.ant.ant import AntEnv 9 | -------------------------------------------------------------------------------- /hw5/cs285/envs/box2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw5/cs285/envs/box2d/__init__.py -------------------------------------------------------------------------------- /hw5/cs285/envs/cheetah/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='cheetah-cs285-v0', 5 | entry_point='cs285.envs.cheetah:HalfCheetahEnv', 6 | max_episode_steps=1000, 7 | ) 8 | from cs285.envs.cheetah.cheetah import 
HalfCheetahEnv 9 | -------------------------------------------------------------------------------- /hw5/cs285/envs/cheetah/cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import mujoco_py 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | 8 | def __init__(self): 9 | 10 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 11 | utils.EzPickle.__init__(self) 12 | 13 | self.skip = self.frame_skip 14 | 15 | self.action_dim = self.ac_dim = self.action_space.shape[0] 16 | self.observation_dim = self.obs_dim = self.observation_space.shape[0] 17 | 18 | def get_reward(self, observations, actions): 19 | 20 | """get reward/s of given (observations, actions) datapoint or datapoints 21 | 22 | Args: 23 | observations: (batchsize, obs_dim) or (obs_dim,) 24 | actions: (batchsize, ac_dim) or (ac_dim,) 25 | 26 | Return: 27 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 28 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 29 | """ 30 | 31 | #initialize and reshape as needed, for batch mode 32 | self.reward_dict = {} 33 | if(len(observations.shape)==1): 34 | observations = np.expand_dims(observations, axis = 0) 35 | actions = np.expand_dims(actions, axis = 0) 36 | batch_mode = False 37 | else: 38 | batch_mode = True 39 | 40 | #get vars 41 | xvel = observations[:, 9].copy() 42 | body_angle = observations[:, 2].copy() 43 | front_leg = observations[:, 6].copy() 44 | front_shin = observations[:, 7].copy() 45 | front_foot = observations[:, 8].copy() 46 | zeros = np.zeros((observations.shape[0],)).copy() 47 | 48 | # ranges 49 | leg_range = 0.2 50 | shin_range = 0 51 | foot_range = 0 52 | penalty_factor = 10 53 | 54 | #calc rew 55 | self.reward_dict['run'] = xvel 56 | 57 | front_leg_rew = zeros.copy() 58 | front_leg_rew[front_leg>leg_range] = -penalty_factor 59 | self.reward_dict['leg'] = front_leg_rew 60 | 61 | front_shin_rew = zeros.copy() 62 | front_shin_rew[front_shin>shin_range] = -penalty_factor 63 | self.reward_dict['shin'] = front_shin_rew 64 | 65 | front_foot_rew = zeros.copy() 66 | front_foot_rew[front_foot>foot_range] = -penalty_factor 67 | self.reward_dict['foot'] = front_foot_rew 68 | 69 | # total reward 70 | self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot'] 71 | 72 | #return 73 | dones = zeros.copy() 74 | if(not batch_mode): 75 | return self.reward_dict['r_total'][0], dones[0] 76 | return self.reward_dict['r_total'], dones 77 | 78 | 79 | def get_score(self, obs): 80 | xposafter = obs[0] 81 | return xposafter 82 | 83 | ############################################## 84 | 85 | def step(self, action): 86 | 87 | #step 88 | self.do_simulation(action, self.frame_skip) 89 | 90 | #obs/reward/done/score 91 | ob = self._get_obs() 92 | rew, done = self.get_reward(ob, action) 93 | score = self.get_score(ob) 94 | 95 | #return 96 | env_info = {'obs_dict': self.obs_dict, 97 | 'rewards': self.reward_dict, 98 | 'score': score} 99 | return ob, rew, done, env_info 100 | 101 | def _get_obs(self): 102 | 103 | self.obs_dict = {} 104 | self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy() 105 | self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy() 106 | self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy() 107 | 108 | return np.concatenate([ 109 | self.obs_dict['joints_pos'], #9 110 | 
self.obs_dict['joints_vel'], #9 111 | self.obs_dict['com_torso'], #3 112 | ]) 113 | 114 | ############################################## 115 | 116 | def reset_model(self, seed=None): 117 | 118 | # set reset pose/vel 119 | self.reset_pose = self.init_qpos + self.np_random.uniform( 120 | low=-.1, high=.1, size=self.model.nq) 121 | self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 122 | 123 | #reset the env to that pose/vel 124 | return self.do_reset(self.reset_pose.copy(), self.reset_vel.copy()) 125 | 126 | 127 | def do_reset(self, reset_pose, reset_vel, reset_goal=None): 128 | 129 | #reset 130 | self.set_state(reset_pose, reset_vel) 131 | 132 | #return 133 | return self._get_obs() 134 | -------------------------------------------------------------------------------- /hw5/cs285/envs/obstacles/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='obstacles-cs285-v0', 5 | entry_point='cs285.envs.obstacles:Obstacles', 6 | max_episode_steps=500, 7 | ) 8 | from cs285.envs.obstacles.obstacles_env import Obstacles 9 | -------------------------------------------------------------------------------- /hw5/cs285/envs/reacher/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='reacher-cs285-v0', 5 | entry_point='cs285.envs.reacher:Reacher7DOFEnv', 6 | max_episode_steps=500, 7 | ) 8 | from cs285.envs.reacher.reacher_env import Reacher7DOFEnv 9 | -------------------------------------------------------------------------------- /hw5/cs285/envs/reacher/reacher_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | from mujoco_py import MjViewer 5 | import os 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | 10 | # placeholder 11 | self.hand_sid = -2 12 | self.target_sid = -1 13 | 14 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 15 | mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2) 16 | utils.EzPickle.__init__(self) 17 | self.observation_dim = 26 18 | self.action_dim = 7 19 | 20 | self.hand_sid = self.model.site_name2id("finger") 21 | self.target_sid = self.model.site_name2id("target") 22 | self.skip = self.frame_skip 23 | 24 | 25 | def _get_obs(self): 26 | return np.concatenate([ 27 | self.data.qpos.flat, #[7] 28 | self.data.qvel.flatten() / 10., #[7] 29 | self.data.site_xpos[self.hand_sid], #[3] 30 | self.model.site_pos[self.target_sid], #[3] 31 | ]) 32 | 33 | def step(self, a): 34 | 35 | self.do_simulation(a, self.frame_skip) 36 | ob = self._get_obs() 37 | reward, done = self.get_reward(ob, a) 38 | 39 | score = self.get_score(ob) 40 | 41 | # finalize step 42 | env_info = {'ob': ob, 43 | 'rewards': self.reward_dict, 44 | 'score': score} 45 | 46 | return ob, reward, done, env_info 47 | 48 | def get_score(self, obs): 49 | hand_pos = obs[-6:-3] 50 | target_pos = obs[-3:] 51 | score = -1*np.abs(hand_pos-target_pos) 52 | return score 53 | 54 | def get_reward(self, observations, actions): 55 | 56 | """get reward/s of given (observations, actions) datapoint or datapoints 57 | 58 | Args: 59 | observations: (batchsize, obs_dim) or (obs_dim,) 60 | actions: (batchsize, ac_dim) or (ac_dim,) 61 | 62 | Return: 63 | r_total: reward of this (o,a) pair, dimension is 
(batchsize,1) or (1,) 64 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 65 | """ 66 | 67 | #initialize and reshape as needed, for batch mode 68 | self.reward_dict = {} 69 | if(len(observations.shape)==1): 70 | observations = np.expand_dims(observations, axis = 0) 71 | actions = np.expand_dims(actions, axis = 0) 72 | batch_mode = False 73 | else: 74 | batch_mode = True 75 | 76 | #get vars 77 | hand_pos = observations[:, -6:-3] 78 | target_pos = observations[:, -3:] 79 | 80 | #calc rew 81 | dist = np.linalg.norm(hand_pos - target_pos, axis=1) 82 | self.reward_dict['r_total'] = -10*dist 83 | 84 | #done is always false for this env 85 | dones = np.zeros((observations.shape[0],)) 86 | 87 | #return 88 | if(not batch_mode): 89 | return self.reward_dict['r_total'][0], dones[0] 90 | return self.reward_dict['r_total'], dones 91 | 92 | def reset(self): 93 | _ = self.reset_model() 94 | 95 | self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1] 96 | 97 | observation, _reward, done, _info = self.step(np.zeros(7)) 98 | ob = self._get_obs() 99 | 100 | return ob 101 | 102 | def reset_model(self, seed=None): 103 | if seed is not None: 104 | self.seed(seed) 105 | 106 | self.reset_pose = self.init_qpos.copy() 107 | self.reset_vel = self.init_qvel.copy() 108 | 109 | self.reset_goal = np.zeros(3) 110 | self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3) 111 | self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2) 112 | self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25) 113 | 114 | return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal) 115 | 116 | def do_reset(self, reset_pose, reset_vel, reset_goal): 117 | 118 | self.set_state(reset_pose, reset_vel) 119 | 120 | #reset target 121 | self.reset_goal = reset_goal.copy() 122 | self.model.site_pos[self.target_sid] = self.reset_goal 123 | self.sim.forward() 124 | 125 | #return 126 | return self._get_obs() -------------------------------------------------------------------------------- /hw5/cs285/exploration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw5/cs285/exploration/__init__.py -------------------------------------------------------------------------------- /hw5/cs285/exploration/base_exploration_model.py: -------------------------------------------------------------------------------- 1 | class BaseExplorationModel(object): 2 | def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n): 3 | raise NotImplementedError -------------------------------------------------------------------------------- /hw5/cs285/exploration/rnd_model.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure import pytorch_util as ptu 2 | from .base_exploration_model import BaseExplorationModel 3 | import torch.optim as optim 4 | from torch import nn 5 | import torch 6 | 7 | def init_method_1(model): 8 | model.weight.data.uniform_() 9 | model.bias.data.uniform_() 10 | 11 | def init_method_2(model): 12 | model.weight.data.normal_() 13 | model.bias.data.normal_() 14 | 15 | 16 | class RNDModel(nn.Module, BaseExplorationModel): 17 | def __init__(self, hparams, optimizer_spec, **kwargs): 18 | super().__init__(**kwargs) 19 | self.ob_dim = hparams['ob_dim'] 20 | self.output_size = hparams['rnd_output_size'] 21 | self.n_layers = hparams['rnd_n_layers'] 22 | self.size = hparams['rnd_size'] 
23 | self.optimizer_spec = optimizer_spec 24 | 25 | # TODO: Create two neural networks: 26 | # 1) f, the random function we are trying to learn 27 | # 2) f_hat, the function we are using to learn f 28 | # WARNING: Make sure you use different types of weight 29 | # initializations for these two functions 30 | 31 | # HINT 1) Check out the method ptu.build_mlp 32 | # HINT 2) There are two weight init methods defined above 33 | 34 | self.f = None 35 | self.f_hat = None 36 | 37 | self.optimizer = self.optimizer_spec.constructor( 38 | self.f_hat.parameters(), 39 | **self.optimizer_spec.optim_kwargs 40 | ) 41 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( 42 | self.optimizer, 43 | self.optimizer_spec.learning_rate_schedule, 44 | ) 45 | 46 | self.f.to(ptu.device) 47 | self.f_hat.to(ptu.device) 48 | 49 | def forward(self, ob_no): 50 | # TODO: Get the prediction error for ob_no 51 | # HINT: Remember to detach the output of self.f! 52 | error = None 53 | return error 54 | 55 | def forward_np(self, ob_no): 56 | ob_no = ptu.from_numpy(ob_no) 57 | error = self(ob_no) 58 | return ptu.to_numpy(error) 59 | 60 | def update(self, ob_no): 61 | # TODO: Update f_hat using ob_no 62 | # Hint: Take the mean prediction error across the batch 63 | loss = None 64 | return loss.item() 65 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 | ipythondisplay.display(HTML(data='''<video alt="test" autoplay 17 | loop controls style="height: 400px;"> 18 | <source src="data:video/mp4;base64,{0}" type="video/mp4" /> 19 | </video>'''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env 27 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
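# A possible sketch for the TODOs in exploration/rnd_model.py above (not the official solution).
# RND trains a predictor f_hat to match a fixed, randomly initialized target f, and the prediction
# error serves as the exploration bonus. The sketch follows the hints (ptu.build_mlp plus the two
# init methods defined in that file); converting ob_no with ptu.from_numpy inside update is an
# assumption about how the agent calls it, not something the starter code confirms.
#
#     self.f = ptu.build_mlp(self.ob_dim, self.output_size, self.n_layers, self.size,
#                            init_method=init_method_1)          # fixed random target network
#     self.f_hat = ptu.build_mlp(self.ob_dim, self.output_size, self.n_layers, self.size,
#                                init_method=init_method_2)      # learned predictor network
#
#     def forward(self, ob_no):
#         targets = self.f(ob_no).detach()                       # no gradient through the target
#         predictions = self.f_hat(ob_no)
#         error = torch.norm(predictions - targets, dim=1)       # per-state prediction error
#         return error
#
#     def update(self, ob_no):
#         ob_no = ptu.from_numpy(ob_no)
#         loss = self(ob_no).mean()                              # mean prediction error across the batch
#         self.optimizer.zero_grad()
#         loss.backward()
#         self.optimizer.step()
#         return loss.item()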
27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0]<max_length: 44 | padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1)) 45 | videos[i] = np.concatenate([videos[i], padding], 0) 46 | 47 | # log videos to tensorboard event file 48 | videos = np.stack(videos[:max_videos_to_save], 0) 49 | self.log_video(videos, video_title, step, fps=fps) 50 | 51 | def log_figures(self, figure, name, step, phase): 52 | """figure: matplotlib.pyplot figure handle""" 53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!" 54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | init_method=None, 28 | ): 29 | """ 30 | Builds a feedforward neural network 31 | arguments: 32 | input_placeholder: placeholder variable for the state (batch_size, input_size) 33 | scope: variable scope of the network 34 | n_layers: number of hidden layers 35 | size: dimension of each hidden layer 36 | activation: activation of each hidden layer 37 | input_size: size of the input layer 38 | output_size: size of the output layer 39 | output_activation: activation of the output layer 40 | returns: 41 | output_placeholder: the result of a forward pass through the hidden layers + the output layer 42 | """ 43 | if isinstance(activation, str): 44 | activation = _str_to_activation[activation] 45 | if isinstance(output_activation, str): 46 | output_activation = _str_to_activation[output_activation] 47 | layers = [] 48 | in_size = input_size 49 | for _ in range(n_layers): 50 | curr_layer = nn.Linear(in_size, size) 51 | if init_method is not None: 52 | curr_layer.apply(init_method) 53 | layers.append(curr_layer) 54 | layers.append(activation) 55 | in_size = size 56 | 57 | last_layer = nn.Linear(in_size, output_size) 58 | if init_method is not None: 59 | last_layer.apply(init_method) 60 | 61 | layers.append(last_layer) 62 | layers.append(output_activation) 63 | 64 | return nn.Sequential(*layers)
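# The docstring of build_mlp above does not mention the init_method argument: when it is given,
# it is applied to every nn.Linear layer via .apply(), which is what lets rnd_model.py build two
# networks with different weight initializations. A small illustration with hypothetical sizes
# (the init functions come from cs285/exploration/rnd_model.py; none of this is in the original file):
#
#     from cs285.exploration.rnd_model import init_method_1, init_method_2
#     f = build_mlp(input_size=2, output_size=5, n_layers=2, size=400, init_method=init_method_1)
#     f_hat = build_mlp(input_size=2, output_size=5, n_layers=2, size=400, init_method=init_method_2)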
65 | 66 | 67 | device = None 68 | 69 | 70 | def init_gpu(use_gpu=True, gpu_id=0): 71 | global device 72 | if torch.cuda.is_available() and use_gpu: 73 | device = torch.device("cuda:" + str(gpu_id)) 74 | print("Using GPU id {}".format(gpu_id)) 75 | else: 76 | device = torch.device("cpu") 77 | print("GPU not detected. Defaulting to CPU.") 78 | 79 | 80 | def set_device(gpu_id): 81 | torch.cuda.set_device(gpu_id) 82 | 83 | 84 | def from_numpy(*args, **kwargs): 85 | return torch.from_numpy(*args, **kwargs).float().to(device) 86 | 87 | 88 | def to_numpy(tensor): 89 | return tensor.to('cpu').detach().numpy() 90 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | self.paths = [] 10 | self.obs = None 11 | self.acs = None 12 | self.concatenated_rews = None 13 | self.unconcatenated_rews = None 14 | self.next_obs = None 15 | self.terminals = None 16 | 17 | def add_rollouts(self, paths, noised=False): 18 | 19 | # add new rollouts into our list of rollouts 20 | for path in paths: 21 | tpath = dict() 22 | # print (path.keys()) 23 | tpath['observation'] = path['observations'] 24 | tpath['next_observation'] = path['next_observations'] 25 | tpath['reward'] = path['rewards'] 26 | tpath['action'] = path['actions'] 27 | tpath['terminal'] = path['terminals'] 28 | self.paths.append(tpath) 29 | 30 | # convert new rollouts into their component arrays, and append them onto our arrays 31 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(self.paths) 32 | 33 | if noised: 34 | observations = add_noise(observations) 35 | next_observations = add_noise(next_observations) 36 | 37 | if self.obs is None: 38 | self.obs = observations[-self.max_size:] 39 | self.acs = actions[-self.max_size:] 40 | self.next_obs = next_observations[-self.max_size:] 41 | self.terminals = terminals[-self.max_size:] 42 | self.concatenated_rews = concatenated_rews[-self.max_size:] 43 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 44 | else: 45 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 46 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 47 | self.next_obs = np.concatenate( 48 | [self.next_obs, next_observations] 49 | )[-self.max_size:] 50 | self.terminals = np.concatenate( 51 | [self.terminals, terminals] 52 | )[-self.max_size:] 53 | self.concatenated_rews = np.concatenate( 54 | [self.concatenated_rews, concatenated_rews] 55 | )[-self.max_size:] 56 | if isinstance(unconcatenated_rews, list): 57 | self.unconcatenated_rews += unconcatenated_rews # TODO keep only latest max_size around 58 | else: 59 | self.unconcatenated_rews.append(unconcatenated_rews) # TODO keep only latest max_size around 60 | 61 | print (self.terminals.sum()) 62 | ######################################## 63 | ######################################## 64 | 65 | def sample_random_rollouts(self, num_rollouts): 66 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 67 | return self.paths[rand_indices] 68 | 69 | def sample_recent_rollouts(self, num_rollouts=1): 70 | return self.paths[-num_rollouts:] 71 | 72 | def can_sample(self, batch_size): 73 | # print (self.obs.shape[0]) 74 | if self.obs.shape[0] > batch_size: 75 | 
return True 76 | else: 77 | return False 78 | 79 | ######################################## 80 | ######################################## 81 | 82 | def sample_random_data(self, batch_size): 83 | 84 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 85 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 86 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 87 | 88 | def sample(self, batch_size): 89 | return self.sample_random_data(batch_size) 90 | 91 | def sample_recent_data(self, batch_size=1, concat_rew=True): 92 | 93 | if concat_rew: 94 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 95 | else: 96 | num_recent_rollouts_to_return = 0 97 | num_datapoints_so_far = 0 98 | index = -1 99 | while num_datapoints_so_far < batch_size: 100 | recent_rollout = self.paths[index] 101 | index -=1 102 | num_recent_rollouts_to_return +=1 103 | num_datapoints_so_far += get_pathlength(recent_rollout) 104 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 105 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 106 | return observations, actions, unconcatenated_rews, next_observations, terminals 107 | -------------------------------------------------------------------------------- /hw5/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import itertools 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from torch import optim 6 | 7 | import numpy as np 8 | import torch 9 | from torch import distributions 10 | 11 | from cs285.infrastructure import pytorch_util as ptu 12 | from cs285.policies.base_policy import BasePolicy 13 | 14 | 15 | class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): 16 | 17 | def __init__(self, 18 | ac_dim, 19 | ob_dim, 20 | n_layers, 21 | size, 22 | discrete=False, 23 | learning_rate=1e-4, 24 | training=True, 25 | nn_baseline=False, 26 | **kwargs 27 | ): 28 | super().__init__(**kwargs) 29 | 30 | # init vars 31 | self.ac_dim = ac_dim 32 | self.ob_dim = ob_dim 33 | self.n_layers = n_layers 34 | self.discrete = discrete 35 | self.size = size 36 | self.learning_rate = learning_rate 37 | self.training = training 38 | self.nn_baseline = nn_baseline 39 | 40 | if self.discrete: 41 | self.logits_na = ptu.build_mlp(input_size=self.ob_dim, 42 | output_size=self.ac_dim, 43 | n_layers=self.n_layers, 44 | size=self.size) 45 | self.logits_na.to(ptu.device) 46 | self.mean_net = None 47 | self.logstd = None 48 | self.optimizer = optim.Adam(self.logits_na.parameters(), 49 | self.learning_rate) 50 | else: 51 | self.logits_na = None 52 | self.mean_net = ptu.build_mlp(input_size=self.ob_dim, 53 | output_size=self.ac_dim, 54 | n_layers=self.n_layers, size=self.size) 55 | self.logstd = nn.Parameter( 56 | torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) 57 | ) 58 | self.mean_net.to(ptu.device) 59 | self.logstd.to(ptu.device) 60 | self.optimizer = optim.Adam( 61 | itertools.chain([self.logstd], self.mean_net.parameters()), 62 | self.learning_rate 63 | ) 64 | 65 | if nn_baseline: 66 | self.baseline = ptu.build_mlp( 67 | input_size=self.ob_dim, 68 | output_size=1, 69 | 
n_layers=self.n_layers, 70 | size=self.size, 71 | ) 72 | self.baseline.to(ptu.device) 73 | self.baseline_optimizer = optim.Adam( 74 | self.baseline.parameters(), 75 | self.learning_rate, 76 | ) 77 | else: 78 | self.baseline = None 79 | 80 | ################################## 81 | 82 | def save(self, filepath): 83 | torch.save(self.state_dict(), filepath) 84 | 85 | ################################## 86 | 87 | # query the policy with observation(s) to get selected action(s) 88 | def get_action(self, obs: np.ndarray) -> np.ndarray: 89 | raise NotImplementedError 90 | # TODO: get this from hw1 91 | 92 | #################################### 93 | #################################### 94 | 95 | # update/train this policy 96 | def update(self, observations, actions, **kwargs): 97 | raise NotImplementedError 98 | 99 | # This function defines the forward pass of the network. 100 | # You can return anything you want, but you should be able to differentiate 101 | # through it. For example, you can return a torch.FloatTensor. You can also 102 | # return more flexible objects, such as a 103 | # `torch.distributions.Distribution` object. It's up to you! 104 | def forward(self, observation: torch.FloatTensor): 105 | raise NotImplementedError 106 | # TODO: get this from hw1 107 | 108 | #################################### 109 | #################################### 110 | 111 | 112 | ##################################################### 113 | ##################################################### 114 | 115 | 116 | class MLPPolicyAC(MLPPolicy): 117 | # MJ: cut acs_labels_na and qvals from the signature if they are not used 118 | def update( 119 | self, observations, actions, 120 | adv_n=None, acs_labels_na=None, qvals=None 121 | ): 122 | raise NotImplementedError 123 | # Not needed for this homework 124 | 125 | #################################### 126 | #################################### 127 | -------------------------------------------------------------------------------- /hw5/cs285/policies/argmax_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pdb 3 | 4 | 5 | class ArgMaxPolicy(object): 6 | 7 | def __init__(self, critic): 8 | self.critic = critic 9 | 10 | def set_critic(self, critic): 11 | self.critic = critic 12 | 13 | def get_action(self, obs): 14 | # MJ: changed the dimension check to a 3 15 | if len(obs.shape) > 3: 16 | observation = obs 17 | else: 18 | observation = obs[None] 19 | 20 | raise NotImplementedError 21 | # TODO: get this from hw3 22 | 23 | #################################### 24 | #################################### -------------------------------------------------------------------------------- /hw5/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw5/cs285/scripts/read_results.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import tensorflow as tf 3 | 4 | def get_section_results(file): 5 | """ 
6 | requires tensorflow==1.12.0 7 | """ 8 | X = [] 9 | Y = [] 10 | for e in tf.train.summary_iterator(file): 11 | for v in e.summary.value: 12 | if v.tag == 'Train_EnvstepsSoFar': 13 | X.append(v.simple_value) 14 | elif v.tag == 'Eval_AverageReturn': 15 | Y.append(v.simple_value) 16 | return X, Y 17 | 18 | if __name__ == '__main__': 19 | import glob 20 | 21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' 22 | eventfile = glob.glob(logdir)[0] 23 | 24 | X, Y = get_section_results(eventfile) 25 | for i, (x, y) in enumerate(zip(X, Y)): 26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) -------------------------------------------------------------------------------- /hw5/cs285/scripts/run_hw5_expl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.infrastructure.rl_trainer import RL_Trainer 5 | from cs285.agents.explore_or_exploit_agent import ExplorationOrExploitationAgent 6 | from cs285.infrastructure.dqn_utils import get_env_kwargs, PiecewiseSchedule, ConstantSchedule 7 | 8 | 9 | class Q_Trainer(object): 10 | 11 | def __init__(self, params): 12 | self.params = params 13 | 14 | train_args = { 15 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 16 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 17 | 'train_batch_size': params['batch_size'], 18 | 'double_q': params['double_q'], 19 | } 20 | 21 | env_args = get_env_kwargs(params['env_name']) 22 | 23 | self.agent_params = {**train_args, **env_args, **params} 24 | 25 | self.params['agent_class'] = ExplorationOrExploitationAgent 26 | self.params['agent_params'] = self.agent_params 27 | self.params['train_batch_size'] = params['batch_size'] 28 | self.params['env_wrappers'] = self.agent_params['env_wrappers'] 29 | 30 | self.rl_trainer = RL_Trainer(self.params) 31 | 32 | def run_training_loop(self): 33 | self.rl_trainer.run_training_loop( 34 | self.agent_params['num_timesteps'], 35 | collect_policy = self.rl_trainer.agent.actor, 36 | eval_policy = self.rl_trainer.agent.actor, 37 | ) 38 | 39 | def main(): 40 | 41 | import argparse 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | '--env_name', 45 | default='PointmassHard-v0', 46 | choices=('PointmassEasy-v0', 'PointmassMedium-v0', 'PointmassHard-v0', 'PointmassVeryHard-v0') 47 | ) 48 | 49 | parser.add_argument('--exp_name', type=str, default='todo') 50 | 51 | parser.add_argument('--eval_batch_size', type=int, default=1000) 52 | parser.add_argument('--batch_size', type=int, default=256) 53 | 54 | parser.add_argument('--use_rnd', action='store_true') 55 | parser.add_argument('--num_exploration_steps', type=int, default=10000) 56 | parser.add_argument('--unsupervised_exploration', action='store_true') 57 | 58 | parser.add_argument('--offline_exploitation', action='store_true') 59 | parser.add_argument('--cql_alpha', type=float, default=0.0) 60 | 61 | parser.add_argument('--exploit_rew_shift', type=float, default=0.0) 62 | parser.add_argument('--exploit_rew_scale', type=float, default=1.0) 63 | 64 | parser.add_argument('--rnd_output_size', type=int, default=5) 65 | parser.add_argument('--rnd_n_layers', type=int, default=2) 66 | parser.add_argument('--rnd_size', type=int, default=400) 67 | 68 | parser.add_argument('--seed', type=int, default=2) 69 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 70 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 71 | 
parser.add_argument('--scalar_log_freq', type=int, default=int(1e3)) 72 | parser.add_argument('--save_params', action='store_true') 73 | 74 | args = parser.parse_args() 75 | 76 | # convert to dictionary 77 | params = vars(args) 78 | params['double_q'] = True 79 | params['num_agent_train_steps_per_iter'] = 1 80 | params['num_critic_updates_per_agent_update'] = 1 81 | params['exploit_weight_schedule'] = ConstantSchedule(1.0) 82 | params['video_log_freq'] = -1 # This param is not used for DQN 83 | params['num_timesteps'] = 50000 84 | params['learning_starts'] = 2000 85 | params['eps'] = 0.2 86 | ################################## 87 | ### CREATE DIRECTORY FOR LOGGING 88 | ################################## 89 | 90 | if params['env_name']=='PointmassEasy-v0': 91 | params['ep_len']=50 92 | if params['env_name']=='PointmassMedium-v0': 93 | params['ep_len']=150 94 | if params['env_name']=='PointmassHard-v0': 95 | params['ep_len']=100 96 | if params['env_name']=='PointmassVeryHard-v0': 97 | params['ep_len']=200 98 | 99 | if params['use_rnd']: 100 | params['explore_weight_schedule'] = PiecewiseSchedule([(0,1), (params['num_exploration_steps'], 0)], outside_value=0.0) 101 | else: 102 | params['explore_weight_schedule'] = ConstantSchedule(0.0) 103 | 104 | if params['unsupervised_exploration']: 105 | params['explore_weight_schedule'] = ConstantSchedule(1.0) 106 | params['exploit_weight_schedule'] = ConstantSchedule(0.0) 107 | 108 | if not params['use_rnd']: 109 | params['learning_starts'] = params['num_exploration_steps'] 110 | 111 | 112 | logdir_prefix = 'hw5_expl_' # keep for autograder 113 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') 114 | 115 | if not (os.path.exists(data_path)): 116 | os.makedirs(data_path) 117 | 118 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 119 | logdir = os.path.join(data_path, logdir) 120 | params['logdir'] = logdir 121 | if not(os.path.exists(logdir)): 122 | os.makedirs(logdir) 123 | 124 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 125 | 126 | trainer = Q_Trainer(params) 127 | trainer.run_training_loop() 128 | 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /hw5/hw5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw5/hw5.pdf -------------------------------------------------------------------------------- /hw5/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.17.2 2 | mujoco-py==2.0.2.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | torch==1.5.1 10 | opencv-python==4.4.0.42 11 | networkx==2.5 12 | ipdb==0.13.3 13 | box2d-py 14 | -------------------------------------------------------------------------------- /hw5/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | gym==0.17.2 2 | tensorboard==2.3.0 3 | tensorboardX==1.8 4 | matplotlib==2.2.2 5 | ipython==6.4.0 6 | moviepy==1.0.0 7 | pyvirtualdisplay==1.3.2 8 | torch==1.5.1 9 | opencv-python==4.4.0.42 10 | networkx==2.5 11 | ipdb==0.13.3 12 | box2d-py 13 | -------------------------------------------------------------------------------- /hw5/setup.py: 
-------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) --------------------------------------------------------------------------------
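With the setup.py above, a typical workflow (assuming the commands are run from the hw5/ directory; the exp_name value is only an illustrative placeholder, while the script path and flags come from run_hw5_expl.py earlier in this listing) is:

    pip install -r requirements.txt
    pip install -e .
    python cs285/scripts/run_hw5_expl.py --env_name PointmassEasy-v0 --use_rnd --unsupervised_exploration --exp_name test_rnd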