├── .gitignore
├── README.md
├── hw1
│   ├── README.md
│   ├── cs285
│   │   ├── agents
│   │   │   ├── __init__.py
│   │   │   ├── base_agent.py
│   │   │   └── bc_agent.py
│   │   ├── expert_data
│   │   │   ├── expert_data_Ant-v2.pkl
│   │   │   ├── expert_data_HalfCheetah-v2.pkl
│   │   │   ├── expert_data_Hopper-v2.pkl
│   │   │   ├── expert_data_Humanoid-v2.pkl
│   │   │   └── expert_data_Walker2d-v2.pkl
│   │   ├── infrastructure
│   │   │   ├── __init__.py
│   │   │   ├── colab_utils.py
│   │   │   ├── logger.py
│   │   │   ├── pytorch_util.py
│   │   │   ├── replay_buffer.py
│   │   │   ├── rl_trainer.py
│   │   │   └── utils.py
│   │   ├── policies
│   │   │   ├── MLP_policy.py
│   │   │   ├── __init__.py
│   │   │   ├── base_policy.py
│   │   │   ├── experts
│   │   │   │   ├── Ant.pkl
│   │   │   │   ├── HalfCheetah.pkl
│   │   │   │   ├── Hopper.pkl
│   │   │   │   ├── Humanoid.pkl
│   │   │   │   └── Walker2d.pkl
│   │   │   └── loaded_gaussian_policy.py
│   │   └── scripts
│   │       ├── run_hw1.ipynb
│   │       └── run_hw1.py
│   ├── cs285_hw1.pdf
│   ├── installation.md
│   ├── requirements.txt
│   ├── requirements_colab.txt
│   └── setup.py
├── hw2
│   ├── README.md
│   ├── cs285
│   │   ├── agents
│   │   │   ├── __init__.py
│   │   │   ├── base_agent.py
│   │   │   └── pg_agent.py
│   │   ├── infrastructure
│   │   │   ├── __init__.py
│   │   │   ├── colab_utils.py
│   │   │   ├── logger.py
│   │   │   ├── pytorch_util.py
│   │   │   ├── replay_buffer.py
│   │   │   ├── rl_trainer.py
│   │   │   └── utils.py
│   │   ├── policies
│   │   │   ├── MLP_policy.py
│   │   │   ├── __init__.py
│   │   │   └── base_policy.py
│   │   └── scripts
│   │       ├── read_results.py
│   │       ├── run_hw2.ipynb
│   │       └── run_hw2.py
│   ├── cs285_hw2.pdf
│   ├── requirements.txt
│   ├── requirements_colab.txt
│   └── setup.py
├── hw3
│   ├── README.md
│   ├── cs285
│   │   ├── agents
│   │   │   ├── ac_agent.py
│   │   │   ├── base_agent.py
│   │   │   └── dqn_agent.py
│   │   ├── critics
│   │   │   ├── __init__.py
│   │   │   ├── base_critic.py
│   │   │   ├── bootstrapped_continuous_critic.py
│   │   │   └── dqn_critic.py
│   │   ├── envs
│   │   │   ├── __init__.py
│   │   │   └── box2d
│   │   │       ├── __init__.py
│   │   │       └── lunar_lander.py
│   │   ├── infrastructure
│   │   │   ├── atari_wrappers.py
│   │   │   ├── colab_utils.py
│   │   │   ├── dqn_utils.py
│   │   │   ├── logger.py
│   │   │   ├── pytorch_util.py
│   │   │   ├── replay_buffer.py
│   │   │   ├── rl_trainer.py
│   │   │   └── utils.py
│   │   ├── policies
│   │   │   ├── MLP_policy.py
│   │   │   ├── argmax_policy.py
│   │   │   └── base_policy.py
│   │   └── scripts
│   │       ├── read_results.py
│   │       ├── run_hw3_actor_critic.ipynb
│   │       ├── run_hw3_actor_critic.py
│   │       ├── run_hw3_dqn.ipynb
│   │       └── run_hw3_dqn.py
│   ├── cs285_hw3.pdf
│   ├── requirements.txt
│   ├── requirements_colab.txt
│   └── setup.py
├── hw4
│   ├── README.md
│   ├── cs285
│   │   ├── agents
│   │   │   ├── base_agent.py
│   │   │   └── mb_agent.py
│   │   ├── envs
│   │   │   ├── __init__.py
│   │   │   ├── cheetah
│   │   │   │   ├── __init__.py
│   │   │   │   └── cheetah.py
│   │   │   ├── obstacles
│   │   │   │   ├── __init__.py
│   │   │   │   └── obstacles_env.py
│   │   │   └── reacher
│   │   │       ├── __init__.py
│   │   │       ├── assets
│   │   │       │   └── sawyer.xml
│   │   │       └── reacher_env.py
│   │   ├── infrastructure
│   │   │   ├── colab_utils.py
│   │   │   ├── logger.py
│   │   │   ├── pytorch_util.py
│   │   │   ├── replay_buffer.py
│   │   │   ├── rl_trainer.py
│   │   │   └── utils.py
│   │   ├── models
│   │   │   ├── base_model.py
│   │   │   └── ff_model.py
│   │   ├── policies
│   │   │   ├── MPC_policy.py
│   │   │   └── base_policy.py
│   │   └── scripts
│   │       ├── filter_events.py
│   │       ├── read_results.py
│   │       ├── run_hw4_mb.ipynb
│   │       └── run_hw4_mb.py
│   ├── cs285_hw4.pdf
│   ├── requirements.txt
│   ├── requirements_colab.txt
│   └── setup.py
└── hw5
    ├── README.md
    ├── cs285
    │   ├── agents
    │   │   ├── ac_agent.py
    │   │   ├── base_agent.py
    │   │   ├── dqn_agent.py
    │   │   └── explore_or_exploit_agent.py
    │   ├── critics
    │   │   ├── __init__.py
    │   │   ├── base_critic.py
    │   │   ├── bootstrapped_continuous_critic.py
    │   │   ├── cql_critic.py
    │   │   └── dqn_critic.py
    │   ├── envs
    │   │   ├── __init__.py
    │   │   ├── ant
    │   │   │   ├── __init__.py
    │   │   │   └── ant.py
    │   │   ├── box2d
    │   │   │   ├── __init__.py
    │   │   │   └── lunar_lander.py
    │   │   ├── cheetah
    │   │   │   ├── __init__.py
    │   │   │   └── cheetah.py
    │   │   ├── obstacles
    │   │   │   ├── __init__.py
    │   │   │   └── obstacles_env.py
    │   │   ├── pointmass
    │   │   │   └── pointmass.py
    │   │   └── reacher
    │   │       ├── __init__.py
    │   │       ├── assets
    │   │       │   └── sawyer.xml
    │   │       └── reacher_env.py
    │   ├── exploration
    │   │   ├── __init__.py
    │   │   ├── base_exploration_model.py
    │   │   └── rnd_model.py
    │   ├── infrastructure
    │   │   ├── atari_wrappers.py
    │   │   ├── colab_utils.py
    │   │   ├── dqn_utils.py
    │   │   ├── logger.py
    │   │   ├── pytorch_util.py
    │   │   ├── replay_buffer.py
    │   │   ├── rl_trainer.py
    │   │   └── utils.py
    │   ├── policies
    │   │   ├── MLP_policy.py
    │   │   ├── argmax_policy.py
    │   │   └── base_policy.py
    │   └── scripts
    │       ├── read_results.py
    │       └── run_hw5_expl.py
    ├── hw5.pdf
    ├── requirements.txt
    ├── requirements_colab.txt
    ├── run_hw5_expl.ipynb
    └── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | data/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Assignments for [Berkeley CS 285: Deep Reinforcement Learning, Decision Making, and Control](http://rail.eecs.berkeley.edu/deeprlcourse/).
2 |
--------------------------------------------------------------------------------
/hw1/README.md:
--------------------------------------------------------------------------------
1 | ## Setup
2 |
3 | You can run this code on your own machine or on Google Colab.
4 |
5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](installation.md) for instructions.
6 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below:
7 |
8 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw1/cs285/scripts/run_hw1.ipynb)
9 |
10 | ## Complete the code
11 |
12 | Fill in sections marked with `TODO`. In particular, see
13 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py)
14 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py)
15 | - [infrastructure/replay_buffer.py](cs285/infrastructure/replay_buffer.py)
16 | - [infrastructure/utils.py](cs285/infrastructure/utils.py)
17 | - [infrastructure/pytorch_util.py](cs285/infrastructure/pytorch_util.py)
18 |
19 | Look for sections marked with `HW1` to see how the edits you make will be used.
20 | Some other files that you may find relevant:
21 | - [scripts/run_hw1.py](cs285/scripts/run_hw1.py) (if running locally) or [scripts/run_hw1.ipynb](cs285/scripts/run_hw1.ipynb) (if running on Colab)
22 | - [agents/bc_agent.py](cs285/agents/bc_agent.py)
23 |
24 | See the homework pdf for more details.
25 |
26 | ## Run the code
27 |
28 | Tip: While debugging, you probably want to keep the flag `--video_log_freq -1`, which disables video logging and speeds up the experiment. However, feel free to remove it to save videos of your awesome policy!
29 |
30 | If running on Colab, adjust the `#@params` in the `Args` class according to the command line arguments above.
31 |
32 | ### Section 1 (Behavior Cloning)
33 | Command for problem 1:
34 |
35 | ```
36 | python cs285/scripts/run_hw1.py \
37 | --expert_policy_file cs285/policies/experts/Ant.pkl \
38 | --env_name Ant-v2 --exp_name bc_ant --n_iter 1 \
39 | --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \
40 | --video_log_freq -1
41 | ```
42 |
43 | Make sure to also try another environment.
44 | See the homework PDF for more details on what else you need to run.
45 | To generate videos of the policy, remove the `--video_log_freq -1` flag.
46 |
47 | ### Section 2 (DAgger)
48 | Command for section 2:
49 | (Note the `--do_dagger` flag, and the higher value for `n_iter`)
50 |
51 | ```
52 | python cs285/scripts/run_hw1.py \
53 | --expert_policy_file cs285/policies/experts/Ant.pkl \
54 | --env_name Ant-v2 --exp_name dagger_ant --n_iter 10 \
55 | --do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \
56 | --video_log_freq -1
57 | ```
58 |
59 | Make sure to also try another environment.
60 | See the homework PDF for more details on what else you need to run.
61 |
62 | ## Visualizing the saved tensorboard event file
63 |
64 | You can visualize your runs using tensorboard:
65 | ```
66 | tensorboard --logdir data
67 | ```
68 |
69 | You will see scalar summaries as well as videos of your trained policies (in the 'images' tab).
70 |
71 | You can choose to visualize specific runs with a comma-separated list:
72 | ```
73 | tensorboard --logdir data/run1,data/run2,data/run3...
74 | ```
75 |
76 | If running on Colab, you will be using the `%tensorboard` [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) to do the same thing; see the [notebook](cs285/scripts/run_hw1.ipynb) for more details.
77 |
78 |
--------------------------------------------------------------------------------
/hw1/cs285/agents/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/agents/__init__.py
--------------------------------------------------------------------------------
/hw1/cs285/agents/base_agent.py:
--------------------------------------------------------------------------------
1 |
2 | class BaseAgent(object):
3 | def __init__(self, **kwargs):
4 | super(BaseAgent, self).__init__(**kwargs)
5 |
6 | def train(self) -> dict:
7 | """Return a dictionary of logging information."""
8 | raise NotImplementedError
9 |
10 | def add_to_replay_buffer(self, paths):
11 | raise NotImplementedError
12 |
13 | def sample(self, batch_size):
14 | raise NotImplementedError
15 |
16 | def save(self, path):
17 | raise NotImplementedError
18 |
--------------------------------------------------------------------------------
/hw1/cs285/agents/bc_agent.py:
--------------------------------------------------------------------------------
1 | from cs285.infrastructure.replay_buffer import ReplayBuffer
2 | from cs285.policies.MLP_policy import MLPPolicySL
3 | from .base_agent import BaseAgent
4 |
5 |
6 | class BCAgent(BaseAgent):
7 | def __init__(self, env, agent_params):
8 | super(BCAgent, self).__init__()
9 |
10 | # init vars
11 | self.env = env
12 | self.agent_params = agent_params
13 |
14 | # actor/policy
15 | self.actor = MLPPolicySL(
16 | self.agent_params['ac_dim'],
17 | self.agent_params['ob_dim'],
18 | self.agent_params['n_layers'],
19 | self.agent_params['size'],
20 | discrete=self.agent_params['discrete'],
21 | learning_rate=self.agent_params['learning_rate'],
22 | )
23 |
24 | # replay buffer
25 | self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size'])
26 |
27 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
28 | # training a BC agent refers to updating its actor using
29 | # the given observations and corresponding action labels
30 | log = self.actor.update(ob_no, ac_na) # HW1: you will modify this
31 | return log
32 |
33 | def add_to_replay_buffer(self, paths):
34 | self.replay_buffer.add_rollouts(paths)
35 |
36 | def sample(self, batch_size):
37 | return self.replay_buffer.sample_random_data(batch_size) # HW1: you will modify this
38 |
39 | def save(self, path):
40 | return self.actor.save(path)
--------------------------------------------------------------------------------
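
For context, `BCAgent` is driven by the training loop in `cs285/infrastructure/rl_trainer.py` (not reproduced above). A hedged sketch of how that loop exercises the interface, with illustrative names, might look like:

```python
def run_training_loop_sketch(agent, paths, num_agent_train_steps_per_iter, batch_size):
    """Illustrative driver only; the real loop lives in cs285/infrastructure/rl_trainer.py."""
    # store freshly collected (or, for DAgger, expert-relabeled) rollouts
    agent.add_to_replay_buffer(paths)
    all_logs = []
    for _ in range(num_agent_train_steps_per_iter):
        # draw aligned random transitions from the buffer ...
        ob, ac, re, next_ob, terminal = agent.sample(batch_size)
        # ... and take one supervised update step on the actor
        all_logs.append(agent.train(ob, ac, re, next_ob, terminal))
    return all_logs
```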
/hw1/cs285/expert_data/expert_data_Ant-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Ant-v2.pkl
--------------------------------------------------------------------------------
/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl
--------------------------------------------------------------------------------
/hw1/cs285/expert_data/expert_data_Hopper-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Hopper-v2.pkl
--------------------------------------------------------------------------------
/hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl
--------------------------------------------------------------------------------
/hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl
--------------------------------------------------------------------------------
/hw1/cs285/infrastructure/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/infrastructure/__init__.py
--------------------------------------------------------------------------------
/hw1/cs285/infrastructure/colab_utils.py:
--------------------------------------------------------------------------------
1 | from gym.wrappers import Monitor
2 | import glob
3 | import io
4 | import base64
5 | from IPython.display import HTML
6 | from IPython import display as ipythondisplay
7 |
8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI
9 |
10 | def show_video():
11 | mp4list = glob.glob('/content/video/*.mp4')
12 | if len(mp4list) > 0:
13 | mp4 = mp4list[0]
14 | video = io.open(mp4, 'r+b').read()
15 | encoded = base64.b64encode(video)
16 | ipythondisplay.display(HTML(data='''<video alt="test" autoplay
17 | loop controls style="height: 400px;">
18 | <source src="data:video/mp4;base64,{0}" type="video/mp4" />
19 | </video>'''.format(encoded.decode('ascii'))))
20 | else:
21 | print("Could not find video")
22 |
23 |
24 | def wrap_env(env):
25 | env = Monitor(env, '/content/video', force=True)
26 | return env
--------------------------------------------------------------------------------
/hw1/cs285/infrastructure/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tensorboardX import SummaryWriter
3 | import numpy as np
4 |
5 | class Logger:
6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
7 | self._log_dir = log_dir
8 | print('########################')
9 | print('logging outputs to ', log_dir)
10 | print('########################')
11 | self._n_logged_samples = n_logged_samples
12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)
13 |
14 | def log_scalar(self, scalar, name, step_):
15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_)
16 |
17 | def log_scalars(self, scalar_dict, group_name, step, phase):
18 | """Will log all scalars in the same plot."""
19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)
20 |
21 | def log_image(self, image, name, step):
22 | assert(len(image.shape) == 3) # [C, H, W]
23 | self._summ_writer.add_image('{}'.format(name), image, step)
24 |
25 | def log_video(self, video_frames, name, step, fps=10):
26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)
28 |
29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):
30 |
31 | # reshape the rollouts
32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths]
33 |
34 | # max rollout length
35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)])
36 | max_length = videos[0].shape[0]
37 | for i in range(max_videos_to_save):
38 | if videos[i].shape[0]>max_length:
39 | max_length = videos[i].shape[0]
40 |
41 | # pad rollouts to all be same length
42 | for i in range(max_videos_to_save):
43 | if videos[i].shape[0] < max_length:
44 | padding = np.tile([videos[i][-1]], (max_length - videos[i].shape[0], 1, 1, 1))
45 | videos[i] = np.concatenate([videos[i], padding], 0)
46 |
47 | # log videos to tensorboard event file
48 | videos = np.stack(videos[:max_videos_to_save], 0)
49 | self.log_video(videos, video_title, step, fps=fps)
50 |
51 | def log_figures(self, figure, name, step, phase):
52 | """figure: matplotlib.pyplot figure handle"""
53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
55 |
56 | def log_figure(self, figure, name, step, phase):
57 | """figure: matplotlib.pyplot figure handle"""
58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
59 |
60 | def log_graph(self, array, name, step, phase):
61 | """figure: matplotlib.pyplot figure handle"""
62 | im = plot_graph(array)
63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)
64 |
65 | def dump_scalars(self, log_path=None):
66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
67 | self._summ_writer.export_scalars_to_json(log_path)
68 |
69 | def flush(self):
70 | self._summ_writer.flush()
71 |
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
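
A minimal usage example of the `Logger` above (the run directory is illustrative; `Eval_AverageReturn` is a tag the course scripts read back):

```python
import numpy as np
from cs285.infrastructure.logger import Logger

logger = Logger('data/example_run')            # tensorboard event file is written here
for itr in range(3):
    logger.log_scalar(np.random.rand(), 'Eval_AverageReturn', itr)
logger.flush()
# then inspect with: tensorboard --logdir data
```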
/hw1/cs285/infrastructure/pytorch_util.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import torch
4 | from torch import nn
5 |
6 | Activation = Union[str, nn.Module]
7 |
8 |
9 | _str_to_activation = {
10 | 'relu': nn.ReLU(),
11 | 'tanh': nn.Tanh(),
12 | 'leaky_relu': nn.LeakyReLU(),
13 | 'sigmoid': nn.Sigmoid(),
14 | 'selu': nn.SELU(),
15 | 'softplus': nn.Softplus(),
16 | 'identity': nn.Identity(),
17 | }
18 |
19 |
20 | def build_mlp(
21 | input_size: int,
22 | output_size: int,
23 | n_layers: int,
24 | size: int,
25 | activation: Activation = 'tanh',
26 | output_activation: Activation = 'identity',
27 | ) -> nn.Module:
28 | """
29 | Builds a feedforward neural network
30 |
31 | arguments:
32 | n_layers: number of hidden layers
33 | size: dimension of each hidden layer
34 | activation: activation of each hidden layer
35 |
36 | input_size: size of the input layer
37 | output_size: size of the output layer
38 | output_activation: activation of the output layer
39 |
40 | returns:
41 | MLP (nn.Module)
42 | """
43 | if isinstance(activation, str):
44 | activation = _str_to_activation[activation]
45 | if isinstance(output_activation, str):
46 | output_activation = _str_to_activation[output_activation]
47 |
48 | # TODO: return a MLP. This should be an instance of nn.Module
49 | # Note: nn.Sequential is an instance of nn.Module.
50 | raise NotImplementedError
51 |
52 |
53 | device = None
54 |
55 |
56 | def init_gpu(use_gpu=True, gpu_id=0):
57 | global device
58 | if torch.cuda.is_available() and use_gpu:
59 | device = torch.device("cuda:" + str(gpu_id))
60 | print("Using GPU id {}".format(gpu_id))
61 | else:
62 | device = torch.device("cpu")
63 | print("GPU not detected. Defaulting to CPU.")
64 |
65 |
66 | def set_device(gpu_id):
67 | torch.cuda.set_device(gpu_id)
68 |
69 |
70 | def from_numpy(*args, **kwargs):
71 | return torch.from_numpy(*args, **kwargs).float().to(device)
72 |
73 |
74 | def to_numpy(tensor):
75 | return tensor.to('cpu').detach().numpy()
76 |
--------------------------------------------------------------------------------
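
The body of `build_mlp` is left as a `TODO` for the assignment. A minimal sketch of one possible completion, mirroring the finished hw2 copy of this file further down in this dump (the `_sketch` name marks it as illustrative, not the official solution):

```python
from torch import nn

def build_mlp_sketch(input_size, output_size, n_layers, size,
                     activation=nn.Tanh(), output_activation=nn.Identity()):
    """n_layers hidden (Linear + activation) blocks, then a Linear output layer."""
    layers = []
    in_size = input_size
    for _ in range(n_layers):
        layers.append(nn.Linear(in_size, size))
        layers.append(activation)
        in_size = size
    layers.append(nn.Linear(in_size, output_size))
    layers.append(output_activation)
    return nn.Sequential(*layers)  # nn.Sequential is itself an nn.Module
```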
/hw1/cs285/infrastructure/replay_buffer.py:
--------------------------------------------------------------------------------
1 | from cs285.infrastructure.utils import *
2 |
3 |
4 | class ReplayBuffer(object):
5 |
6 | def __init__(self, max_size=1000000):
7 |
8 | self.max_size = max_size
9 |
10 | # store each rollout
11 | self.paths = []
12 |
13 | # store (concatenated) component arrays from each rollout
14 | self.obs = None
15 | self.acs = None
16 | self.rews = None
17 | self.next_obs = None
18 | self.terminals = None
19 |
20 | def __len__(self):
21 | if self.obs is not None:
22 | return self.obs.shape[0]
23 | else:
24 | return 0
25 |
26 | def add_rollouts(self, paths, concat_rew=True):
27 |
28 | # add new rollouts into our list of rollouts
29 | for path in paths:
30 | self.paths.append(path)
31 |
32 | # convert new rollouts into their component arrays, and append them onto
33 | # our arrays
34 | observations, actions, rewards, next_observations, terminals = (
35 | convert_listofrollouts(paths, concat_rew))
36 |
37 | if self.obs is None:
38 | self.obs = observations[-self.max_size:]
39 | self.acs = actions[-self.max_size:]
40 | self.rews = rewards[-self.max_size:]
41 | self.next_obs = next_observations[-self.max_size:]
42 | self.terminals = terminals[-self.max_size:]
43 | else:
44 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:]
45 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:]
46 | if concat_rew:
47 | self.rews = np.concatenate(
48 | [self.rews, rewards]
49 | )[-self.max_size:]
50 | else:
51 | if isinstance(rewards, list):
52 | self.rews += rewards
53 | else:
54 | self.rews.append(rewards)
55 | self.rews = self.rews[-self.max_size:]
56 | self.next_obs = np.concatenate(
57 | [self.next_obs, next_observations]
58 | )[-self.max_size:]
59 | self.terminals = np.concatenate(
60 | [self.terminals, terminals]
61 | )[-self.max_size:]
62 |
63 | ########################################
64 | ########################################
65 |
66 | def sample_random_data(self, batch_size):
67 | assert (
68 | self.obs.shape[0]
69 | == self.acs.shape[0]
70 | == self.rews.shape[0]
71 | == self.next_obs.shape[0]
72 | == self.terminals.shape[0]
73 | )
74 |
75 | ## TODO return batch_size number of random entries from each of the 5 component arrays above
76 | ## HINT 1: use np.random.permutation to sample random indices
77 | ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array)
78 | ## HINT 3: look at the sample_recent_data function below
79 |
80 | return TODO, TODO, TODO, TODO, TODO
81 |
82 | def sample_recent_data(self, batch_size=1):
83 | return (
84 | self.obs[-batch_size:],
85 | self.acs[-batch_size:],
86 | self.rews[-batch_size:],
87 | self.next_obs[-batch_size:],
88 | self.terminals[-batch_size:],
89 | )
90 |
--------------------------------------------------------------------------------
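
The `sample_random_data` body is an assignment `TODO`; as a hedged reference, the hw2 copy of this buffer (later in this dump) draws one set of random indices and applies it to every component array, roughly as in this sketch:

```python
import numpy as np

def sample_random_data_sketch(obs, acs, rews, next_obs, terminals, batch_size):
    """Use matching random indices so the sampled transitions stay aligned across arrays."""
    assert (obs.shape[0] == acs.shape[0] == rews.shape[0]
            == next_obs.shape[0] == terminals.shape[0])
    rand_indices = np.random.permutation(obs.shape[0])[:batch_size]
    return (obs[rand_indices], acs[rand_indices], rews[rand_indices],
            next_obs[rand_indices], terminals[rand_indices])
```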
/hw1/cs285/infrastructure/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 |
4 | ############################################
5 | ############################################
6 |
7 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):
8 |
9 | # initialize env for the beginning of a new rollout
10 | ob = TODO # HINT: should be the output of resetting the env
11 |
12 | # init vars
13 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
14 | steps = 0
15 | while True:
16 |
17 | # render image of the simulated env
18 | if render:
19 | if 'rgb_array' in render_mode:
20 | if hasattr(env, 'sim'):
21 | image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1])
22 | else:
23 | image_obs.append(env.render(mode=render_mode))
24 | if 'human' in render_mode:
25 | env.render(mode=render_mode)
26 | time.sleep(env.model.opt.timestep)
27 |
28 | # use the most recent ob to decide what to do
29 | obs.append(ob)
30 | ac = TODO # HINT: query the policy's get_action function
31 | ac = ac[0]
32 | acs.append(ac)
33 |
34 | # take that action and record results
35 | ob, rew, done, _ = env.step(ac)
36 |
37 | # record result of taking that action
38 | steps += 1
39 | next_obs.append(ob)
40 | rewards.append(rew)
41 |
42 | # TODO end the rollout if the rollout ended
43 | # HINT: rollout can end due to done, or due to max_path_length
44 | rollout_done = TODO # HINT: this is either 0 or 1
45 | terminals.append(rollout_done)
46 |
47 | if rollout_done:
48 | break
49 |
50 | return Path(obs, image_obs, acs, rewards, next_obs, terminals)
51 |
52 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')):
53 | """
54 | Collect rollouts until we have collected min_timesteps_per_batch steps.
55 |
56 | TODO implement this function
57 | Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths
58 | Hint2: use get_pathlength to count the timesteps collected in each path
59 | """
60 | timesteps_this_batch = 0
61 | paths = []
62 | while timesteps_this_batch < min_timesteps_per_batch:
63 |
64 | TODO
65 |
66 | return paths, timesteps_this_batch
67 |
68 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')):
69 | """
70 | Collect ntraj rollouts.
71 |
72 | TODO implement this function
73 | Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths
74 | """
75 | paths = []
76 |
77 | TODO
78 |
79 | return paths
80 |
81 | ############################################
82 | ############################################
83 |
84 | def Path(obs, image_obs, acs, rewards, next_obs, terminals):
85 | """
86 | Take info (separate arrays) from a single rollout
87 | and return it in a single dictionary
88 | """
89 | if image_obs != []:
90 | image_obs = np.stack(image_obs, axis=0)
91 | return {"observation" : np.array(obs, dtype=np.float32),
92 | "image_obs" : np.array(image_obs, dtype=np.uint8),
93 | "reward" : np.array(rewards, dtype=np.float32),
94 | "action" : np.array(acs, dtype=np.float32),
95 | "next_observation": np.array(next_obs, dtype=np.float32),
96 | "terminal": np.array(terminals, dtype=np.float32)}
97 |
98 |
99 | def convert_listofrollouts(paths, concat_rew=True):
100 | """
101 | Take a list of rollout dictionaries
102 | and return separate arrays,
103 | where each array is a concatenation of that array from across the rollouts
104 | """
105 | observations = np.concatenate([path["observation"] for path in paths])
106 | actions = np.concatenate([path["action"] for path in paths])
107 | if concat_rew:
108 | rewards = np.concatenate([path["reward"] for path in paths])
109 | else:
110 | rewards = [path["reward"] for path in paths]
111 | next_observations = np.concatenate([path["next_observation"] for path in paths])
112 | terminals = np.concatenate([path["terminal"] for path in paths])
113 | return observations, actions, rewards, next_observations, terminals
114 |
115 | ############################################
116 | ############################################
117 |
118 | def get_pathlength(path):
119 | return len(path["reward"])
--------------------------------------------------------------------------------
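
The sampling functions above contain the assignment `TODO`s. A hedged sketch of the hinted structure (not the official solution; it assumes `sample_trajectory` and `get_pathlength` from the file above are in scope, and the in-loop TODOs of `sample_trajectory` are summarized as comments):

```python
# Inside sample_trajectory, the individual TODOs amount to roughly:
#   ob = env.reset()
#   ac = policy.get_action(ob); ac = ac[0]
#   rollout_done = 1 if (done or steps >= max_path_length) else 0

def sample_trajectories_sketch(env, policy, min_timesteps_per_batch, max_path_length):
    """Keep collecting rollouts until the batch holds at least min_timesteps_per_batch steps."""
    timesteps_this_batch, paths = 0, []
    while timesteps_this_batch < min_timesteps_per_batch:
        path = sample_trajectory(env, policy, max_path_length)
        paths.append(path)
        timesteps_this_batch += get_pathlength(path)
    return paths, timesteps_this_batch

def sample_n_trajectories_sketch(env, policy, ntraj, max_path_length):
    """Collect exactly ntraj rollouts."""
    return [sample_trajectory(env, policy, max_path_length) for _ in range(ntraj)]
```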
/hw1/cs285/policies/MLP_policy.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import itertools
3 | from typing import Any
4 | from torch import nn
5 | from torch.nn import functional as F
6 | from torch import optim
7 |
8 | import numpy as np
9 | import torch
10 | from torch import distributions
11 |
12 | from cs285.infrastructure import pytorch_util as ptu
13 | from cs285.policies.base_policy import BasePolicy
14 |
15 |
16 | class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta):
17 |
18 | def __init__(self,
19 | ac_dim,
20 | ob_dim,
21 | n_layers,
22 | size,
23 | discrete=False,
24 | learning_rate=1e-4,
25 | training=True,
26 | nn_baseline=False,
27 | **kwargs
28 | ):
29 | super().__init__(**kwargs)
30 |
31 | # init vars
32 | self.ac_dim = ac_dim
33 | self.ob_dim = ob_dim
34 | self.n_layers = n_layers
35 | self.discrete = discrete
36 | self.size = size
37 | self.learning_rate = learning_rate
38 | self.training = training
39 | self.nn_baseline = nn_baseline
40 |
41 | if self.discrete:
42 | self.logits_na = ptu.build_mlp(
43 | input_size=self.ob_dim,
44 | output_size=self.ac_dim,
45 | n_layers=self.n_layers,
46 | size=self.size,
47 | )
48 | self.logits_na.to(ptu.device)
49 | self.mean_net = None
50 | self.logstd = None
51 | self.optimizer = optim.Adam(self.logits_na.parameters(),
52 | self.learning_rate)
53 | else:
54 | self.logits_na = None
55 | self.mean_net = ptu.build_mlp(
56 | input_size=self.ob_dim,
57 | output_size=self.ac_dim,
58 | n_layers=self.n_layers, size=self.size,
59 | )
60 | self.mean_net.to(ptu.device)
61 | self.logstd = nn.Parameter(
62 | torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device)
63 | )
64 | self.logstd.to(ptu.device)
65 | self.optimizer = optim.Adam(
66 | itertools.chain([self.logstd], self.mean_net.parameters()),
67 | self.learning_rate
68 | )
69 |
70 | ##################################
71 |
72 | def save(self, filepath):
73 | torch.save(self.state_dict(), filepath)
74 |
75 | ##################################
76 |
77 | def get_action(self, obs: np.ndarray) -> np.ndarray:
78 | if len(obs.shape) > 1:
79 | observation = obs
80 | else:
81 | observation = obs[None]
82 |
83 | # TODO return the action that the policy prescribes
84 | raise NotImplementedError
85 |
86 | # update/train this policy
87 | def update(self, observations, actions, **kwargs):
88 | raise NotImplementedError
89 |
90 | # This function defines the forward pass of the network.
91 | # You can return anything you want, but you should be able to differentiate
92 | # through it. For example, you can return a torch.FloatTensor. You can also
93 | # return more flexible objects, such as a
94 | # `torch.distributions.Distribution` object. It's up to you!
95 | def forward(self, observation: torch.FloatTensor) -> Any:
96 | raise NotImplementedError
97 |
98 |
99 | #####################################################
100 | #####################################################
101 |
102 | class MLPPolicySL(MLPPolicy):
103 | def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs):
104 | super().__init__(ac_dim, ob_dim, n_layers, size, **kwargs)
105 | self.loss = nn.MSELoss()
106 |
107 | def update(
108 | self, observations, actions,
109 | adv_n=None, acs_labels_na=None, qvals=None
110 | ):
111 | # TODO: update the policy and return the loss
112 | loss = TODO
113 | return {
114 | # You can add extra logging information here, but keep this line
115 | 'Training Loss': ptu.to_numpy(loss),
116 | }
117 |
--------------------------------------------------------------------------------
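
Both `get_action` and `MLPPolicySL.update` are left for the student. A hedged sketch for the continuous (Gaussian) case, assuming `forward` has been implemented to return a `torch.distributions` object, could look like:

```python
import numpy as np
from cs285.infrastructure import pytorch_util as ptu

def get_action_sketch(policy, obs: np.ndarray) -> np.ndarray:
    """Sketch: forward the observation and return a numpy action."""
    observation = obs if len(obs.shape) > 1 else obs[None]
    observation = ptu.from_numpy(observation.astype(np.float32))
    action_distribution = policy(observation)       # assumes forward() returns a distribution
    return ptu.to_numpy(action_distribution.sample())

def update_sketch(policy, observations, actions):
    """Sketch of a behavior-cloning step: regress predicted actions onto the expert's."""
    observations, actions = ptu.from_numpy(observations), ptu.from_numpy(actions)
    predicted_actions = policy(observations).rsample()  # reparameterized sample (Gaussian case)
    loss = policy.loss(predicted_actions, actions)      # policy.loss is the nn.MSELoss above
    policy.optimizer.zero_grad()
    loss.backward()
    policy.optimizer.step()
    return {'Training Loss': ptu.to_numpy(loss)}
```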
/hw1/cs285/policies/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/__init__.py
--------------------------------------------------------------------------------
/hw1/cs285/policies/base_policy.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import numpy as np
3 |
4 |
5 | class BasePolicy(object, metaclass=abc.ABCMeta):
6 | def get_action(self, obs: np.ndarray) -> np.ndarray:
7 | raise NotImplementedError
8 |
9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict:
10 | """Return a dictionary of logging information."""
11 | raise NotImplementedError
12 |
13 | def save(self, filepath: str):
14 | raise NotImplementedError
15 |
--------------------------------------------------------------------------------
/hw1/cs285/policies/experts/Ant.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Ant.pkl
--------------------------------------------------------------------------------
/hw1/cs285/policies/experts/HalfCheetah.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/HalfCheetah.pkl
--------------------------------------------------------------------------------
/hw1/cs285/policies/experts/Hopper.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Hopper.pkl
--------------------------------------------------------------------------------
/hw1/cs285/policies/experts/Humanoid.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Humanoid.pkl
--------------------------------------------------------------------------------
/hw1/cs285/policies/experts/Walker2d.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Walker2d.pkl
--------------------------------------------------------------------------------
/hw1/cs285/policies/loaded_gaussian_policy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from cs285.infrastructure import pytorch_util as ptu
4 | from .base_policy import BasePolicy
5 | from torch import nn
6 | import torch
7 | import pickle
8 |
9 |
10 | def create_linear_layer(W, b) -> nn.Linear:
11 | out_features, in_features = W.shape
12 | linear_layer = nn.Linear(
13 | in_features,
14 | out_features,
15 | )
16 | linear_layer.weight.data = ptu.from_numpy(W.T)
17 | linear_layer.bias.data = ptu.from_numpy(b[0])
18 | return linear_layer
19 |
20 |
21 | def read_layer(l):
22 | assert list(l.keys()) == ['AffineLayer']
23 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b']
24 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer'][
25 | 'b'].astype(np.float32)
26 |
27 |
28 | class LoadedGaussianPolicy(BasePolicy, nn.Module):
29 | def __init__(self, filename, **kwargs):
30 | super().__init__(**kwargs)
31 |
32 | with open(filename, 'rb') as f:
33 | data = pickle.loads(f.read())
34 |
35 | self.nonlin_type = data['nonlin_type']
36 | if self.nonlin_type == 'lrelu':
37 | self.non_lin = nn.LeakyReLU(0.01)
38 | elif self.nonlin_type == 'tanh':
39 | self.non_lin = nn.Tanh()
40 | else:
41 | raise NotImplementedError()
42 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]
43 |
44 | assert policy_type == 'GaussianPolicy', (
45 | 'Policy type {} not supported'.format(policy_type)
46 | )
47 | self.policy_params = data[policy_type]
48 |
49 | assert set(self.policy_params.keys()) == {
50 | 'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'
51 | }
52 |
53 | # Build the policy. First, observation normalization.
54 | assert list(self.policy_params['obsnorm'].keys()) == ['Standardizer']
55 | obsnorm_mean = self.policy_params['obsnorm']['Standardizer']['mean_1_D']
56 | obsnorm_meansq = self.policy_params['obsnorm']['Standardizer'][
57 | 'meansq_1_D']
58 | obsnorm_stdev = np.sqrt(
59 | np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
60 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)
61 |
62 | self.obs_norm_mean = nn.Parameter(ptu.from_numpy(obsnorm_mean))
63 | self.obs_norm_std = nn.Parameter(ptu.from_numpy(obsnorm_stdev))
64 | self.hidden_layers = nn.ModuleList()
65 |
66 | # Hidden layers next
67 | assert list(self.policy_params['hidden'].keys()) == ['FeedforwardNet']
68 | layer_params = self.policy_params['hidden']['FeedforwardNet']
69 | for layer_name in sorted(layer_params.keys()):
70 | l = layer_params[layer_name]
71 | W, b = read_layer(l)
72 | linear_layer = create_linear_layer(W, b)
73 | self.hidden_layers.append(linear_layer)
74 |
75 | # Output layer
76 | W, b = read_layer(self.policy_params['out'])
77 | self.output_layer = create_linear_layer(W, b)
78 |
79 | def forward(self, obs):
80 | normed_obs = (obs - self.obs_norm_mean) / (self.obs_norm_std + 1e-6)
81 | h = normed_obs
82 | for layer in self.hidden_layers:
83 | h = layer(h)
84 | h = self.non_lin(h)
85 | return self.output_layer(h)
86 |
87 | ##################################
88 |
89 | def update(self, obs_no, acs_na, adv_n=None, acs_labels_na=None):
90 | raise NotImplementedError("""
91 | This policy class simply loads in a particular type of policy and
92 | queries it. Do not try to train it.
93 | """)
94 |
95 | def get_action(self, obs):
96 | if len(obs.shape) > 1:
97 | observation = obs
98 | else:
99 | observation = obs[None, :]
100 | observation = ptu.from_numpy(observation.astype(np.float32))
101 | action = self(observation)
102 | return ptu.to_numpy(action)
103 |
104 | def save(self, filepath):
105 | torch.save(self.state_dict(), filepath)
106 |
--------------------------------------------------------------------------------
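
A hedged usage sketch for querying one of the bundled experts (paths as in the hw1 README; `ptu.init_gpu` must run first so `ptu.device` is set):

```python
import gym
from cs285.infrastructure import pytorch_util as ptu
from cs285.policies.loaded_gaussian_policy import LoadedGaussianPolicy

ptu.init_gpu(use_gpu=False)                       # CPU is fine for querying the expert
env = gym.make('Ant-v2')
expert = LoadedGaussianPolicy('cs285/policies/experts/Ant.pkl')

ob = env.reset()
ac = expert.get_action(ob)                        # shape (1, ac_dim)
ob, rew, done, _ = env.step(ac[0])
```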
/hw1/cs285_hw1.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285_hw1.pdf
--------------------------------------------------------------------------------
/hw1/installation.md:
--------------------------------------------------------------------------------
1 | ## Install mujoco:
2 | ```
3 | mkdir ~/.mujoco
4 | cd ~/.mujoco
5 | wget https://www.roboti.us/download/mujoco200_linux.zip
6 | unzip mujoco200_linux.zip
7 | mv mujoco200_linux mujoco200
8 | rm mujoco200_linux.zip
9 | cp <location of mjkey.txt> .
10 | ```
11 | The above instructions download MuJoCo for Linux. If you are on Mac or Windows, you will need to change the `wget` address to either
12 | `https://www.roboti.us/download/mujoco200_macos.zip` or `https://www.roboti.us/download/mujoco200_win64.zip`.
13 |
14 | Finally, add the following to the bottom of your `.bashrc`:
15 | ```
16 | export LD_LIBRARY_PATH=~/.mujoco/mujoco200/bin/
17 | ```
18 |
19 | ## Install other dependencies
20 |
21 |
22 | There are two options:
23 |
24 | A. (Recommended) Install with conda:
25 |
26 | 1. Install conda, if you don't already have it, by following the instructions at [this link](https://docs.conda.io/projects/conda/en/latest/user-guide/install/)
27 |
28 |
29 |
30 | This install will modify the `PATH` variable in your bashrc.
31 | You need to open a new terminal for that path change to take place (to be able to find 'conda' in the next step).
32 |
33 | 2. Create a conda environment that will contain python 3:
34 | ```
35 | conda create -n cs285 python=3.6
36 | ```
37 |
38 | 3. Activate the environment (do this every time you open a new terminal and want to run code):
39 | ```
40 | source activate cs285
41 | ```
42 |
43 | 4. Install the requirements into this conda environment
44 | ```
45 | pip install --user -r requirements.txt
46 | ```
47 |
48 | 5. Allow your code to be able to see 'cs285'
49 | ```
50 | cd <path to hw1>
51 | pip install -e .
52 | ```
53 |
54 | This conda environment requires activating it every time you open a new terminal (in order to run code), but the benefit is that the required dependencies for this codebase will not affect existing/other versions of things on your computer. This stand-alone environment will have everything that is necessary.
55 |
56 |
57 | B. Install on system Python:
58 | ```
59 | pip install -r requirements.txt
60 | ```
--------------------------------------------------------------------------------
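
A quick sanity check once everything is installed (assumes the `cs285` conda env is active and MuJoCo was set up as above):

```python
# These imports fail loudly if MuJoCo / mujoco-py are not configured correctly.
import mujoco_py  # noqa: F401
import gym

env = gym.make('Ant-v2')
ob = env.reset()
print('observation shape:', ob.shape, '| action space:', env.action_space)
```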
/hw1/requirements.txt:
--------------------------------------------------------------------------------
1 | gym==0.17.2
2 | mujoco-py==2.0.2.2
3 | tensorboard==2.3.0
4 | tensorboardX==1.8
5 | matplotlib==2.2.2
6 | ipython==6.4.0
7 | moviepy==1.0.0
8 | pyvirtualdisplay==1.3.2
9 | torch==1.6.0
10 | opencv-python==4.4.0.42
11 | ipdb==0.13.3
12 | box2d-py
13 |
--------------------------------------------------------------------------------
/hw1/requirements_colab.txt:
--------------------------------------------------------------------------------
1 | gym==0.17.2
2 | tensorboard==2.3.0
3 | tensorboardX==1.8
4 | matplotlib==2.2.2
5 | ipython==6.4.0
6 | moviepy==1.0.0
7 | pyvirtualdisplay==1.3.2
8 | torch==1.6.0
9 | opencv-python==4.4.0.42
10 | ipdb==0.13.3
11 | box2d-py
12 |
--------------------------------------------------------------------------------
/hw1/setup.py:
--------------------------------------------------------------------------------
1 | # setup.py
2 | from setuptools import setup
3 |
4 | setup(
5 | name='cs285',
6 | version='0.1.0',
7 | packages=['cs285'],
8 | )
--------------------------------------------------------------------------------
/hw2/README.md:
--------------------------------------------------------------------------------
1 | ## Setup
2 |
3 | You can run this code on your own machine or on Google Colab.
4 |
5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. If you completed this installation for homework 1, you do not need to repeat it.
6 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below:
7 |
8 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw2/cs285/scripts/run_hw2.ipynb)
9 |
10 | ## Complete the code
11 |
12 | The following files have blanks to be filled with your solutions from homework 1. The relevant sections are marked with "TODO: get this from hw1".
13 |
14 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py)
15 | - [infrastructure/utils.py](cs285/infrastructure/utils.py)
16 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py)
17 |
18 | You will then need to complete the following new files for homework 2. The relevant sections are marked with "TODO".
19 | - [agents/pg_agent.py](cs285/agents/pg_agent.py)
20 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py)
21 |
22 | You will also want to look through [scripts/run_hw2.py](cs285/scripts/run_hw2.py) (if running locally) or [scripts/run_hw2.ipynb](cs285/scripts/run_hw2.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook.
23 |
24 | You will be running your policy gradients implementation in four experiments total, investigating the effects of design decisions like reward-to-go estimators, neural network baselines for variance reduction, and advantage normalization. See the [assignment PDF](cs285_hw2.pdf) for more details.
25 |
26 | ## Plotting your results
27 |
28 | We have provided a snippet that may be used for reading your Tensorboard eventfiles in [scripts/read_results.py](cs285/scripts/read_results.py). Reading these eventfiles and plotting them with [matplotlib](https://matplotlib.org/) or [seaborn](https://seaborn.pydata.org/) will produce the cleanest results for your submission. For debugging purposes, we recommend visualizing the Tensorboard logs using `tensorboard --logdir data`.
29 |
--------------------------------------------------------------------------------
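
As a starting point for plotting, here is a hedged sketch that reuses the logic of `get_section_results` from [scripts/read_results.py](cs285/scripts/read_results.py) and plots one run with matplotlib (the run directory is a placeholder):

```python
import glob
import matplotlib.pyplot as plt
import tensorflow as tf  # read_results.py notes it targets the TF1-style summary_iterator

def get_section_results(file):
    """Same logic as cs285/scripts/read_results.py."""
    X, Y = [], []
    for e in tf.train.summary_iterator(file):
        for v in e.summary.value:
            if v.tag == 'Train_EnvstepsSoFar':
                X.append(v.simple_value)
            elif v.tag == 'Eval_AverageReturn':
                Y.append(v.simple_value)
    return X, Y

eventfile = glob.glob('data/YOUR_RUN_DIR/events*')[0]   # placeholder run directory
X, Y = get_section_results(eventfile)
plt.plot(X[:len(Y)], Y)
plt.xlabel('Train_EnvstepsSoFar')
plt.ylabel('Eval_AverageReturn')
plt.savefig('learning_curve.png', dpi=150)
```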
/hw2/cs285/agents/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_agent import BaseAgent
2 | from .pg_agent import PGAgent
3 |
4 |
--------------------------------------------------------------------------------
/hw2/cs285/agents/base_agent.py:
--------------------------------------------------------------------------------
1 | class BaseAgent(object):
2 | def __init__(self, **kwargs):
3 | super(BaseAgent, self).__init__(**kwargs)
4 |
5 | def train(self) -> dict:
6 | """Return a dictionary of logging information."""
7 | raise NotImplementedError
8 |
9 | def add_to_replay_buffer(self, paths):
10 | raise NotImplementedError
11 |
12 | def sample(self, batch_size):
13 | raise NotImplementedError
14 |
15 | def save(self, path):
16 | raise NotImplementedError
--------------------------------------------------------------------------------
/hw2/cs285/infrastructure/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw2/cs285/infrastructure/__init__.py
--------------------------------------------------------------------------------
/hw2/cs285/infrastructure/colab_utils.py:
--------------------------------------------------------------------------------
1 | from gym.wrappers import Monitor
2 | import glob
3 | import io
4 | import base64
5 | from IPython.display import HTML
6 | from IPython import display as ipythondisplay
7 |
8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI
9 |
10 | def show_video():
11 | mp4list = glob.glob('/content/video/*.mp4')
12 | if len(mp4list) > 0:
13 | mp4 = mp4list[0]
14 | video = io.open(mp4, 'r+b').read()
15 | encoded = base64.b64encode(video)
16 | ipythondisplay.display(HTML(data='''<video alt="test" autoplay
17 | loop controls style="height: 400px;">
18 | <source src="data:video/mp4;base64,{0}" type="video/mp4" />
19 | </video>'''.format(encoded.decode('ascii'))))
20 | else:
21 | print("Could not find video")
22 |
23 |
24 | def wrap_env(env):
25 | env = Monitor(env, '/content/video', force=True)
26 | return env
--------------------------------------------------------------------------------
/hw2/cs285/infrastructure/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tensorboardX import SummaryWriter
3 | import numpy as np
4 |
5 | class Logger:
6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
7 | self._log_dir = log_dir
8 | print('########################')
9 | print('logging outputs to ', log_dir)
10 | print('########################')
11 | self._n_logged_samples = n_logged_samples
12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)
13 |
14 | def log_scalar(self, scalar, name, step_):
15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_)
16 |
17 | def log_scalars(self, scalar_dict, group_name, step, phase):
18 | """Will log all scalars in the same plot."""
19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)
20 |
21 | def log_image(self, image, name, step):
22 | assert(len(image.shape) == 3) # [C, H, W]
23 | self._summ_writer.add_image('{}'.format(name), image, step)
24 |
25 | def log_video(self, video_frames, name, step, fps=10):
26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)
28 |
29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):
30 |
31 | # reshape the rollouts
32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths]
33 |
34 | # max rollout length
35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)])
36 | max_length = videos[0].shape[0]
37 | for i in range(max_videos_to_save):
38 | if videos[i].shape[0]>max_length:
39 | max_length = videos[i].shape[0]
40 |
41 | # pad rollouts to all be same length
42 | for i in range(max_videos_to_save):
43 | if videos[i].shape[0] < max_length:
44 | padding = np.tile([videos[i][-1]], (max_length - videos[i].shape[0], 1, 1, 1))
45 | videos[i] = np.concatenate([videos[i], padding], 0)
46 |
47 | # log videos to tensorboard event file
48 | videos = np.stack(videos[:max_videos_to_save], 0)
49 | self.log_video(videos, video_title, step, fps=fps)
50 |
51 | def log_figures(self, figure, name, step, phase):
52 | """figure: matplotlib.pyplot figure handle"""
53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
55 |
56 | def log_figure(self, figure, name, step, phase):
57 | """figure: matplotlib.pyplot figure handle"""
58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
59 |
60 | def log_graph(self, array, name, step, phase):
61 | """figure: matplotlib.pyplot figure handle"""
62 | im = plot_graph(array)
63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)
64 |
65 | def dump_scalars(self, log_path=None):
66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
67 | self._summ_writer.export_scalars_to_json(log_path)
68 |
69 | def flush(self):
70 | self._summ_writer.flush()
71 |
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/hw2/cs285/infrastructure/pytorch_util.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import torch
4 | from torch import nn
5 |
6 | Activation = Union[str, nn.Module]
7 |
8 |
9 | _str_to_activation = {
10 | 'relu': nn.ReLU(),
11 | 'tanh': nn.Tanh(),
12 | 'leaky_relu': nn.LeakyReLU(),
13 | 'sigmoid': nn.Sigmoid(),
14 | 'selu': nn.SELU(),
15 | 'softplus': nn.Softplus(),
16 | 'identity': nn.Identity(),
17 | }
18 |
19 |
20 | def build_mlp(
21 | input_size: int,
22 | output_size: int,
23 | n_layers: int,
24 | size: int,
25 | activation: Activation = 'tanh',
26 | output_activation: Activation = 'identity',
27 | ):
28 | """
29 | Builds a feedforward neural network
30 |
31 | arguments:
32 |
33 |
34 |
35 | n_layers: number of hidden layers
36 | size: dimension of each hidden layer
37 | activation: activation of each hidden layer
38 |
39 | input_size: size of the input layer
40 | output_size: size of the output layer
41 | output_activation: activation of the output layer
42 |
43 | returns:
44 | MLP (nn.Module)
45 | """
46 | if isinstance(activation, str):
47 | activation = _str_to_activation[activation]
48 | if isinstance(output_activation, str):
49 | output_activation = _str_to_activation[output_activation]
50 | layers = []
51 | in_size = input_size
52 | for _ in range(n_layers):
53 | layers.append(nn.Linear(in_size, size))
54 | layers.append(activation)
55 | in_size = size
56 | layers.append(nn.Linear(in_size, output_size))
57 | layers.append(output_activation)
58 | return nn.Sequential(*layers)
59 |
60 |
61 | device = None
62 |
63 |
64 | def init_gpu(use_gpu=True, gpu_id=0):
65 | global device
66 | if torch.cuda.is_available() and use_gpu:
67 | device = torch.device("cuda:" + str(gpu_id))
68 | print("Using GPU id {}".format(gpu_id))
69 | else:
70 | device = torch.device("cpu")
71 | print("GPU not detected. Defaulting to CPU.")
72 |
73 |
74 | def set_device(gpu_id):
75 | torch.cuda.set_device(gpu_id)
76 |
77 |
78 | def from_numpy(*args, **kwargs):
79 | return torch.from_numpy(*args, **kwargs).float().to(device)
80 |
81 |
82 | def to_numpy(tensor):
83 | return tensor.to('cpu').detach().numpy()
84 |
--------------------------------------------------------------------------------
/hw2/cs285/infrastructure/replay_buffer.py:
--------------------------------------------------------------------------------
1 | from cs285.infrastructure.utils import *
2 |
3 |
4 | class ReplayBuffer(object):
5 |
6 | def __init__(self, max_size=1000000):
7 |
8 | self.max_size = max_size
9 | self.paths = []
10 | self.obs = None
11 | self.acs = None
12 | self.concatenated_rews = None
13 | self.unconcatenated_rews = None
14 | self.next_obs = None
15 | self.terminals = None
16 |
17 | def add_rollouts(self, paths, noised=False):
18 |
19 | # add new rollouts into our list of rollouts
20 | for path in paths:
21 | self.paths.append(path)
22 |
23 | # convert new rollouts into their component arrays, and append them onto our arrays
24 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths)
25 |
26 | if noised:
27 | observations = add_noise(observations)
28 | next_observations = add_noise(next_observations)
29 |
30 | if self.obs is None:
31 | self.obs = observations[-self.max_size:]
32 | self.acs = actions[-self.max_size:]
33 | self.next_obs = next_observations[-self.max_size:]
34 | self.terminals = terminals[-self.max_size:]
35 | self.concatenated_rews = concatenated_rews[-self.max_size:]
36 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:]
37 | else:
38 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:]
39 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:]
40 | self.next_obs = np.concatenate(
41 | [self.next_obs, next_observations]
42 | )[-self.max_size:]
43 | self.terminals = np.concatenate(
44 | [self.terminals, terminals]
45 | )[-self.max_size:]
46 | self.concatenated_rews = np.concatenate(
47 | [self.concatenated_rews, concatenated_rews]
48 | )[-self.max_size:]
49 | if isinstance(unconcatenated_rews, list):
50 | self.unconcatenated_rews += unconcatenated_rews # TODO keep only latest max_size around
51 | else:
52 | self.unconcatenated_rews.append(unconcatenated_rews) # TODO keep only latest max_size around
53 |
54 | ########################################
55 | ########################################
56 |
57 | def sample_random_rollouts(self, num_rollouts):
58 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts]
59 | return [self.paths[i] for i in rand_indices]
60 |
61 | def sample_recent_rollouts(self, num_rollouts=1):
62 | return self.paths[-num_rollouts:]
63 |
64 | ########################################
65 | ########################################
66 |
67 | def sample_random_data(self, batch_size):
68 |
69 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0]
70 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size]
71 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices]
72 |
73 | def sample_recent_data(self, batch_size=1, concat_rew=True):
74 |
75 | if concat_rew:
76 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:]
77 | else:
78 | num_recent_rollouts_to_return = 0
79 | num_datapoints_so_far = 0
80 | index = -1
81 | while num_datapoints_so_far < batch_size:
82 | recent_rollout = self.paths[index]
83 | index -=1
84 | num_recent_rollouts_to_return +=1
85 | num_datapoints_so_far += get_pathlength(recent_rollout)
86 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:]
87 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return)
88 | return observations, actions, unconcatenated_rews, next_observations, terminals
--------------------------------------------------------------------------------
/hw2/cs285/infrastructure/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import copy
4 |
5 | ############################################
6 | ############################################
7 |
8 | def calculate_mean_prediction_error(env, action_sequence, models, data_statistics):
9 |
10 | model = models[0]
11 |
12 | # true
13 | true_states = perform_actions(env, action_sequence)['observation']
14 |
15 | # predicted
16 | ob = np.expand_dims(true_states[0],0)
17 | pred_states = []
18 | for ac in action_sequence:
19 | pred_states.append(ob)
20 | action = np.expand_dims(ac,0)
21 | ob = model.get_prediction(ob, action, data_statistics)
22 | pred_states = np.squeeze(pred_states)
23 |
24 | # mpe
25 | mpe = mean_squared_error(pred_states, true_states)
26 |
27 | return mpe, true_states, pred_states
28 |
29 | def perform_actions(env, actions):
30 | ob = env.reset()
31 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
32 | steps = 0
33 | for ac in actions:
34 | obs.append(ob)
35 | acs.append(ac)
36 | ob, rew, done, _ = env.step(ac)
37 | # add the observation after taking a step to next_obs
38 | next_obs.append(ob)
39 | rewards.append(rew)
40 | steps += 1
41 | # If the episode ended, the corresponding terminal value is 1
42 | # otherwise, it is 0
43 | if done:
44 | terminals.append(1)
45 | break
46 | else:
47 | terminals.append(0)
48 |
49 | return Path(obs, image_obs, acs, rewards, next_obs, terminals)
50 |
51 | def mean_squared_error(a, b):
52 | return np.mean((a-b)**2)
53 |
54 | ############################################
55 | ############################################
56 |
57 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):
58 | # TODO: get this from hw1
59 | return Path(obs, image_obs, acs, rewards, next_obs, terminals)
60 |
61 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')):
62 | # TODO: get this from hw1
63 | return paths, timesteps_this_batch
64 |
65 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')):
66 | # TODO: get this from hw1
67 | return paths
68 |
69 | ############################################
70 | ############################################
71 |
72 | def Path(obs, image_obs, acs, rewards, next_obs, terminals):
73 | """
74 | Take info (separate arrays) from a single rollout
75 | and return it in a single dictionary
76 | """
77 | if image_obs != []:
78 | image_obs = np.stack(image_obs, axis=0)
79 | return {"observation" : np.array(obs, dtype=np.float32),
80 | "image_obs" : np.array(image_obs, dtype=np.uint8),
81 | "reward" : np.array(rewards, dtype=np.float32),
82 | "action" : np.array(acs, dtype=np.float32),
83 | "next_observation": np.array(next_obs, dtype=np.float32),
84 | "terminal": np.array(terminals, dtype=np.float32)}
85 |
86 |
87 | def convert_listofrollouts(paths):
88 | """
89 | Take a list of rollout dictionaries
90 | and return separate arrays,
91 | where each array is a concatenation of that array from across the rollouts
92 | """
93 | observations = np.concatenate([path["observation"] for path in paths])
94 | actions = np.concatenate([path["action"] for path in paths])
95 | next_observations = np.concatenate([path["next_observation"] for path in paths])
96 | terminals = np.concatenate([path["terminal"] for path in paths])
97 | concatenated_rewards = np.concatenate([path["reward"] for path in paths])
98 | unconcatenated_rewards = [path["reward"] for path in paths]
99 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards
100 |
101 | ############################################
102 | ############################################
103 |
104 | def get_pathlength(path):
105 | return len(path["reward"])
106 |
107 | def normalize(data, mean, std, eps=1e-8):
108 | return (data-mean)/(std+eps)
109 |
110 | def unnormalize(data, mean, std):
111 | return data*std+mean
112 |
113 | def add_noise(data_inp, noiseToSignal=0.01):
114 |
115 | data = copy.deepcopy(data_inp) #(num data points, dim)
116 |
117 | #mean of data
118 | mean_data = np.mean(data, axis=0)
119 |
120 |     #if the mean is 0, replace it with a small value (1e-6)
121 |     #to avoid a zero-width noise distribution below
122 | mean_data[mean_data == 0] = 0.000001
123 |
124 | #width of normal distribution to sample noise from
125 | #larger magnitude number = could have larger magnitude noise
126 | std_of_noise = mean_data * noiseToSignal
127 | for j in range(mean_data.shape[0]):
128 | data[:, j] = np.copy(data[:, j] + np.random.normal(
129 | 0, np.absolute(std_of_noise[j]), (data.shape[0],)))
130 |
131 | return data
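
As a quick illustration of the rollout containers defined above (a sketch with made-up shapes; it assumes the functions above are imported from cs285.infrastructure.utils), a three-step rollout built with Path can be flattened back into arrays with convert_listofrollouts:

import numpy as np
from cs285.infrastructure.utils import Path, get_pathlength, convert_listofrollouts

# hypothetical 3-step rollout with 2-D observations and 1-D actions
obs = [np.zeros(2), np.ones(2), 2 * np.ones(2)]
acs = [np.array([0.1]), np.array([0.2]), np.array([0.3])]
rews = [1.0, 1.0, 0.0]
next_obs = [np.ones(2), 2 * np.ones(2), 3 * np.ones(2)]
terminals = [0, 0, 1]

path = Path(obs, [], acs, rews, next_obs, terminals)   # no image observations
assert get_pathlength(path) == 3

# concatenating a list of rollouts yields flat (sum_of_path_lengths, dim) arrays
observations, actions, next_observations, terms, concat_rews, unconcat_rews = \
    convert_listofrollouts([path, path])
assert observations.shape == (6, 2)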
--------------------------------------------------------------------------------
/hw2/cs285/policies/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw2/cs285/policies/__init__.py
--------------------------------------------------------------------------------
/hw2/cs285/policies/base_policy.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import numpy as np
3 |
4 |
5 | class BasePolicy(object, metaclass=abc.ABCMeta):
6 | def get_action(self, obs: np.ndarray) -> np.ndarray:
7 | raise NotImplementedError
8 |
9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict:
10 | """Return a dictionary of logging information."""
11 | raise NotImplementedError
12 |
13 | def save(self, filepath: str):
14 | raise NotImplementedError
15 |
--------------------------------------------------------------------------------
/hw2/cs285/scripts/read_results.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import tensorflow as tf
3 |
4 | def get_section_results(file):
5 | """
6 | requires tensorflow==1.12.0
7 | """
8 | X = []
9 | Y = []
10 | for e in tf.train.summary_iterator(file):
11 | for v in e.summary.value:
12 | if v.tag == 'Train_EnvstepsSoFar':
13 | X.append(v.simple_value)
14 | elif v.tag == 'Eval_AverageReturn':
15 | Y.append(v.simple_value)
16 | return X, Y
17 |
18 | if __name__ == '__main__':
19 | import glob
20 |
21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*'
22 | eventfile = glob.glob(logdir)[0]
23 |
24 | X, Y = get_section_results(eventfile)
25 | for i, (x, y) in enumerate(zip(X, Y)):
26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y))
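
Note that get_section_results above uses the TF1-only tf.train.summary_iterator, matching the tensorflow==1.12.0 pin in requirements.txt. Under the tensorflow==2.3.0 pin in requirements_colab.txt, the same event files can be read through the compat module; a minimal sketch, not part of the assignment code:

import tensorflow as tf

def get_section_results_tf2(file):
    """Same tags as get_section_results, but via the TF2 compat API."""
    X, Y = [], []
    for e in tf.compat.v1.train.summary_iterator(file):
        for v in e.summary.value:
            if v.tag == 'Train_EnvstepsSoFar':
                X.append(v.simple_value)
            elif v.tag == 'Eval_AverageReturn':
                Y.append(v.simple_value)
    return X, Y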
--------------------------------------------------------------------------------
/hw2/cs285/scripts/run_hw2.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | from cs285.infrastructure.rl_trainer import RL_Trainer
5 | from cs285.agents.pg_agent import PGAgent
6 |
7 | class PG_Trainer(object):
8 |
9 | def __init__(self, params):
10 |
11 | #####################
12 | ## SET AGENT PARAMS
13 | #####################
14 |
15 | computation_graph_args = {
16 | 'n_layers': params['n_layers'],
17 | 'size': params['size'],
18 | 'learning_rate': params['learning_rate'],
19 | }
20 |
21 | estimate_advantage_args = {
22 | 'gamma': params['discount'],
23 | 'standardize_advantages': not(params['dont_standardize_advantages']),
24 | 'reward_to_go': params['reward_to_go'],
25 | 'nn_baseline': params['nn_baseline'],
26 | }
27 |
28 | train_args = {
29 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
30 | }
31 |
32 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args}
33 |
34 | self.params = params
35 | self.params['agent_class'] = PGAgent
36 | self.params['agent_params'] = agent_params
37 | self.params['batch_size_initial'] = self.params['batch_size']
38 |
39 | ################
40 | ## RL TRAINER
41 | ################
42 |
43 | self.rl_trainer = RL_Trainer(self.params)
44 |
45 | def run_training_loop(self):
46 |
47 | self.rl_trainer.run_training_loop(
48 | self.params['n_iter'],
49 | collect_policy = self.rl_trainer.agent.actor,
50 | eval_policy = self.rl_trainer.agent.actor,
51 | )
52 |
53 |
54 | def main():
55 |
56 | import argparse
57 | parser = argparse.ArgumentParser()
58 | parser.add_argument('--env_name', type=str)
59 | parser.add_argument('--exp_name', type=str, default='todo')
60 | parser.add_argument('--n_iter', '-n', type=int, default=200)
61 |
62 | parser.add_argument('--reward_to_go', '-rtg', action='store_true')
63 | parser.add_argument('--nn_baseline', action='store_true')
64 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true')
65 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration
66 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration
67 |
68 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1)
69 | parser.add_argument('--discount', type=float, default=1.0)
70 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
71 | parser.add_argument('--n_layers', '-l', type=int, default=2)
72 | parser.add_argument('--size', '-s', type=int, default=64)
73 |
74 | parser.add_argument('--ep_len', type=int) #students shouldn't change this away from env's default
75 | parser.add_argument('--seed', type=int, default=1)
76 | parser.add_argument('--no_gpu', '-ngpu', action='store_true')
77 | parser.add_argument('--which_gpu', '-gpu_id', default=0)
78 | parser.add_argument('--video_log_freq', type=int, default=-1)
79 | parser.add_argument('--scalar_log_freq', type=int, default=1)
80 |
81 | parser.add_argument('--save_params', action='store_true')
82 |
83 | args = parser.parse_args()
84 |
85 | # convert to dictionary
86 | params = vars(args)
87 |
88 | ## ensure compatibility with hw1 code
89 | params['train_batch_size'] = params['batch_size']
90 |
91 | ##################################
92 | ### CREATE DIRECTORY FOR LOGGING
93 | ##################################
94 |
95 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data')
96 |
97 | if not (os.path.exists(data_path)):
98 | os.makedirs(data_path)
99 |
100 | logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
101 | logdir = os.path.join(data_path, logdir)
102 | params['logdir'] = logdir
103 | if not(os.path.exists(logdir)):
104 | os.makedirs(logdir)
105 |
106 | ###################
107 | ### RUN TRAINING
108 | ###################
109 |
110 | trainer = PG_Trainer(params)
111 | trainer.run_training_loop()
112 |
113 |
114 | if __name__ == "__main__":
115 | main()
116 |
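
As a usage sketch (the experiment name is made up; every flag appears in the parser above), a short CartPole policy-gradient run could be launched from the hw2 directory with:

python cs285/scripts/run_hw2.py --env_name CartPole-v0 --exp_name q1_test -n 100 -b 1000 -rtg -dsa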
--------------------------------------------------------------------------------
/hw2/cs285_hw2.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw2/cs285_hw2.pdf
--------------------------------------------------------------------------------
/hw2/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.5.1
2 | gym==0.17.2
3 | mujoco-py==2.0.2.2
4 | tensorboard==2.3.0
5 | tensorboardX==1.8
6 | matplotlib==2.2.2
7 | ipython==6.4.0
8 | moviepy==1.0.0
9 | pyvirtualdisplay==1.3.2
10 | ipdb==0.13.3
11 | box2d-py
12 | tensorflow==1.12.0
--------------------------------------------------------------------------------
/hw2/requirements_colab.txt:
--------------------------------------------------------------------------------
1 | torch==1.5.1+cu101
2 | gym==0.17.2
3 | tensorboard==2.3.0
4 | tensorboardX==1.8
5 | matplotlib==2.2.2
6 | ipython==6.4.0
7 | moviepy==1.0.0
8 | pyvirtualdisplay==1.3.2
9 | ipdb==0.13.3
10 | box2d-py
11 | tensorflow==2.3.0
--------------------------------------------------------------------------------
/hw2/setup.py:
--------------------------------------------------------------------------------
1 | # setup.py
2 | from setuptools import setup
3 |
4 | setup(
5 | name='cs285',
6 | version='0.1.0',
7 | packages=['cs285'],
8 | )
--------------------------------------------------------------------------------
/hw3/README.md:
--------------------------------------------------------------------------------
1 | ## Setup
2 |
3 | You can run this code on your own machine or on Google Colab.
4 |
5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally.
6 |
7 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badges below:
8 |
9 | [](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw3/cs285/scripts/run_hw3_dqn.ipynb) **Part I (Q-learning)**
10 |
11 | [](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw3/cs285/scripts/run_hw3_actor_critic.ipynb) **Part II (Actor-critic)**
12 |
13 | ## Complete the code
14 |
15 | The following files have blanks to be filled with your solutions from homeworks 1 and 2. The relevant sections are marked with `TODO: get this from hw1 or hw2`.
16 |
17 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py)
18 | - [infrastructure/utils.py](cs285/infrastructure/utils.py)
19 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py)
20 |
21 | You will then need to implement new routines in the following files for homework 3 part 1 (Q-learning):
22 | - [agents/dqn_agent.py](cs285/agents/dqn_agent.py)
23 | - [critics/dqn_critic.py](cs285/critics/dqn_critic.py)
24 | - [policies/argmax_policy.py](cs285/policies/argmax_policy.py)
25 |
26 | and in the following files for part 2 (actor-critic):
27 | - [agents/ac_agent.py](cs285/agents/ac_agent.py)
28 | - [critics/bootstrapped_continuous_critic.py](cs285/critics/bootstrapped_continuous_critic.py)
29 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py)
30 |
31 | The relevant sections are marked with `TODO`.
32 |
33 | You may also want to look through [run_hw3_dqn.py](cs285/scripts/run_hw3_dqn.py) and [run_hw3_actor_critic.py](cs285/scripts/run_hw3_actor_critic.py) (if running locally) or [run_hw3_dqn.ipynb](cs285/scripts/run_hw3_dqn.ipynb) and [run_hw3_actor_critic.ipynb](cs285/scripts/run_hw3_actor_critic.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook.
34 |
35 | See the [assignment PDF](cs285_hw3.pdf) for more details on what files to edit.
36 |
37 |
--------------------------------------------------------------------------------
/hw3/cs285/agents/ac_agent.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | from cs285.critics.bootstrapped_continuous_critic import \
4 | BootstrappedContinuousCritic
5 | from cs285.infrastructure.replay_buffer import ReplayBuffer
6 | from cs285.infrastructure.utils import *
7 | from cs285.policies.MLP_policy import MLPPolicyAC
8 | from .base_agent import BaseAgent
9 |
10 |
11 | class ACAgent(BaseAgent):
12 | def __init__(self, env, agent_params):
13 | super(ACAgent, self).__init__()
14 |
15 | self.env = env
16 | self.agent_params = agent_params
17 |
18 | self.gamma = self.agent_params['gamma']
19 | self.standardize_advantages = self.agent_params['standardize_advantages']
20 |
21 | self.actor = MLPPolicyAC(
22 | self.agent_params['ac_dim'],
23 | self.agent_params['ob_dim'],
24 | self.agent_params['n_layers'],
25 | self.agent_params['size'],
26 | self.agent_params['discrete'],
27 | self.agent_params['learning_rate'],
28 | )
29 | self.critic = BootstrappedContinuousCritic(self.agent_params)
30 |
31 | self.replay_buffer = ReplayBuffer()
32 |
33 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
34 | # TODO Implement the following pseudocode:
35 | # for agent_params['num_critic_updates_per_agent_update'] steps,
36 | # update the critic
37 |
38 | # advantage = estimate_advantage(...)
39 |
40 | # for agent_params['num_actor_updates_per_agent_update'] steps,
41 | # update the actor
42 |
43 | loss = OrderedDict()
44 | loss['Critic_Loss'] = TODO
45 | loss['Actor_Loss'] = TODO
46 |
47 | return loss
48 |
49 | def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
50 | # TODO Implement the following pseudocode:
51 | # 1) query the critic with ob_no, to get V(s)
52 | # 2) query the critic with next_ob_no, to get V(s')
53 | # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s')
54 | # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1)
55 | # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s)
56 | adv_n = TODO
57 |
58 | if self.standardize_advantages:
59 | adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)
60 | return adv_n
61 |
62 | def add_to_replay_buffer(self, paths):
63 | self.replay_buffer.add_rollouts(paths)
64 |
65 | def sample(self, batch_size):
66 | return self.replay_buffer.sample_recent_data(batch_size)
67 |
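
For orientation, here is a minimal sketch of how the estimate_advantage pseudocode above could be realized, assuming numpy inputs and a critic that exposes forward_np (as BootstrappedContinuousCritic does); it is illustrative only, not the required solution:

def estimate_advantage_sketch(critic, gamma, ob_no, next_ob_no, re_n, terminal_n):
    # 1) and 2): V(s) and V(s') from the critic
    v_s = critic.forward_np(ob_no)
    v_sp = critic.forward_np(next_ob_no)
    # 3): Q(s, a) = r(s, a) + gamma * V(s'), with V(s') cut off at terminal states
    q_sa = re_n + gamma * v_sp * (1 - terminal_n)
    # 4): A(s, a) = Q(s, a) - V(s)
    return q_sa - v_s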
--------------------------------------------------------------------------------
/hw3/cs285/agents/base_agent.py:
--------------------------------------------------------------------------------
1 | class BaseAgent(object):
2 | def __init__(self, **kwargs):
3 | super(BaseAgent, self).__init__(**kwargs)
4 |
5 | def train(self) -> dict:
6 | """Return a dictionary of logging information."""
7 | raise NotImplementedError
8 |
9 | def add_to_replay_buffer(self, paths):
10 | raise NotImplementedError
11 |
12 | def sample(self, batch_size):
13 | raise NotImplementedError
14 |
15 | def save(self, path):
16 | raise NotImplementedError
--------------------------------------------------------------------------------
/hw3/cs285/agents/dqn_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule
4 | from cs285.policies.argmax_policy import ArgMaxPolicy
5 | from cs285.critics.dqn_critic import DQNCritic
6 |
7 |
8 | class DQNAgent(object):
9 | def __init__(self, env, agent_params):
10 |
11 | self.env = env
12 | self.agent_params = agent_params
13 | self.batch_size = agent_params['batch_size']
14 | # import ipdb; ipdb.set_trace()
15 | self.last_obs = self.env.reset()
16 |
17 | self.num_actions = agent_params['ac_dim']
18 | self.learning_starts = agent_params['learning_starts']
19 | self.learning_freq = agent_params['learning_freq']
20 | self.target_update_freq = agent_params['target_update_freq']
21 |
22 | self.replay_buffer_idx = None
23 | self.exploration = agent_params['exploration_schedule']
24 | self.optimizer_spec = agent_params['optimizer_spec']
25 |
26 | self.critic = DQNCritic(agent_params, self.optimizer_spec)
27 | self.actor = ArgMaxPolicy(self.critic)
28 |
29 | lander = agent_params['env_name'].startswith('LunarLander')
30 | self.replay_buffer = MemoryOptimizedReplayBuffer(
31 | agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander)
32 | self.t = 0
33 | self.num_param_updates = 0
34 |
35 | def add_to_replay_buffer(self, paths):
36 | pass
37 |
38 | def step_env(self):
39 | """
40 | Step the env and store the transition
41 | At the end of this block of code, the simulator should have been
42 | advanced one step, and the replay buffer should contain one more transition.
43 | Note that self.last_obs must always point to the new latest observation.
44 | """
45 |
46 | # TODO store the latest observation ("frame") into the replay buffer
47 | # HINT: the replay buffer used here is `MemoryOptimizedReplayBuffer`
48 | # in dqn_utils.py
49 | self.replay_buffer_idx = TODO
50 |
51 | eps = self.exploration.value(self.t)
52 |
53 | # TODO use epsilon greedy exploration when selecting action
54 | perform_random_action = TODO
55 | if perform_random_action:
56 | # HINT: take random action
57 | # with probability eps (see np.random.random())
58 |             # OR if your current step number (see self.t) is less than self.learning_starts
59 | action = TODO
60 | else:
61 | # HINT: Your actor will take in multiple previous observations ("frames") in order
62 | # to deal with the partial observability of the environment. Get the most recent
63 | # `frame_history_len` observations using functionality from the replay buffer,
64 | # and then use those observations as input to your actor.
65 | action = TODO
66 |
67 | # TODO take a step in the environment using the action from the policy
68 | # HINT1: remember that self.last_obs must always point to the newest/latest observation
69 | # HINT2: remember the following useful function that you've seen before:
70 | #obs, reward, done, info = env.step(action)
71 | TODO
72 |
73 | # TODO store the result of taking this action into the replay buffer
74 | # HINT1: see your replay buffer's `store_effect` function
75 | # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above
76 | TODO
77 |
78 | # TODO if taking this step resulted in done, reset the env (and the latest observation)
79 | TODO
80 |
81 | def sample(self, batch_size):
82 | if self.replay_buffer.can_sample(self.batch_size):
83 | return self.replay_buffer.sample(batch_size)
84 | else:
85 | return [],[],[],[],[]
86 |
87 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
88 | log = {}
89 | if (self.t > self.learning_starts
90 | and self.t % self.learning_freq == 0
91 | and self.replay_buffer.can_sample(self.batch_size)
92 | ):
93 |
94 | # TODO fill in the call to the update function using the appropriate tensors
95 | log = self.critic.update(
96 | TODO
97 | )
98 |
99 | # TODO update the target network periodically
100 | # HINT: your critic already has this functionality implemented
101 | if self.num_param_updates % self.target_update_freq == 0:
102 | TODO
103 |
104 | self.num_param_updates += 1
105 |
106 | self.t += 1
107 | return log
108 |
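
One possible shape for the TODOs in step_env, written as a standalone sketch. It assumes the MemoryOptimizedReplayBuffer exposes store_frame, encode_recent_observation, and store_effect helpers (an assumption about dqn_utils.py, which is not reproduced here), and it is illustrative only:

import numpy as np

def step_env_sketch(agent):
    # store the latest frame and get its buffer index
    agent.replay_buffer_idx = agent.replay_buffer.store_frame(agent.last_obs)
    eps = agent.exploration.value(agent.t)
    # epsilon-greedy: act randomly with probability eps, or before learning starts
    if np.random.random() < eps or agent.t < agent.learning_starts:
        action = agent.env.action_space.sample()
    else:
        # stack the most recent frame_history_len observations for the actor
        recent_frames = agent.replay_buffer.encode_recent_observation()
        action = agent.actor.get_action(recent_frames)
    obs, reward, done, _ = agent.env.step(action)
    agent.replay_buffer.store_effect(agent.replay_buffer_idx, action, reward, done)
    # keep self.last_obs pointing at the newest observation; reset on episode end
    agent.last_obs = agent.env.reset() if done else obs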
--------------------------------------------------------------------------------
/hw3/cs285/critics/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/hw3/cs285/critics/base_critic.py:
--------------------------------------------------------------------------------
1 | class BaseCritic(object):
2 | def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n):
3 | raise NotImplementedError
4 |
--------------------------------------------------------------------------------
/hw3/cs285/critics/bootstrapped_continuous_critic.py:
--------------------------------------------------------------------------------
1 | from .base_critic import BaseCritic
2 | from torch import nn
3 | from torch import optim
4 |
5 | from cs285.infrastructure import pytorch_util as ptu
6 |
7 |
8 | class BootstrappedContinuousCritic(nn.Module, BaseCritic):
9 | """
10 | Notes on notation:
11 |
12 | Prefixes and suffixes:
13 | ob - observation
14 | ac - action
15 | _no - this tensor should have shape (batch self.size /n/, observation dim)
16 | _na - this tensor should have shape (batch self.size /n/, action dim)
17 | _n - this tensor should have shape (batch self.size /n/)
18 |
19 | Note: batch self.size /n/ is defined at runtime.
20 |
21 | """
22 | def __init__(self, hparams):
23 | super().__init__()
24 | self.ob_dim = hparams['ob_dim']
25 | self.ac_dim = hparams['ac_dim']
26 | self.discrete = hparams['discrete']
27 | self.size = hparams['size']
28 | self.n_layers = hparams['n_layers']
29 | self.learning_rate = hparams['learning_rate']
30 |
31 | # critic parameters
32 | self.num_target_updates = hparams['num_target_updates']
33 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update']
34 | self.gamma = hparams['gamma']
35 | self.critic_network = ptu.build_mlp(
36 | self.ob_dim,
37 | 1,
38 | n_layers=self.n_layers,
39 | size=self.size,
40 | )
41 | self.critic_network.to(ptu.device)
42 | self.loss = nn.MSELoss()
43 | self.optimizer = optim.Adam(
44 | self.critic_network.parameters(),
45 | self.learning_rate,
46 | )
47 |
48 | def forward(self, obs):
49 | return self.critic_network(obs).squeeze(1)
50 |
51 | def forward_np(self, obs):
52 | obs = ptu.from_numpy(obs)
53 | predictions = self(obs)
54 | return ptu.to_numpy(predictions)
55 |
56 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
57 | """
58 | Update the parameters of the critic.
59 |
60 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from
61 | Agent.sample_trajectories
62 | let num_paths be the number of paths sampled from Agent.sample_trajectories
63 |
64 | arguments:
65 | ob_no: shape: (sum_of_path_lengths, ob_dim)
66 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
67 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
68 | the reward for each timestep
69 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
70 |                 at that timestep or 0 if the episode did not end
71 |
72 | returns:
73 | training loss
74 | """
75 | # TODO: Implement the pseudocode below: do the following (
76 | # self.num_grad_steps_per_target_update * self.num_target_updates)
77 | # times:
78 | # every self.num_grad_steps_per_target_update steps (which includes the
79 | # first step), recompute the target values by
80 | # a) calculating V(s') by querying the critic with next_ob_no
81 | # b) and computing the target values as r(s, a) + gamma * V(s')
82 | # every time, update this critic using the observations and targets
83 | #
84 | # HINT: don't forget to use terminal_n to cut off the V(s') (ie set it
85 | # to 0) when a terminal state is reached
86 | # HINT: make sure to squeeze the output of the critic_network to ensure
87 | # that its dimensions match the reward
88 |
89 | return loss.item()
90 |
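
A minimal sketch of the target-recomputation loop described in the docstring and TODO above, assuming numpy inputs and the attributes defined in __init__ (illustrative only, not the required solution):

from cs285.infrastructure import pytorch_util as ptu

def critic_update_sketch(critic, ob_no, next_ob_no, reward_n, terminal_n):
    ob = ptu.from_numpy(ob_no)
    total_steps = critic.num_grad_steps_per_target_update * critic.num_target_updates
    for i in range(total_steps):
        if i % critic.num_grad_steps_per_target_update == 0:
            # recompute targets: r(s, a) + gamma * V(s'), with V(s') zeroed at terminals
            v_sp = critic.forward_np(next_ob_no)
            targets = ptu.from_numpy(reward_n + critic.gamma * v_sp * (1 - terminal_n))
        loss = critic.loss(critic(ob), targets)
        critic.optimizer.zero_grad()
        loss.backward()
        critic.optimizer.step()
    return loss.item()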
--------------------------------------------------------------------------------
/hw3/cs285/critics/dqn_critic.py:
--------------------------------------------------------------------------------
1 | from .base_critic import BaseCritic
2 | import torch
3 | import torch.optim as optim
4 | from torch.nn import utils
5 | from torch import nn
6 |
7 | from cs285.infrastructure import pytorch_util as ptu
8 |
9 |
10 | class DQNCritic(BaseCritic):
11 |
12 | def __init__(self, hparams, optimizer_spec, **kwargs):
13 | super().__init__(**kwargs)
14 | self.env_name = hparams['env_name']
15 | self.ob_dim = hparams['ob_dim']
16 |
17 | if isinstance(self.ob_dim, int):
18 | self.input_shape = (self.ob_dim,)
19 | else:
20 | self.input_shape = hparams['input_shape']
21 |
22 | self.ac_dim = hparams['ac_dim']
23 | self.double_q = hparams['double_q']
24 | self.grad_norm_clipping = hparams['grad_norm_clipping']
25 | self.gamma = hparams['gamma']
26 |
27 | self.optimizer_spec = optimizer_spec
28 | network_initializer = hparams['q_func']
29 | self.q_net = network_initializer(self.ob_dim, self.ac_dim)
30 | self.q_net_target = network_initializer(self.ob_dim, self.ac_dim)
31 | self.optimizer = self.optimizer_spec.constructor(
32 | self.q_net.parameters(),
33 | **self.optimizer_spec.optim_kwargs
34 | )
35 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR(
36 | self.optimizer,
37 | self.optimizer_spec.learning_rate_schedule,
38 | )
39 | self.loss = nn.SmoothL1Loss() # AKA Huber loss
40 | self.q_net.to(ptu.device)
41 | self.q_net_target.to(ptu.device)
42 |
43 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
44 | """
45 | Update the parameters of the critic.
46 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from
47 | Agent.sample_trajectories
48 | let num_paths be the number of paths sampled from Agent.sample_trajectories
49 | arguments:
50 | ob_no: shape: (sum_of_path_lengths, ob_dim)
51 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
52 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
53 | the reward for each timestep
54 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
55 |                 at that timestep or 0 if the episode did not end
56 | returns:
57 | nothing
58 | """
59 | ob_no = ptu.from_numpy(ob_no)
60 | ac_na = ptu.from_numpy(ac_na).to(torch.long)
61 | next_ob_no = ptu.from_numpy(next_ob_no)
62 | reward_n = ptu.from_numpy(reward_n)
63 | terminal_n = ptu.from_numpy(terminal_n)
64 |
65 | qa_t_values = self.q_net(ob_no)
66 | q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1)
67 |
68 | # TODO compute the Q-values from the target network
69 | qa_tp1_values = TODO
70 |
71 | if self.double_q:
72 | # You must fill this part for Q2 of the Q-learning portion of the homework.
73 | # In double Q-learning, the best action is selected using the Q-network that
74 | # is being updated, but the Q-value for this action is obtained from the
75 | # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details.
76 | TODO
77 | else:
78 | q_tp1, _ = qa_tp1_values.max(dim=1)
79 |
80 | # TODO compute targets for minimizing Bellman error
81 | # HINT: as you saw in lecture, this would be:
82 | #currentReward + self.gamma * qValuesOfNextTimestep * (not terminal)
83 | target = TODO
84 | target = target.detach()
85 |
86 | assert q_t_values.shape == target.shape
87 | loss = self.loss(q_t_values, target)
88 |
89 | self.optimizer.zero_grad()
90 | loss.backward()
91 | utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping)
92 | self.optimizer.step()
93 |
94 | return {
95 | 'Training Loss': ptu.to_numpy(loss),
96 | }
97 |
98 | def update_target_network(self):
99 | for target_param, param in zip(
100 | self.q_net_target.parameters(), self.q_net.parameters()
101 | ):
102 | target_param.data.copy_(param.data)
103 |
104 | def qa_values(self, obs):
105 | obs = ptu.from_numpy(obs)
106 | qa_values = self.q_net(obs)
107 | return ptu.to_numpy(qa_values)
108 |
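
For reference, a sketch of the target-value TODOs above, including the double-Q branch described in the comment (the action is selected with the online network, its value is read from the target network); illustrative only:

import torch

def dqn_targets_sketch(critic, next_ob_no, reward_n, terminal_n):
    # Q-values of the next state from the target network
    qa_tp1_values = critic.q_net_target(next_ob_no)
    if critic.double_q:
        # double Q: pick a' with the online network, evaluate it with the target network
        next_actions = critic.q_net(next_ob_no).argmax(dim=1)
        q_tp1 = torch.gather(qa_tp1_values, 1, next_actions.unsqueeze(1)).squeeze(1)
    else:
        q_tp1, _ = qa_tp1_values.max(dim=1)
    # Bellman target: r + gamma * Q(s', a') * (not terminal)
    target = reward_n + critic.gamma * q_tp1 * (1 - terminal_n)
    return target.detach()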
--------------------------------------------------------------------------------
/hw3/cs285/envs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw3/cs285/envs/__init__.py
--------------------------------------------------------------------------------
/hw3/cs285/envs/box2d/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw3/cs285/envs/box2d/__init__.py
--------------------------------------------------------------------------------
/hw3/cs285/infrastructure/colab_utils.py:
--------------------------------------------------------------------------------
1 | from gym.wrappers import Monitor
2 | import glob
3 | import io
4 | import base64
5 | from IPython.display import HTML
6 | from IPython import display as ipythondisplay
7 |
8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI
9 |
10 | def show_video():
11 | mp4list = glob.glob('/content/video/*.mp4')
12 | if len(mp4list) > 0:
13 | mp4 = mp4list[0]
14 | video = io.open(mp4, 'r+b').read()
15 | encoded = base64.b64encode(video)
16 |         ipythondisplay.display(HTML(data='''<video alt="test" autoplay
17 |                     loop controls style="height: 400px;">
18 |                     <source src="data:video/mp4;base64,{0}" type="video/mp4" />
19 |                  </video>'''.format(encoded.decode('ascii'))))
20 | else:
21 | print("Could not find video")
22 |
23 |
24 | def wrap_env(env):
25 | env = Monitor(env, '/content/video', force=True)
26 | return env
27 |
--------------------------------------------------------------------------------
/hw3/cs285/infrastructure/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tensorboardX import SummaryWriter
3 | import numpy as np
4 |
5 | class Logger:
6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
7 | self._log_dir = log_dir
8 | print('########################')
9 | print('logging outputs to ', log_dir)
10 | print('########################')
11 | self._n_logged_samples = n_logged_samples
12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)
13 |
14 | def log_scalar(self, scalar, name, step_):
15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_)
16 |
17 | def log_scalars(self, scalar_dict, group_name, step, phase):
18 | """Will log all scalars in the same plot."""
19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)
20 |
21 | def log_image(self, image, name, step):
22 | assert(len(image.shape) == 3) # [C, H, W]
23 | self._summ_writer.add_image('{}'.format(name), image, step)
24 |
25 | def log_video(self, video_frames, name, step, fps=10):
26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)
28 |
29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):
30 |
31 | # reshape the rollouts
32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths]
33 |
34 | # max rollout length
35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)])
36 | max_length = videos[0].shape[0]
37 | for i in range(max_videos_to_save):
38 | if videos[i].shape[0]>max_length:
39 | max_length = videos[i].shape[0]
40 |
41 | # pad rollouts to all be same length
42 | for i in range(max_videos_to_save):
43 |             if videos[i].shape[0]<max_length:
44 |                 padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1))
45 |                 videos[i] = np.concatenate([videos[i], padding], 0)
46 |
47 |         # log videos to tensorboard event file
48 |         videos = np.stack(videos[:max_videos_to_save], 0)
49 |         self.log_video(videos, video_title, step, fps=fps)
50 |
51 |     def log_figures(self, figure, name, step, phase):
52 |         """figure: matplotlib.pyplot figure handle"""
53 |         assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
55 |
56 | def log_figure(self, figure, name, step, phase):
57 | """figure: matplotlib.pyplot figure handle"""
58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
59 |
60 | def log_graph(self, array, name, step, phase):
61 | """figure: matplotlib.pyplot figure handle"""
62 | im = plot_graph(array)
63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)
64 |
65 | def dump_scalars(self, log_path=None):
66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
67 | self._summ_writer.export_scalars_to_json(log_path)
68 |
69 | def flush(self):
70 | self._summ_writer.flush()
71 |
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/hw3/cs285/infrastructure/pytorch_util.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import torch
4 | from torch import nn
5 |
6 | Activation = Union[str, nn.Module]
7 |
8 |
9 | _str_to_activation = {
10 | 'relu': nn.ReLU(),
11 | 'tanh': nn.Tanh(),
12 | 'leaky_relu': nn.LeakyReLU(),
13 | 'sigmoid': nn.Sigmoid(),
14 | 'selu': nn.SELU(),
15 | 'softplus': nn.Softplus(),
16 | 'identity': nn.Identity(),
17 | }
18 |
19 |
20 | def build_mlp(
21 | input_size: int,
22 | output_size: int,
23 | n_layers: int,
24 | size: int,
25 | activation: Activation = 'tanh',
26 | output_activation: Activation = 'identity',
27 | ):
28 | """
29 | Builds a feedforward neural network
30 | arguments:
31 |             input_size: size of the input layer
32 |             output_size: size of the output layer
33 |             n_layers: number of hidden layers
34 |             size: dimension of each hidden layer
35 |             activation: activation of each hidden layer
36 |                 (a key into _str_to_activation, or an nn.Module; default 'tanh')
37 |             output_activation: activation of the output layer
38 |                 (same format as activation; default 'identity')
39 | returns:
40 |             an nn.Sequential module computing a forward pass through the hidden layers and the output layer
41 | """
42 | if isinstance(activation, str):
43 | activation = _str_to_activation[activation]
44 | if isinstance(output_activation, str):
45 | output_activation = _str_to_activation[output_activation]
46 | layers = []
47 | in_size = input_size
48 | for _ in range(n_layers):
49 | layers.append(nn.Linear(in_size, size))
50 | layers.append(activation)
51 | in_size = size
52 | layers.append(nn.Linear(in_size, output_size))
53 | layers.append(output_activation)
54 | return nn.Sequential(*layers)
55 |
56 |
57 | device = None
58 |
59 |
60 | def init_gpu(use_gpu=True, gpu_id=0):
61 | global device
62 | if torch.cuda.is_available() and use_gpu:
63 | device = torch.device("cuda:" + str(gpu_id))
64 | print("Using GPU id {}".format(gpu_id))
65 | else:
66 | device = torch.device("cpu")
67 | print("GPU not detected. Defaulting to CPU.")
68 |
69 |
70 | def set_device(gpu_id):
71 | torch.cuda.set_device(gpu_id)
72 |
73 |
74 | def from_numpy(*args, **kwargs):
75 | return torch.from_numpy(*args, **kwargs).float().to(device)
76 |
77 |
78 | def to_numpy(tensor):
79 | return tensor.to('cpu').detach().numpy()
80 |
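
A small usage example of build_mlp (the sizes here are hypothetical):

import torch
from cs285.infrastructure.pytorch_util import build_mlp

# 2 hidden layers of width 64, tanh activations, identity output
mlp = build_mlp(input_size=4, output_size=2, n_layers=2, size=64)
out = mlp(torch.zeros(10, 4))   # shape (10, 2)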
--------------------------------------------------------------------------------
/hw3/cs285/infrastructure/replay_buffer.py:
--------------------------------------------------------------------------------
1 | from cs285.infrastructure.utils import *
2 |
3 |
4 | class ReplayBuffer(object):
5 |
6 | def __init__(self, max_size=1000000):
7 |
8 | self.max_size = max_size
9 | self.paths = []
10 | self.obs = None
11 | self.acs = None
12 | self.concatenated_rews = None
13 | self.next_obs = None
14 | self.terminals = None
15 |
16 | def add_rollouts(self, paths, noised=False):
17 |
18 | # add new rollouts into our list of rollouts
19 | for path in paths:
20 | self.paths.append(path)
21 |
22 | # convert new rollouts into their component arrays, and append them onto our arrays
23 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths)
24 |
25 | if noised:
26 | observations = add_noise(observations)
27 | next_observations = add_noise(next_observations)
28 |
29 | if self.obs is None:
30 | self.obs = observations[-self.max_size:]
31 | self.acs = actions[-self.max_size:]
32 | self.next_obs = next_observations[-self.max_size:]
33 | self.terminals = terminals[-self.max_size:]
34 | self.concatenated_rews = concatenated_rews[-self.max_size:]
35 | else:
36 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:]
37 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:]
38 | self.next_obs = np.concatenate(
39 | [self.next_obs, next_observations]
40 | )[-self.max_size:]
41 | self.terminals = np.concatenate(
42 | [self.terminals, terminals]
43 | )[-self.max_size:]
44 | self.concatenated_rews = np.concatenate(
45 | [self.concatenated_rews, concatenated_rews]
46 | )[-self.max_size:]
47 |
48 | ########################################
49 | ########################################
50 |
51 | def sample_random_rollouts(self, num_rollouts):
52 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts]
53 |         return [self.paths[i] for i in rand_indices]  # self.paths is a list, so index it element-wise
54 |
55 | def sample_recent_rollouts(self, num_rollouts=1):
56 | return self.paths[-num_rollouts:]
57 |
58 | ########################################
59 | ########################################
60 |
61 | def sample_random_data(self, batch_size):
62 |
63 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0]
64 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size]
65 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices]
66 |
67 | def sample_recent_data(self, batch_size=1, concat_rew=True):
68 |
69 | if concat_rew:
70 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:]
71 | else:
72 | num_recent_rollouts_to_return = 0
73 | num_datapoints_so_far = 0
74 | index = -1
75 | while num_datapoints_so_far < batch_size:
76 | recent_rollout = self.paths[index]
77 | index -=1
78 | num_recent_rollouts_to_return +=1
79 | num_datapoints_so_far += get_pathlength(recent_rollout)
80 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:]
81 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return)
82 | return observations, actions, unconcatenated_rews, next_observations, terminals
83 |
--------------------------------------------------------------------------------
/hw3/cs285/infrastructure/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import copy
4 |
5 | ############################################
6 | ############################################
7 |
8 | def calculate_mean_prediction_error(env, action_sequence, models, data_statistics):
9 |
10 | model = models[0]
11 |
12 | # true
13 | true_states = perform_actions(env, action_sequence)['observation']
14 |
15 | # predicted
16 | ob = np.expand_dims(true_states[0],0)
17 | pred_states = []
18 | for ac in action_sequence:
19 | pred_states.append(ob)
20 | action = np.expand_dims(ac,0)
21 | ob = model.get_prediction(ob, action, data_statistics)
22 | pred_states = np.squeeze(pred_states)
23 |
24 | # mpe
25 | mpe = mean_squared_error(pred_states, true_states)
26 |
27 | return mpe, true_states, pred_states
28 |
29 | def perform_actions(env, actions):
30 | ob = env.reset()
31 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
32 | steps = 0
33 | for ac in actions:
34 | obs.append(ob)
35 | acs.append(ac)
36 | ob, rew, done, _ = env.step(ac)
37 | # add the observation after taking a step to next_obs
38 | next_obs.append(ob)
39 | rewards.append(rew)
40 | steps += 1
41 | # If the episode ended, the corresponding terminal value is 1
42 | # otherwise, it is 0
43 | if done:
44 | terminals.append(1)
45 | break
46 | else:
47 | terminals.append(0)
48 |
49 | return Path(obs, image_obs, acs, rewards, next_obs, terminals)
50 |
51 | def mean_squared_error(a, b):
52 | return np.mean((a-b)**2)
53 |
54 | ############################################
55 | ############################################
56 |
57 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):
58 | # TODO: get this from Piazza
59 |     return Path(obs, image_obs, acs, rewards, next_obs, terminals)
60 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')):
61 | """
62 | Collect rollouts using policy
63 | until we have collected min_timesteps_per_batch steps
64 | """
65 | # TODO: get this from Piazza
66 |
67 | return paths, timesteps_this_batch
68 |
69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')):
70 | """
71 | Collect ntraj rollouts using policy
72 | """
73 | # TODO: get this from Piazza
74 |
75 | return paths
76 |
77 | ############################################
78 | ############################################
79 |
80 | def Path(obs, image_obs, acs, rewards, next_obs, terminals):
81 | """
82 | Take info (separate arrays) from a single rollout
83 | and return it in a single dictionary
84 | """
85 | if image_obs != []:
86 | image_obs = np.stack(image_obs, axis=0)
87 | return {"observation" : np.array(obs, dtype=np.float32),
88 | "image_obs" : np.array(image_obs, dtype=np.uint8),
89 | "reward" : np.array(rewards, dtype=np.float32),
90 | "action" : np.array(acs, dtype=np.float32),
91 | "next_observation": np.array(next_obs, dtype=np.float32),
92 | "terminal": np.array(terminals, dtype=np.float32)}
93 |
94 |
95 | def convert_listofrollouts(paths):
96 | """
97 | Take a list of rollout dictionaries
98 | and return separate arrays,
99 | where each array is a concatenation of that array from across the rollouts
100 | """
101 | observations = np.concatenate([path["observation"] for path in paths])
102 | actions = np.concatenate([path["action"] for path in paths])
103 | next_observations = np.concatenate([path["next_observation"] for path in paths])
104 | terminals = np.concatenate([path["terminal"] for path in paths])
105 | concatenated_rewards = np.concatenate([path["reward"] for path in paths])
106 | unconcatenated_rewards = [path["reward"] for path in paths]
107 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards
108 |
109 | ############################################
110 | ############################################
111 |
112 | def get_pathlength(path):
113 | return len(path["reward"])
114 |
115 | def normalize(data, mean, std, eps=1e-8):
116 | return (data-mean)/(std+eps)
117 |
118 | def unnormalize(data, mean, std):
119 | return data*std+mean
120 |
121 | def add_noise(data_inp, noiseToSignal=0.01):
122 |
123 | data = copy.deepcopy(data_inp) #(num data points, dim)
124 |
125 | #mean of data
126 | mean_data = np.mean(data, axis=0)
127 |
128 |     #if the mean is 0, replace it with a small value (1e-6)
129 |     #to avoid a zero-width noise distribution below
130 | mean_data[mean_data == 0] = 0.000001
131 |
132 | #width of normal distribution to sample noise from
133 | #larger magnitude number = could have larger magnitude noise
134 | std_of_noise = mean_data * noiseToSignal
135 | for j in range(mean_data.shape[0]):
136 | data[:, j] = np.copy(data[:, j] + np.random.normal(
137 | 0, np.absolute(std_of_noise[j]), (data.shape[0],)))
138 |
139 | return data
140 |
--------------------------------------------------------------------------------
/hw3/cs285/policies/MLP_policy.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import itertools
3 | from torch import nn
4 | from torch.nn import functional as F
5 | from torch import optim
6 |
7 | import numpy as np
8 | import torch
9 | from torch import distributions
10 |
11 | from cs285.infrastructure import pytorch_util as ptu
12 | from cs285.policies.base_policy import BasePolicy
13 |
14 |
15 | class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta):
16 |
17 | def __init__(self,
18 | ac_dim,
19 | ob_dim,
20 | n_layers,
21 | size,
22 | discrete=False,
23 | learning_rate=1e-4,
24 | training=True,
25 | nn_baseline=False,
26 | **kwargs
27 | ):
28 | super().__init__(**kwargs)
29 |
30 | # init vars
31 | self.ac_dim = ac_dim
32 | self.ob_dim = ob_dim
33 | self.n_layers = n_layers
34 | self.discrete = discrete
35 | self.size = size
36 | self.learning_rate = learning_rate
37 | self.training = training
38 | self.nn_baseline = nn_baseline
39 |
40 | if self.discrete:
41 | self.logits_na = ptu.build_mlp(input_size=self.ob_dim,
42 | output_size=self.ac_dim,
43 | n_layers=self.n_layers,
44 | size=self.size)
45 | self.logits_na.to(ptu.device)
46 | self.mean_net = None
47 | self.logstd = None
48 | self.optimizer = optim.Adam(self.logits_na.parameters(),
49 | self.learning_rate)
50 | else:
51 | self.logits_na = None
52 | self.mean_net = ptu.build_mlp(input_size=self.ob_dim,
53 | output_size=self.ac_dim,
54 | n_layers=self.n_layers, size=self.size)
55 | self.logstd = nn.Parameter(
56 | torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device)
57 | )
58 | self.mean_net.to(ptu.device)
59 | self.logstd.to(ptu.device)
60 | self.optimizer = optim.Adam(
61 | itertools.chain([self.logstd], self.mean_net.parameters()),
62 | self.learning_rate
63 | )
64 |
65 | if nn_baseline:
66 | self.baseline = ptu.build_mlp(
67 | input_size=self.ob_dim,
68 | output_size=1,
69 | n_layers=self.n_layers,
70 | size=self.size,
71 | )
72 | self.baseline.to(ptu.device)
73 | self.baseline_optimizer = optim.Adam(
74 | self.baseline.parameters(),
75 | self.learning_rate,
76 | )
77 | else:
78 | self.baseline = None
79 |
80 | ##################################
81 |
82 | def save(self, filepath):
83 | torch.save(self.state_dict(), filepath)
84 |
85 | ##################################
86 |
87 | # query the policy with observation(s) to get selected action(s)
88 | def get_action(self, obs: np.ndarray) -> np.ndarray:
89 | # TODO: get this from Piazza
90 | return action
91 |
92 | # update/train this policy
93 | def update(self, observations, actions, **kwargs):
94 | raise NotImplementedError
95 |
96 | # This function defines the forward pass of the network.
97 | # You can return anything you want, but you should be able to differentiate
98 | # through it. For example, you can return a torch.FloatTensor. You can also
99 | # return more flexible objects, such as a
100 | # `torch.distributions.Distribution` object. It's up to you!
101 | def forward(self, observation: torch.FloatTensor):
102 | # TODO: get this from Piazza
103 | return action_distribution
104 |
105 |
106 | #####################################################
107 | #####################################################
108 |
109 |
110 | class MLPPolicyAC(MLPPolicy):
111 | def update(self, observations, actions, adv_n=None):
112 | # TODO: update the policy and return the loss
113 | loss = TODO
114 | return loss.item()
115 |
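
As the comment above forward() notes, returning a torch.distributions object is one convenient choice. A sketch of that option, using the logits_na / mean_net / logstd attributes defined in __init__ (illustrative only, not the required solution):

import torch
from torch import distributions

def forward_sketch(policy, observation):
    if policy.discrete:
        # categorical distribution over the ac_dim action logits
        return distributions.Categorical(logits=policy.logits_na(observation))
    # diagonal Gaussian with a state-independent log standard deviation
    return distributions.Normal(policy.mean_net(observation), torch.exp(policy.logstd))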
--------------------------------------------------------------------------------
/hw3/cs285/policies/argmax_policy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class ArgMaxPolicy(object):
5 |
6 | def __init__(self, critic):
7 | self.critic = critic
8 |
9 | def get_action(self, obs):
10 | if len(obs.shape) > 3:
11 | observation = obs
12 | else:
13 | observation = obs[None]
14 |
15 |         ## TODO return the action that maximizes the Q-value
16 |         # at the current observation as the output
17 |         action = TODO
18 |
19 | return action.squeeze()
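
One possible completion of the TODO above, relying on the qa_values helper defined in DQNCritic (illustrative only):

# action = self.critic.qa_values(observation).argmax(axis=1)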
--------------------------------------------------------------------------------
/hw3/cs285/policies/base_policy.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import numpy as np
3 |
4 |
5 | class BasePolicy(object, metaclass=abc.ABCMeta):
6 | def get_action(self, obs: np.ndarray) -> np.ndarray:
7 | raise NotImplementedError
8 |
9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict:
10 | """Return a dictionary of logging information."""
11 | raise NotImplementedError
12 |
13 | def save(self, filepath: str):
14 | raise NotImplementedError
15 |
--------------------------------------------------------------------------------
/hw3/cs285/scripts/read_results.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import tensorflow as tf
3 |
4 | def get_section_results(file):
5 | """
6 | requires tensorflow==1.12.0
7 | """
8 | X = []
9 | Y = []
10 | for e in tf.train.summary_iterator(file):
11 | for v in e.summary.value:
12 | if v.tag == 'Train_EnvstepsSoFar':
13 | X.append(v.simple_value)
14 | elif v.tag == 'Eval_AverageReturn':
15 | Y.append(v.simple_value)
16 | return X, Y
17 |
18 | if __name__ == '__main__':
19 | import glob
20 |
21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*'
22 | eventfile = glob.glob(logdir)[0]
23 |
24 | X, Y = get_section_results(eventfile)
25 | for i, (x, y) in enumerate(zip(X, Y)):
26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y))
--------------------------------------------------------------------------------
/hw3/cs285/scripts/run_hw3_actor_critic.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | from cs285.agents.ac_agent import ACAgent
5 | from cs285.infrastructure.rl_trainer import RL_Trainer
6 |
7 |
8 | class AC_Trainer(object):
9 |
10 | def __init__(self, params):
11 |
12 | #####################
13 | ## SET AGENT PARAMS
14 | #####################
15 |
16 | computation_graph_args = {
17 | 'n_layers': params['n_layers'],
18 | 'size': params['size'],
19 | 'learning_rate': params['learning_rate'],
20 | 'num_target_updates': params['num_target_updates'],
21 | 'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'],
22 | }
23 |
24 | estimate_advantage_args = {
25 | 'gamma': params['discount'],
26 | 'standardize_advantages': not(params['dont_standardize_advantages']),
27 | }
28 |
29 | train_args = {
30 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
31 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],
32 | 'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'],
33 | }
34 |
35 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args}
36 |
37 | self.params = params
38 | self.params['agent_class'] = ACAgent
39 | self.params['agent_params'] = agent_params
40 | self.params['batch_size_initial'] = self.params['batch_size']
41 |
42 | ################
43 | ## RL TRAINER
44 | ################
45 |
46 | self.rl_trainer = RL_Trainer(self.params)
47 |
48 | def run_training_loop(self):
49 |
50 | self.rl_trainer.run_training_loop(
51 | self.params['n_iter'],
52 | collect_policy = self.rl_trainer.agent.actor,
53 | eval_policy = self.rl_trainer.agent.actor,
54 | )
55 |
56 |
57 | def main():
58 |
59 | import argparse
60 | parser = argparse.ArgumentParser()
61 | parser.add_argument('--env_name', type=str, default='CartPole-v0')
62 | parser.add_argument('--ep_len', type=int, default=200)
63 | parser.add_argument('--exp_name', type=str, default='todo')
64 | parser.add_argument('--n_iter', '-n', type=int, default=200)
65 |
66 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1)
67 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1)
68 | parser.add_argument('--num_actor_updates_per_agent_update', type=int, default=1)
69 |
70 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration
71 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration
72 | parser.add_argument('--train_batch_size', '-tb', type=int, default=1000) ##steps used per gradient step
73 |
74 | parser.add_argument('--discount', type=float, default=1.0)
75 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
76 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true')
77 | parser.add_argument('--num_target_updates', '-ntu', type=int, default=10)
78 | parser.add_argument('--num_grad_steps_per_target_update', '-ngsptu', type=int, default=10)
79 | parser.add_argument('--n_layers', '-l', type=int, default=2)
80 | parser.add_argument('--size', '-s', type=int, default=64)
81 |
82 | parser.add_argument('--seed', type=int, default=1)
83 | parser.add_argument('--no_gpu', '-ngpu', action='store_true')
84 | parser.add_argument('--which_gpu', '-gpu_id', default=0)
85 | parser.add_argument('--video_log_freq', type=int, default=-1)
86 | parser.add_argument('--scalar_log_freq', type=int, default=10)
87 |
88 | parser.add_argument('--save_params', action='store_true')
89 |
90 | args = parser.parse_args()
91 |
92 | # convert to dictionary
93 | params = vars(args)
94 |
95 |     # for policy gradient, we made a design decision
96 |     # to force batch_size = train_batch_size
97 |     # note that the train_batch_size argument defined above is therefore overridden here
98 | params['train_batch_size'] = params['batch_size']
99 |
100 | ##################################
101 | ### CREATE DIRECTORY FOR LOGGING
102 | ##################################
103 |
104 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data')
105 |
106 | if not (os.path.exists(data_path)):
107 | os.makedirs(data_path)
108 |
109 |     logdir = 'hw3_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
110 | logdir = os.path.join(data_path, logdir)
111 | params['logdir'] = logdir
112 | if not(os.path.exists(logdir)):
113 | os.makedirs(logdir)
114 |
115 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n")
116 |
117 | ###################
118 | ### RUN TRAINING
119 | ###################
120 |
121 | trainer = AC_Trainer(params)
122 | trainer.run_training_loop()
123 |
124 |
125 | if __name__ == "__main__":
126 | main()
127 |
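
As a usage sketch (the experiment name is made up; every flag appears in the parser above), a CartPole actor-critic run could look like:

python cs285/scripts/run_hw3_actor_critic.py --env_name CartPole-v0 -n 100 -b 1000 --exp_name q4_ac_test -ntu 10 -ngsptu 10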
--------------------------------------------------------------------------------
/hw3/cs285/scripts/run_hw3_dqn.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | from cs285.infrastructure.rl_trainer import RL_Trainer
5 | from cs285.agents.dqn_agent import DQNAgent
6 | from cs285.infrastructure.dqn_utils import get_env_kwargs
7 |
8 |
9 | class Q_Trainer(object):
10 |
11 | def __init__(self, params):
12 | self.params = params
13 |
14 | train_args = {
15 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
16 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],
17 | 'train_batch_size': params['batch_size'],
18 | 'double_q': params['double_q'],
19 | }
20 |
21 | env_args = get_env_kwargs(params['env_name'])
22 |
23 | self.agent_params = {**train_args, **env_args, **params}
24 |
25 | self.params['agent_class'] = DQNAgent
26 | self.params['agent_params'] = self.agent_params
27 | self.params['train_batch_size'] = params['batch_size']
28 | self.params['env_wrappers'] = self.agent_params['env_wrappers']
29 |
30 | self.rl_trainer = RL_Trainer(self.params)
31 |
32 | def run_training_loop(self):
33 | self.rl_trainer.run_training_loop(
34 | self.agent_params['num_timesteps'],
35 | collect_policy = self.rl_trainer.agent.actor,
36 | eval_policy = self.rl_trainer.agent.actor,
37 | )
38 |
39 | def main():
40 |
41 | import argparse
42 | parser = argparse.ArgumentParser()
43 | parser.add_argument(
44 | '--env_name',
45 | default='MsPacman-v0',
46 | choices=('PongNoFrameskip-v4', 'LunarLander-v3', 'MsPacman-v0')
47 | )
48 |
49 | parser.add_argument('--ep_len', type=int, default=200)
50 | parser.add_argument('--exp_name', type=str, default='todo')
51 |
52 | parser.add_argument('--eval_batch_size', type=int, default=1000)
53 |
54 | parser.add_argument('--batch_size', type=int, default=32)
55 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1)
56 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1)
57 | parser.add_argument('--double_q', action='store_true')
58 |
59 | parser.add_argument('--seed', type=int, default=1)
60 | parser.add_argument('--no_gpu', '-ngpu', action='store_true')
61 | parser.add_argument('--which_gpu', '-gpu_id', default=0)
62 | parser.add_argument('--scalar_log_freq', type=int, default=int(1e4))
63 | parser.add_argument('--video_log_freq', type=int, default=-1)
64 |
65 | parser.add_argument('--save_params', action='store_true')
66 |
67 | args = parser.parse_args()
68 |
69 | # convert to dictionary
70 | params = vars(args)
71 | params['video_log_freq'] = -1 # This param is not used for DQN
72 | ##################################
73 | ### CREATE DIRECTORY FOR LOGGING
74 | ##################################
75 |
76 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data')
77 |
78 | if not (os.path.exists(data_path)):
79 | os.makedirs(data_path)
80 |
81 | logdir = 'hw3_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
82 | logdir = os.path.join(data_path, logdir)
83 | params['logdir'] = logdir
84 | if not(os.path.exists(logdir)):
85 | os.makedirs(logdir)
86 |
87 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n")
88 |
89 | trainer = Q_Trainer(params)
90 | trainer.run_training_loop()
91 |
92 |
93 | if __name__ == "__main__":
94 | main()
95 |
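For reference, a typical local invocation of this script looks like the following (run from the hw3 directory; the experiment names here are illustrative, and the graded settings are listed in the assignment PDF):

```
python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name dqn_ll_test --seed 1
python cs285/scripts/run_hw3_dqn.py --env_name LunarLander-v3 --exp_name doubledqn_ll_test --double_q --seed 1
```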
--------------------------------------------------------------------------------
/hw3/cs285_hw3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw3/cs285_hw3.pdf
--------------------------------------------------------------------------------
/hw3/requirements.txt:
--------------------------------------------------------------------------------
1 | gym[atari]==0.17.2
2 | mujoco-py==2.0.2.2
3 | tensorboard==2.3.0
4 | tensorboardX==1.8
5 | matplotlib==2.2.2
6 | ipython==6.4.0
7 | moviepy==1.0.0
8 | pyvirtualdisplay==1.3.2
9 | torch==1.5.1
10 | opencv-python==4.4.0.42
11 | ipdb==0.13.3
12 | box2d-py
13 |
--------------------------------------------------------------------------------
/hw3/requirements_colab.txt:
--------------------------------------------------------------------------------
1 | gym[atari]==0.17.2
2 | tensorboard==2.3.0
3 | tensorboardX==1.8
4 | matplotlib==2.2.2
5 | ipython==6.4.0
6 | moviepy==1.0.0
7 | pyvirtualdisplay==1.3.2
8 | torch==1.5.1
9 | opencv-python==4.4.0.42
10 | ipdb==0.13.3
11 | box2d-py
12 |
--------------------------------------------------------------------------------
/hw3/setup.py:
--------------------------------------------------------------------------------
1 | # setup.py
2 | from setuptools import setup
3 |
4 | setup(
5 | name='cs285',
6 | version='0.1.0',
7 | packages=['cs285'],
8 | )
--------------------------------------------------------------------------------
/hw4/README.md:
--------------------------------------------------------------------------------
1 | ## Setup
2 |
3 | You can run this code on your own machine or on Google Colab.
4 |
5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally.
6 |
7 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below:
8 |
9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw4/cs285/scripts/run_hw4_mb.ipynb)
10 |
11 | ## Complete the code
12 |
13 | The following files have blanks to be filled with your solutions from homework 1. The relevant sections are marked with `TODO: get this from Piazza`.
14 |
15 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py)
16 | - [infrastructure/utils.py](cs285/infrastructure/utils.py)
17 |
18 | You will then need to implement code in the following files:
19 | - [agents/mb_agent.py](cs285/agents/mb_agent.py)
20 | - [models/ff_model.py](cs285/models/ff_model.py)
21 | - [policies/MPC_policy.py](cs285/policies/MPC_policy.py)
22 |
23 | The relevant sections are marked with `TODO`.
24 |
25 | You may also want to look through [scripts/run_hw4_mb.py](cs285/scripts/run_hw4_mb.py) (if running locally) or [scripts/run_hw4_mb.ipynb](cs285/scripts/run_hw4_mb.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook.
26 |
27 | See the [assignment PDF](cs285_hw4.pdf) for more details on what files to edit.
28 |
29 |
--------------------------------------------------------------------------------
/hw4/cs285/agents/base_agent.py:
--------------------------------------------------------------------------------
1 | class BaseAgent(object):
2 | def __init__(self, **kwargs):
3 | super(BaseAgent, self).__init__(**kwargs)
4 |
5 | def train(self) -> dict:
6 | """Return a dictionary of logging information."""
7 | raise NotImplementedError
8 |
9 | def add_to_replay_buffer(self, paths):
10 | raise NotImplementedError
11 |
12 | def sample(self, batch_size):
13 | raise NotImplementedError
14 |
15 | def save(self, path):
16 | raise NotImplementedError
--------------------------------------------------------------------------------
/hw4/cs285/agents/mb_agent.py:
--------------------------------------------------------------------------------
1 | from .base_agent import BaseAgent
2 | from cs285.models.ff_model import FFModel
3 | from cs285.policies.MPC_policy import MPCPolicy
4 | from cs285.infrastructure.replay_buffer import ReplayBuffer
5 | from cs285.infrastructure.utils import *
6 |
7 |
8 | class MBAgent(BaseAgent):
9 | def __init__(self, env, agent_params):
10 | super(MBAgent, self).__init__()
11 |
12 | self.env = env.unwrapped
13 | self.agent_params = agent_params
14 | self.ensemble_size = self.agent_params['ensemble_size']
15 |
16 | self.dyn_models = []
17 | for i in range(self.ensemble_size):
18 | model = FFModel(
19 | self.agent_params['ac_dim'],
20 | self.agent_params['ob_dim'],
21 | self.agent_params['n_layers'],
22 | self.agent_params['size'],
23 | self.agent_params['learning_rate'],
24 | )
25 | self.dyn_models.append(model)
26 |
27 | self.actor = MPCPolicy(
28 | self.env,
29 | ac_dim=self.agent_params['ac_dim'],
30 | dyn_models=self.dyn_models,
31 | horizon=self.agent_params['mpc_horizon'],
32 | N=self.agent_params['mpc_num_action_sequences'],
33 | )
34 |
35 | self.replay_buffer = ReplayBuffer()
36 |
37 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
38 |
39 | # training a MB agent refers to updating the predictive model using observed state transitions
40 | # NOTE: each model in the ensemble is trained on a different random batch of size batch_size
41 | losses = []
42 | num_data = ob_no.shape[0]
43 | num_data_per_ens = int(num_data / self.ensemble_size)
44 |
45 | for i in range(self.ensemble_size):
46 |
47 | # select which datapoints to use for this model of the ensemble
48 | # you might find the num_data_per_ens variable defined above useful
49 |
50 | observations = # TODO(Q1)
51 | actions = # TODO(Q1)
52 | next_observations = # TODO(Q1)
53 |
54 | # use datapoints to update one of the dyn_models
55 | model = # TODO(Q1)
56 | log = model.update(observations, actions, next_observations,
57 | self.data_statistics)
58 | loss = log['Training Loss']
59 | losses.append(loss)
60 |
61 | avg_loss = np.mean(losses)
62 | return {
63 | 'Training Loss': avg_loss,
64 | }
65 |
66 | def add_to_replay_buffer(self, paths, add_sl_noise=False):
67 |
68 | # add data to replay buffer
69 | self.replay_buffer.add_rollouts(paths, noised=add_sl_noise)
70 |
71 | # get updated mean/std of the data in our replay buffer
72 | self.data_statistics = {
73 | 'obs_mean': np.mean(self.replay_buffer.obs, axis=0),
74 | 'obs_std': np.std(self.replay_buffer.obs, axis=0),
75 | 'acs_mean': np.mean(self.replay_buffer.acs, axis=0),
76 | 'acs_std': np.std(self.replay_buffer.acs, axis=0),
77 | 'delta_mean': np.mean(
78 | self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0),
79 | 'delta_std': np.std(
80 | self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0),
81 | }
82 |
83 | # update the actor's data_statistics too, so actor.get_action can be calculated correctly
84 | self.actor.data_statistics = self.data_statistics
85 |
86 | def sample(self, batch_size):
87 | # NOTE: sampling batch_size * ensemble_size,
88 | # so each model in our ensemble can get trained on batch_size data
89 | return self.replay_buffer.sample_random_data(
90 | batch_size * self.ensemble_size)
91 |
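The `TODO(Q1)` blanks in `train` above are part of the assignment. As a hedged sketch only (not the official solution), one common way to give each ensemble member a different batch is to slice the already-shuffled arrays returned by `sample` into `ensemble_size` disjoint chunks:

```python
# Hedged sketch of the TODO(Q1) blanks above; `i` and `num_data_per_ens` come from the loop.
# Since sample() draws batch_size * ensemble_size transitions at random, contiguous
# slices of that batch are themselves random and disjoint across models.
start = i * num_data_per_ens
end = (i + 1) * num_data_per_ens

observations = ob_no[start:end]
actions = ac_na[start:end]
next_observations = next_ob_no[start:end]

# use these datapoints to update the i-th dynamics model
model = self.dyn_models[i]
```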
--------------------------------------------------------------------------------
/hw4/cs285/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | def register_envs():
4 | register(
5 | id='cheetah-cs285-v0',
6 | entry_point='cs285.envs.cheetah:HalfCheetahEnv',
7 | max_episode_steps=1000,
8 | )
9 | register(
10 | id='obstacles-cs285-v0',
11 | entry_point='cs285.envs.obstacles:Obstacles',
12 | max_episode_steps=500,
13 | )
14 | register(
15 | id='reacher-cs285-v0',
16 | entry_point='cs285.envs.reacher:Reacher7DOFEnv',
17 | max_episode_steps=500,
18 | )
19 |
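A minimal usage sketch of `register_envs` (assuming the `cs285` package has been installed with `pip install -e .` and a working MuJoCo / `mujoco-py` setup):

```python
import gym
from cs285.envs import register_envs

register_envs()                      # adds the cs285 tasks to gym's registry
env = gym.make('cheetah-cs285-v0')   # max_episode_steps=1000, per the registration above
obs = env.reset()
print(obs.shape)
```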
--------------------------------------------------------------------------------
/hw4/cs285/envs/cheetah/__init__.py:
--------------------------------------------------------------------------------
1 | from cs285.envs.cheetah.cheetah import HalfCheetahEnv
2 |
--------------------------------------------------------------------------------
/hw4/cs285/envs/cheetah/cheetah.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import mujoco_py
3 | from gym import utils
4 | from gym.envs.mujoco import mujoco_env
5 |
6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
7 |
8 | def __init__(self):
9 |
10 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1)
11 | utils.EzPickle.__init__(self)
12 |
13 | self.skip = self.frame_skip
14 |
15 | self.action_dim = self.ac_dim = self.action_space.shape[0]
16 | self.observation_dim = self.obs_dim = self.observation_space.shape[0]
17 |
18 | def get_reward(self, observations, actions):
19 |
20 | """get reward/s of given (observations, actions) datapoint or datapoints
21 |
22 | Args:
23 | observations: (batchsize, obs_dim) or (obs_dim,)
24 | actions: (batchsize, ac_dim) or (ac_dim,)
25 |
26 | Return:
27 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,)
28 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,)
29 | """
30 |
31 | #initialize and reshape as needed, for batch mode
32 | self.reward_dict = {}
33 | if(len(observations.shape)==1):
34 | observations = np.expand_dims(observations, axis = 0)
35 | actions = np.expand_dims(actions, axis = 0)
36 | batch_mode = False
37 | else:
38 | batch_mode = True
39 |
40 | #get vars
41 | xvel = observations[:, 9].copy()
42 | body_angle = observations[:, 2].copy()
43 | front_leg = observations[:, 6].copy()
44 | front_shin = observations[:, 7].copy()
45 | front_foot = observations[:, 8].copy()
46 | zeros = np.zeros((observations.shape[0],)).copy()
47 |
48 | # ranges
49 | leg_range = 0.2
50 | shin_range = 0
51 | foot_range = 0
52 | penalty_factor = 10
53 |
54 | #calc rew
55 | self.reward_dict['run'] = xvel
56 |
57 | front_leg_rew = zeros.copy()
58 | front_leg_rew[front_leg>leg_range] = -penalty_factor
59 | self.reward_dict['leg'] = front_leg_rew
60 |
61 | front_shin_rew = zeros.copy()
62 | front_shin_rew[front_shin>shin_range] = -penalty_factor
63 | self.reward_dict['shin'] = front_shin_rew
64 |
65 | front_foot_rew = zeros.copy()
66 | front_foot_rew[front_foot>foot_range] = -penalty_factor
67 | self.reward_dict['foot'] = front_foot_rew
68 |
69 | # total reward
70 | self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot']
71 |
72 | #return
73 | dones = zeros.copy()
74 | if(not batch_mode):
75 | return self.reward_dict['r_total'][0], dones[0]
76 | return self.reward_dict['r_total'], dones
77 |
78 |
79 | def get_score(self, obs):
80 | xposafter = obs[0]
81 | return xposafter
82 |
83 | ##############################################
84 |
85 | def step(self, action):
86 |
87 | #step
88 | self.do_simulation(action, self.frame_skip)
89 |
90 | #obs/reward/done/score
91 | ob = self._get_obs()
92 | rew, done = self.get_reward(ob, action)
93 | score = self.get_score(ob)
94 |
95 | #return
96 | env_info = {'obs_dict': self.obs_dict,
97 | 'rewards': self.reward_dict,
98 | 'score': score}
99 | return ob, rew, done, env_info
100 |
101 | def _get_obs(self):
102 |
103 | self.obs_dict = {}
104 | self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy()
105 | self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy()
106 | self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy()
107 |
108 | return np.concatenate([
109 | self.obs_dict['joints_pos'], #9
110 | self.obs_dict['joints_vel'], #9
111 | self.obs_dict['com_torso'], #3
112 | ])
113 |
114 | ##############################################
115 |
116 | def reset_model(self, seed=None):
117 |
118 | # set reset pose/vel
119 | self.reset_pose = self.init_qpos + self.np_random.uniform(
120 | low=-.1, high=.1, size=self.model.nq)
121 | self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
122 |
123 | #reset the env to that pose/vel
124 | return self.do_reset(self.reset_pose.copy(), self.reset_vel.copy())
125 |
126 |
127 | def do_reset(self, reset_pose, reset_vel, reset_goal=None):
128 |
129 | #reset
130 | self.set_state(reset_pose, reset_vel)
131 |
132 | #return
133 | return self._get_obs()
134 |
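Because `get_reward` accepts either a single `(observation, action)` pair or a batch, a model-based controller can score many candidate actions in one call. A small shape-only sketch (zero-filled arrays are placeholders; requires a local MuJoCo install):

```python
import numpy as np
from cs285.envs.cheetah.cheetah import HalfCheetahEnv

env = HalfCheetahEnv()
obs_batch = np.zeros((5, 21))   # 9 qpos + 9 qvel + 3 torso COM, per _get_obs above
act_batch = np.zeros((5, 6))    # HalfCheetah action dimension

rewards, dones = env.get_reward(obs_batch, act_batch)            # both shape (5,)
r_single, d_single = env.get_reward(obs_batch[0], act_batch[0])  # scalars
```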
--------------------------------------------------------------------------------
/hw4/cs285/envs/obstacles/__init__.py:
--------------------------------------------------------------------------------
1 | from cs285.envs.obstacles.obstacles_env import Obstacles
2 |
--------------------------------------------------------------------------------
/hw4/cs285/envs/reacher/__init__.py:
--------------------------------------------------------------------------------
1 | from cs285.envs.reacher.reacher_env import Reacher7DOFEnv
2 |
--------------------------------------------------------------------------------
/hw4/cs285/envs/reacher/reacher_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from gym.envs.mujoco import mujoco_env
4 | from mujoco_py import MjViewer
5 | import os
6 |
7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle):
8 | def __init__(self):
9 |
10 | # placeholder
11 | self.hand_sid = -2
12 | self.target_sid = -1
13 |
14 | curr_dir = os.path.dirname(os.path.abspath(__file__))
15 | mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2)
16 | utils.EzPickle.__init__(self)
17 | self.observation_dim = 26
18 | self.action_dim = 7
19 |
20 | self.hand_sid = self.model.site_name2id("finger")
21 | self.target_sid = self.model.site_name2id("target")
22 | self.skip = self.frame_skip
23 |
24 |
25 | def _get_obs(self):
26 | return np.concatenate([
27 | self.data.qpos.flat, #[7]
28 | self.data.qvel.flatten() / 10., #[7]
29 | self.data.site_xpos[self.hand_sid], #[3]
30 | self.model.site_pos[self.target_sid], #[3]
31 | ])
32 |
33 | def step(self, a):
34 |
35 | self.do_simulation(a, self.frame_skip)
36 | ob = self._get_obs()
37 | reward, done = self.get_reward(ob, a)
38 |
39 | score = self.get_score(ob)
40 |
41 | # finalize step
42 | env_info = {'ob': ob,
43 | 'rewards': self.reward_dict,
44 | 'score': score}
45 |
46 | return ob, reward, done, env_info
47 |
48 | def get_score(self, obs):
49 | hand_pos = obs[-6:-3]
50 | target_pos = obs[-3:]
51 | score = -1*np.abs(hand_pos-target_pos)
52 | return score
53 |
54 | def get_reward(self, observations, actions):
55 |
56 | """get reward/s of given (observations, actions) datapoint or datapoints
57 |
58 | Args:
59 | observations: (batchsize, obs_dim) or (obs_dim,)
60 | actions: (batchsize, ac_dim) or (ac_dim,)
61 |
62 | Return:
63 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,)
64 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,)
65 | """
66 |
67 | #initialize and reshape as needed, for batch mode
68 | self.reward_dict = {}
69 | if(len(observations.shape)==1):
70 | observations = np.expand_dims(observations, axis = 0)
71 | actions = np.expand_dims(actions, axis = 0)
72 | batch_mode = False
73 | else:
74 | batch_mode = True
75 |
76 | #get vars
77 | hand_pos = observations[:, -6:-3]
78 | target_pos = observations[:, -3:]
79 |
80 | #calc rew
81 | dist = np.linalg.norm(hand_pos - target_pos, axis=1)
82 | self.reward_dict['r_total'] = -10*dist
83 |
84 | #done is always false for this env
85 | dones = np.zeros((observations.shape[0],))
86 |
87 | #return
88 | if(not batch_mode):
89 | return self.reward_dict['r_total'][0], dones[0]
90 | return self.reward_dict['r_total'], dones
91 |
92 | def reset(self):
93 | _ = self.reset_model()
94 |
95 | self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1]
96 |
97 | observation, _reward, done, _info = self.step(np.zeros(7))
98 | ob = self._get_obs()
99 |
100 | return ob
101 |
102 | def reset_model(self, seed=None):
103 | if seed is not None:
104 | self.seed(seed)
105 |
106 | self.reset_pose = self.init_qpos.copy()
107 | self.reset_vel = self.init_qvel.copy()
108 |
109 | self.reset_goal = np.zeros(3)
110 | self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3)
111 | self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2)
112 | self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25)
113 |
114 | return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal)
115 |
116 | def do_reset(self, reset_pose, reset_vel, reset_goal):
117 |
118 | self.set_state(reset_pose, reset_vel)
119 |
120 | #reset target
121 | self.reset_goal = reset_goal.copy()
122 | self.model.site_pos[self.target_sid] = self.reset_goal
123 | self.sim.forward()
124 |
125 | #return
126 | return self._get_obs()
--------------------------------------------------------------------------------
/hw4/cs285/infrastructure/colab_utils.py:
--------------------------------------------------------------------------------
1 | from gym.wrappers import Monitor
2 | import glob
3 | import io
4 | import base64
5 | from IPython.display import HTML
6 | from IPython import display as ipythondisplay
7 |
8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI
9 |
10 | def show_video():
11 | mp4list = glob.glob('/content/video/*.mp4')
12 | if len(mp4list) > 0:
13 | mp4 = mp4list[0]
14 | video = io.open(mp4, 'r+b').read()
15 | encoded = base64.b64encode(video)
16 | ipythondisplay.display(HTML(data='''<video alt="test" autoplay
17 | loop controls style="height: 400px;">
18 | <source src="data:video/mp4;base64,{0}" type="video/mp4" />
19 | </video>'''.format(encoded.decode('ascii'))))
20 | else:
21 | print("Could not find video")
22 |
23 |
24 | def wrap_env(env):
25 | env = Monitor(env, '/content/video', force=True)
26 | return env
27 |
--------------------------------------------------------------------------------
/hw4/cs285/infrastructure/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tensorboardX import SummaryWriter
3 | import numpy as np
4 |
5 | class Logger:
6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
7 | self._log_dir = log_dir
8 | print('########################')
9 | print('logging outputs to ', log_dir)
10 | print('########################')
11 | self._n_logged_samples = n_logged_samples
12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)
13 |
14 | def log_scalar(self, scalar, name, step_):
15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_)
16 |
17 | def log_scalars(self, scalar_dict, group_name, step, phase):
18 | """Will log all scalars in the same plot."""
19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)
20 |
21 | def log_image(self, image, name, step):
22 | assert(len(image.shape) == 3) # [C, H, W]
23 | self._summ_writer.add_image('{}'.format(name), image, step)
24 |
25 | def log_video(self, video_frames, name, step, fps=10):
26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)
28 |
29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):
30 |
31 | # reshape the rollouts
32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths]
33 |
34 | # max rollout length
35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)])
36 | max_length = videos[0].shape[0]
37 | for i in range(max_videos_to_save):
38 | if videos[i].shape[0]>max_length:
39 | max_length = videos[i].shape[0]
40 |
41 | # pad rollouts to all be same length
42 | for i in range(max_videos_to_save):
43 | if videos[i].shape[0]<max_length:
44 | padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1))
45 | videos[i] = np.concatenate([videos[i], padding], 0)
46 |
47 | # log videos to tensorboard event file
48 | videos = np.stack(videos[:max_videos_to_save], 0)
49 | self.log_video(videos, video_title, step, fps=fps)
50 |
51 | def log_figures(self, figure, name, step, phase):
52 | """figure: matplotlib.pyplot figure handle"""
53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
55 |
56 | def log_figure(self, figure, name, step, phase):
57 | """figure: matplotlib.pyplot figure handle"""
58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
59 |
60 | def log_graph(self, array, name, step, phase):
61 | """figure: matplotlib.pyplot figure handle"""
62 | im = plot_graph(array)
63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)
64 |
65 | def dump_scalars(self, log_path=None):
66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
67 | self._summ_writer.export_scalars_to_json(log_path)
68 |
69 | def flush(self):
70 | self._summ_writer.flush()
71 |
72 |
73 |
74 |
75 |
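A minimal usage sketch of this `Logger` (the log directory here is a hypothetical path; the resulting event files can be inspected with TensorBoard):

```python
from cs285.infrastructure.logger import Logger

logger = Logger('/tmp/cs285_logger_demo')          # hypothetical log directory
logger.log_scalar(3.14, 'Eval_AverageReturn', 0)   # (scalar, name, step_)
logger.flush()
```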
--------------------------------------------------------------------------------
/hw4/cs285/infrastructure/pytorch_util.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import torch
4 | from torch import nn
5 |
6 | Activation = Union[str, nn.Module]
7 |
8 |
9 | _str_to_activation = {
10 | 'relu': nn.ReLU(),
11 | 'tanh': nn.Tanh(),
12 | 'leaky_relu': nn.LeakyReLU(),
13 | 'sigmoid': nn.Sigmoid(),
14 | 'selu': nn.SELU(),
15 | 'softplus': nn.Softplus(),
16 | 'identity': nn.Identity(),
17 | }
18 |
19 |
20 | def build_mlp(
21 | input_size: int,
22 | output_size: int,
23 | n_layers: int,
24 | size: int,
25 | activation: Activation = 'tanh',
26 | output_activation: Activation = 'identity',
27 | ):
28 | """
29 | Builds a feedforward neural network
30 | arguments:
31 | input_placeholder: placeholder variable for the state (batch_size, input_size)
32 | scope: variable scope of the network
33 | n_layers: number of hidden layers
34 | size: dimension of each hidden layer
35 | activation: activation of each hidden layer
36 | input_size: size of the input layer
37 | output_size: size of the output layer
38 | output_activation: activation of the output layer
39 | returns:
40 | output_placeholder: the result of a forward pass through the hidden layers + the output layer
41 | """
42 | if isinstance(activation, str):
43 | activation = _str_to_activation[activation]
44 | if isinstance(output_activation, str):
45 | output_activation = _str_to_activation[output_activation]
46 | layers = []
47 | in_size = input_size
48 | for _ in range(n_layers):
49 | layers.append(nn.Linear(in_size, size))
50 | layers.append(activation)
51 | in_size = size
52 | layers.append(nn.Linear(in_size, output_size))
53 | layers.append(output_activation)
54 | return nn.Sequential(*layers)
55 |
56 |
57 | device = None
58 |
59 |
60 | def init_gpu(use_gpu=True, gpu_id=0):
61 | global device
62 | if torch.cuda.is_available() and use_gpu:
63 | device = torch.device("cuda:" + str(gpu_id))
64 | print("Using GPU id {}".format(gpu_id))
65 | else:
66 | device = torch.device("cpu")
67 | print("GPU not detected. Defaulting to CPU.")
68 |
69 |
70 | def set_device(gpu_id):
71 | torch.cuda.set_device(gpu_id)
72 |
73 |
74 | def from_numpy(*args, **kwargs):
75 | return torch.from_numpy(*args, **kwargs).float().to(device)
76 |
77 |
78 | def to_numpy(tensor):
79 | return tensor.to('cpu').detach().numpy()
80 |
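For example, the hw4 dynamics model builds its network with `build_mlp`, mapping a concatenated (observation, action) vector to a predicted state delta (as suggested by the `delta_mean`/`delta_std` statistics in `mb_agent.py`). A quick sketch with illustrative dimensions:

```python
import torch
from cs285.infrastructure import pytorch_util as ptu

ob_dim, ac_dim = 21, 6                    # illustrative cheetah-like dimensions
delta_net = ptu.build_mlp(
    input_size=ob_dim + ac_dim,
    output_size=ob_dim,
    n_layers=2,                           # hw4 default --n_layers
    size=250,                             # hw4 default --size
)
x = torch.zeros(32, ob_dim + ac_dim)      # a batch of 32 (obs, action) inputs
print(delta_net(x).shape)                 # torch.Size([32, 21])
```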
--------------------------------------------------------------------------------
/hw4/cs285/infrastructure/replay_buffer.py:
--------------------------------------------------------------------------------
1 | from cs285.infrastructure.utils import *
2 |
3 |
4 | class ReplayBuffer(object):
5 |
6 | def __init__(self, max_size=1000000):
7 |
8 | self.max_size = max_size
9 | self.paths = []
10 | self.obs = None
11 | self.acs = None
12 | self.concatenated_rews = None
13 | self.next_obs = None
14 | self.terminals = None
15 |
16 | def add_rollouts(self, paths, noised=False):
17 |
18 | # add new rollouts into our list of rollouts
19 | for path in paths:
20 | self.paths.append(path)
21 |
22 | # convert new rollouts into their component arrays, and append them onto our arrays
23 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths)
24 |
25 | if noised:
26 | observations = add_noise(observations)
27 | next_observations = add_noise(next_observations)
28 |
29 | if self.obs is None:
30 | self.obs = observations[-self.max_size:]
31 | self.acs = actions[-self.max_size:]
32 | self.next_obs = next_observations[-self.max_size:]
33 | self.terminals = terminals[-self.max_size:]
34 | self.concatenated_rews = concatenated_rews[-self.max_size:]
35 | else:
36 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:]
37 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:]
38 | self.next_obs = np.concatenate(
39 | [self.next_obs, next_observations]
40 | )[-self.max_size:]
41 | self.terminals = np.concatenate(
42 | [self.terminals, terminals]
43 | )[-self.max_size:]
44 | self.concatenated_rews = np.concatenate(
45 | [self.concatenated_rews, concatenated_rews]
46 | )[-self.max_size:]
47 |
48 | ########################################
49 | ########################################
50 |
51 | def sample_random_rollouts(self, num_rollouts):
52 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts]
53 | return self.paths[rand_indices]
54 |
55 | def sample_recent_rollouts(self, num_rollouts=1):
56 | return self.paths[-num_rollouts:]
57 |
58 | ########################################
59 | ########################################
60 |
61 | def sample_random_data(self, batch_size):
62 |
63 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0]
64 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size]
65 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices]
66 |
67 | def sample_recent_data(self, batch_size=1, concat_rew=True):
68 |
69 | if concat_rew:
70 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:]
71 | else:
72 | num_recent_rollouts_to_return = 0
73 | num_datapoints_so_far = 0
74 | index = -1
75 | while num_datapoints_so_far < batch_size:
76 | recent_rollout = self.paths[index]
77 | index -=1
78 | num_recent_rollouts_to_return +=1
79 | num_datapoints_so_far += get_pathlength(recent_rollout)
80 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:]
81 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return)
82 | return observations, actions, unconcatenated_rews, next_observations, terminals
83 |
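A short usage sketch: rollouts produced by the `Path` helper in `utils.py` are added with `add_rollouts`, and training batches are drawn with `sample_random_data` (the MB agent requests `batch_size * ensemble_size` transitions, as noted in `mb_agent.py`). The rollout below is a tiny fabricated example just to show the data flow:

```python
import numpy as np
from cs285.infrastructure.replay_buffer import ReplayBuffer
from cs285.infrastructure.utils import Path

# one fake 5-step rollout with 3-dim observations and 2-dim actions (placeholder data)
fake_path = Path(obs=np.zeros((5, 3)), image_obs=[], acs=np.zeros((5, 2)),
                 rewards=np.zeros(5), next_obs=np.zeros((5, 3)), terminals=np.zeros(5))

buffer = ReplayBuffer(max_size=1000000)
buffer.add_rollouts([fake_path], noised=True)   # noised=True applies add_noise to observations
obs, acs, rews, next_obs, terminals = buffer.sample_random_data(batch_size=4)
print(obs.shape, acs.shape)                     # (4, 3) (4, 2)
```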
--------------------------------------------------------------------------------
/hw4/cs285/infrastructure/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import copy
4 |
5 | ############################################
6 | ############################################
7 |
8 | def calculate_mean_prediction_error(env, action_sequence, models, data_statistics):
9 |
10 | model = models[0]
11 |
12 | # true
13 | true_states = perform_actions(env, action_sequence)['observation']
14 |
15 | # predicted
16 | ob = np.expand_dims(true_states[0],0)
17 | pred_states = []
18 | for ac in action_sequence:
19 | pred_states.append(ob)
20 | action = np.expand_dims(ac,0)
21 | ob = model.get_prediction(ob, action, data_statistics)
22 | pred_states = np.squeeze(pred_states)
23 |
24 | # mpe
25 | mpe = mean_squared_error(pred_states, true_states)
26 |
27 | return mpe, true_states, pred_states
28 |
29 | def perform_actions(env, actions):
30 | ob = env.reset()
31 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
32 | steps = 0
33 | for ac in actions:
34 | obs.append(ob)
35 | acs.append(ac)
36 | ob, rew, done, _ = env.step(ac)
37 | # add the observation after taking a step to next_obs
38 | next_obs.append(ob)
39 | rewards.append(rew)
40 | steps += 1
41 | # If the episode ended, the corresponding terminal value is 1
42 | # otherwise, it is 0
43 | if done:
44 | terminals.append(1)
45 | break
46 | else:
47 | terminals.append(0)
48 |
49 | return Path(obs, image_obs, acs, rewards, next_obs, terminals)
50 |
51 | def mean_squared_error(a, b):
52 | return np.mean((a-b)**2)
53 |
54 | ############################################
55 | ############################################
56 |
57 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')):
58 | # TODO: get this from Piazza
59 |
60 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')):
61 | """
62 | Collect rollouts using policy
63 | until we have collected min_timesteps_per_batch steps
64 | """
65 | # TODO: get this from Piazza
66 |
67 | return paths, timesteps_this_batch
68 |
69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')):
70 | """
71 | Collect ntraj rollouts using policy
72 | """
73 | # TODO: get this from Piazza
74 |
75 | return paths
76 |
77 | ############################################
78 | ############################################
79 |
80 | def Path(obs, image_obs, acs, rewards, next_obs, terminals):
81 | """
82 | Take info (separate arrays) from a single rollout
83 | and return it in a single dictionary
84 | """
85 | if image_obs != []:
86 | image_obs = np.stack(image_obs, axis=0)
87 | return {"observation" : np.array(obs, dtype=np.float32),
88 | "image_obs" : np.array(image_obs, dtype=np.uint8),
89 | "reward" : np.array(rewards, dtype=np.float32),
90 | "action" : np.array(acs, dtype=np.float32),
91 | "next_observation": np.array(next_obs, dtype=np.float32),
92 | "terminal": np.array(terminals, dtype=np.float32)}
93 |
94 |
95 | def convert_listofrollouts(paths):
96 | """
97 | Take a list of rollout dictionaries
98 | and return separate arrays,
99 | where each array is a concatenation of that array from across the rollouts
100 | """
101 | observations = np.concatenate([path["observation"] for path in paths])
102 | actions = np.concatenate([path["action"] for path in paths])
103 | next_observations = np.concatenate([path["next_observation"] for path in paths])
104 | terminals = np.concatenate([path["terminal"] for path in paths])
105 | concatenated_rewards = np.concatenate([path["reward"] for path in paths])
106 | unconcatenated_rewards = [path["reward"] for path in paths]
107 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards
108 |
109 | ############################################
110 | ############################################
111 |
112 | def get_pathlength(path):
113 | return len(path["reward"])
114 |
115 | def normalize(data, mean, std, eps=1e-8):
116 | return (data-mean)/(std+eps)
117 |
118 | def unnormalize(data, mean, std):
119 | return data*std+mean
120 |
121 | def add_noise(data_inp, noiseToSignal=0.01):
122 |
123 | data = copy.deepcopy(data_inp) #(num data points, dim)
124 |
125 | #mean of data
126 | mean_data = np.mean(data, axis=0)
127 |
128 | #if mean is 0,
129 | #make it 0.001 to avoid 0 issues later for dividing by std
130 | mean_data[mean_data == 0] = 0.000001
131 |
132 | #width of normal distribution to sample noise from
133 | #larger magnitude number = could have larger magnitude noise
134 | std_of_noise = mean_data * noiseToSignal
135 | for j in range(mean_data.shape[0]):
136 | data[:, j] = np.copy(data[:, j] + np.random.normal(
137 | 0, np.absolute(std_of_noise[j]), (data.shape[0],)))
138 |
139 | return data
140 |
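The three rollout functions above are deliberately left as `TODO: get this from Piazza` stubs. Purely as a hedged sketch of the expected behavior (not the released solution), `sample_trajectory` is normally a standard interaction loop that stops on `done` or after `max_path_length` steps and packages the result with the `Path` helper defined later in this file; video rendering is omitted here for brevity:

```python
import numpy as np
from cs285.infrastructure.utils import Path  # the helper defined in this file


def sample_trajectory_sketch(env, policy, max_path_length):
    """Hedged sketch of one rollout; the graded version should follow the course's release."""
    ob = env.reset()
    obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], []
    steps = 0
    while True:
        obs.append(ob)
        ac = policy.get_action(ob)[0]   # policies in this repo return a batch of one action
        acs.append(ac)
        ob, rew, done, _ = env.step(ac)
        steps += 1
        next_obs.append(ob)
        rewards.append(rew)
        rollout_done = 1 if (done or steps >= max_path_length) else 0
        terminals.append(rollout_done)
        if rollout_done:
            break
    return Path(obs, image_obs, acs, rewards, next_obs, terminals)
```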
--------------------------------------------------------------------------------
/hw4/cs285/models/base_model.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from typing import Any
3 |
4 |
5 | Prediction = Any
6 |
7 |
8 | class BaseModel(object):
9 | def update(self, ob_no, next_ob_no, re_n, terminal_n) -> dict:
10 | raise NotImplementedError
11 |
12 | def get_prediction(self, ob_no, ac_na, data_statistics) -> Prediction:
13 | raise NotImplementedError
14 |
15 | def convert_prediction_to_numpy(self, pred: Prediction) -> np.ndarray:
16 | """Allow caller to be pytorch-agnostic."""
17 | raise NotImplementedError
18 |
--------------------------------------------------------------------------------
/hw4/cs285/policies/MPC_policy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from .base_policy import BasePolicy
4 |
5 |
6 | class MPCPolicy(BasePolicy):
7 |
8 | def __init__(self,
9 | env,
10 | ac_dim,
11 | dyn_models,
12 | horizon,
13 | N,
14 | **kwargs
15 | ):
16 | super().__init__(**kwargs)
17 |
18 | # init vars
19 | self.env = env
20 | self.dyn_models = dyn_models
21 | self.horizon = horizon
22 | self.N = N
23 | self.data_statistics = None # NOTE must be updated from elsewhere
24 |
25 | self.ob_dim = self.env.observation_space.shape[0]
26 |
27 | # action space
28 | self.ac_space = self.env.action_space
29 | self.ac_dim = ac_dim
30 | self.low = self.ac_space.low
31 | self.high = self.ac_space.high
32 |
33 | def sample_action_sequences(self, num_sequences, horizon):
34 | # TODO(Q1) uniformly sample trajectories and return an array of
35 | # dimensions (num_sequences, horizon, self.ac_dim) in the range
36 | # [self.low, self.high]
37 | return random_action_sequences
38 |
39 | def get_action(self, obs):
40 |
41 | if self.data_statistics is None:
42 | # print("WARNING: performing random actions.")
43 | return self.sample_action_sequences(num_sequences=1, horizon=1)[0]
44 |
45 | # sample random actions (N x horizon)
46 | candidate_action_sequences = self.sample_action_sequences(
47 | num_sequences=self.N, horizon=self.horizon)
48 |
49 | # for each model in ensemble:
50 | predicted_sum_of_rewards_per_model = []
51 | for model in self.dyn_models:
52 | sum_of_rewards = self.calculate_sum_of_rewards(
53 | obs, candidate_action_sequences, model)
54 | predicted_sum_of_rewards_per_model.append(sum_of_rewards)
55 |
56 | # calculate mean_across_ensembles(predicted rewards)
57 | predicted_rewards = np.mean(
58 | predicted_sum_of_rewards_per_model, axis=0) # [ens, N] --> N
59 |
60 | # pick the action sequence and return the 1st element of that sequence
61 | best_action_sequence = None # TODO (Q2)
62 | action_to_take = None # TODO (Q2)
63 | return action_to_take[None] # Unsqueeze the first index
64 |
65 | def calculate_sum_of_rewards(self, obs, candidate_action_sequences, model):
66 | """
67 |
68 | :param obs: numpy array with the current observation. Shape [D_obs]
69 | :param candidate_action_sequences: numpy array with the candidate action
70 | sequences. Shape [N, H, D_action] where
71 | - N is the number of action sequences considered
72 | - H is the horizon
73 | - D_action is the dimension of the action
74 | :param model: The current dynamics model.
75 | :return: numpy array with the sum of rewards for each action sequence.
76 | The array should have shape [N].
77 | """
78 | sum_of_rewards = None # TODO (Q2)
79 | # For each candidate action sequence, predict a sequence of
80 | # states for each dynamics model in your ensemble.
81 | # Once you have a sequence of predicted states from each model in
82 | # your ensemble, calculate the sum of rewards for each sequence
83 | # using `self.env.get_reward(predicted_obs)`
84 | # You should sum across `self.horizon` time steps.
85 | # Hint: you should use model.get_prediction and you shouldn't need
86 | # to import pytorch in this file.
87 | # Hint: Remember that the model can process observations and actions
88 | # in batch, which can be much faster than looping through each
89 | # action sequence.
90 | return sum_of_rewards
91 |
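The `TODO(Q1)` and `TODO(Q2)` blanks above are part of the assignment. As a hedged sketch of the intended shapes only (not the official solution), uniform random shooting and the final action selection typically look like the snippets below, which would slot into `sample_action_sequences` and `get_action` respectively:

```python
# Hedged sketch of the TODOs above (contextual fill-ins, not standalone code).

# TODO(Q1): uniform samples in [self.low, self.high], shape (num_sequences, horizon, ac_dim)
random_action_sequences = np.random.uniform(
    low=self.low, high=self.high,
    size=(num_sequences, horizon, self.ac_dim))

# TODO(Q2): pick the sequence with the highest mean predicted return
best_index = np.argmax(predicted_rewards)                       # predicted_rewards has shape [N]
best_action_sequence = candidate_action_sequences[best_index]   # shape [H, D_action]
action_to_take = best_action_sequence[0]                        # first action of the best sequence
```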
--------------------------------------------------------------------------------
/hw4/cs285/policies/base_policy.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import numpy as np
3 |
4 |
5 | class BasePolicy(object, metaclass=abc.ABCMeta):
6 | def get_action(self, obs: np.ndarray) -> np.ndarray:
7 | raise NotImplementedError
8 |
9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict:
10 | """Return a dictionary of logging information."""
11 | raise NotImplementedError
12 |
13 | def save(self, filepath: str):
14 | raise NotImplementedError
15 |
--------------------------------------------------------------------------------
/hw4/cs285/scripts/filter_events.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Usage:
4 |
5 | Run the command
6 | ```
7 | python filter_events.py --event SOME_DIRECTORY/EVENT_FILE
8 | ```
9 |
10 | and it will generate a directory named `SOME_DIRECTORY_filtered` with the video
11 | events removed.
12 | """
13 | from __future__ import print_function
14 | import os
15 | import sys
16 | import argparse
17 | import tqdm
18 |
19 | # Adapted from
20 | # https://gist.github.com/serycjon/c9ad58ecc3176d87c49b69b598f4d6c6
21 |
22 | import tensorflow as tf
23 |
24 |
25 | def parse_arguments():
26 | parser = argparse.ArgumentParser(description='')
27 | parser.add_argument('--event', help='event file', required=True)
28 |
29 | return parser.parse_args()
30 |
31 |
32 | def main(args):
33 | out_path = os.path.dirname(args.event) + '_filtered'
34 | writer = tf.summary.FileWriter(out_path)
35 |
36 | total = None
37 | for event in tqdm.tqdm(tf.train.summary_iterator(args.event), total=total):
38 | event_type = event.WhichOneof('what')
39 | if event_type != 'summary':
40 | writer.add_event(event)
41 | else:
42 | wall_time = event.wall_time
43 | step = event.step
44 | filtered_values = [value for value in event.summary.value if
45 | 'rollouts' not in value.tag]
46 | summary = tf.Summary(value=filtered_values)
47 |
48 | filtered_event = tf.summary.Event(summary=summary,
49 | wall_time=wall_time,
50 | step=step)
51 | writer.add_event(filtered_event)
52 | writer.close()
53 | return 0
54 |
55 |
56 | if __name__ == '__main__':
57 | args = parse_arguments()
58 | sys.exit(main(args))
59 |
--------------------------------------------------------------------------------
/hw4/cs285/scripts/read_results.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import tensorflow as tf
3 |
4 | def get_section_results(file):
5 | """
6 | requires tensorflow==1.12.0
7 | """
8 | X = []
9 | Y = []
10 | for e in tf.train.summary_iterator(file):
11 | for v in e.summary.value:
12 | if v.tag == 'Train_EnvstepsSoFar':
13 | X.append(v.simple_value)
14 | elif v.tag == 'Eval_AverageReturn':
15 | Y.append(v.simple_value)
16 | return X, Y
17 |
18 | if __name__ == '__main__':
19 | import glob
20 |
21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*'
22 | eventfile = glob.glob(logdir)[0]
23 |
24 | X, Y = get_section_results(eventfile)
25 | for i, (x, y) in enumerate(zip(X, Y)):
26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y))
--------------------------------------------------------------------------------
/hw4/cs285/scripts/run_hw4_mb.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | from cs285.infrastructure.rl_trainer import RL_Trainer
5 | from cs285.agents.mb_agent import MBAgent
6 |
7 |
8 | class MB_Trainer(object):
9 |
10 | def __init__(self, params):
11 |
12 | #####################
13 | ## SET AGENT PARAMS
14 | #####################
15 |
16 | computation_graph_args = {
17 | 'ensemble_size': params['ensemble_size'],
18 | 'n_layers': params['n_layers'],
19 | 'size': params['size'],
20 | 'learning_rate': params['learning_rate'],
21 | }
22 |
23 | train_args = {
24 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
25 | }
26 |
27 | controller_args = {
28 | 'mpc_horizon': params['mpc_horizon'],
29 | 'mpc_num_action_sequences': params['mpc_num_action_sequences'],
30 | }
31 |
32 | agent_params = {**computation_graph_args, **train_args, **controller_args}
33 |
34 | self.params = params
35 | self.params['agent_class'] = MBAgent
36 | self.params['agent_params'] = agent_params
37 |
38 | ################
39 | ## RL TRAINER
40 | ################
41 |
42 | self.rl_trainer = RL_Trainer(self.params)
43 |
44 | def run_training_loop(self):
45 |
46 | self.rl_trainer.run_training_loop(
47 | self.params['n_iter'],
48 | collect_policy = self.rl_trainer.agent.actor,
49 | eval_policy = self.rl_trainer.agent.actor,
50 | )
51 |
52 |
53 | def main():
54 |
55 | import argparse
56 | parser = argparse.ArgumentParser()
57 | parser.add_argument('--env_name', type=str) #reacher-cs285-v0, cheetah-cs285-v0, obstacles-cs285-v0
58 | parser.add_argument('--ep_len', type=int, default=200)
59 | parser.add_argument('--exp_name', type=str, default='todo')
60 | parser.add_argument('--n_iter', '-n', type=int, default=20)
61 |
62 | parser.add_argument('--ensemble_size', '-e', type=int, default=3)
63 | parser.add_argument('--mpc_horizon', type=int, default=10)
64 | parser.add_argument('--mpc_num_action_sequences', type=int, default=1000)
65 |
66 | parser.add_argument('--add_sl_noise', '-noise', action='store_true')
67 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1000)
68 | parser.add_argument('--batch_size_initial', type=int, default=20000) #(random) steps collected on 1st iteration (put into replay buffer)
69 | parser.add_argument('--batch_size', '-b', type=int, default=8000) #steps collected per train iteration (put into replay buffer)
70 | parser.add_argument('--train_batch_size', '-tb', type=int, default=512) #steps used per gradient step (used for training)
71 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration
72 |
73 | parser.add_argument('--learning_rate', '-lr', type=float, default=0.001)
74 | parser.add_argument('--n_layers', '-l', type=int, default=2)
75 | parser.add_argument('--size', '-s', type=int, default=250)
76 |
77 | parser.add_argument('--seed', type=int, default=1)
78 | parser.add_argument('--no_gpu', '-ngpu', action='store_true')
79 | parser.add_argument('--which_gpu', '-gpu_id', default=0)
80 | parser.add_argument('--video_log_freq', type=int, default=1) #-1 to disable
81 | parser.add_argument('--scalar_log_freq', type=int, default=1) #-1 to disable
82 | parser.add_argument('--save_params', action='store_true')
83 | args = parser.parse_args()
84 |
85 | # convert to dictionary
86 | params = vars(args)
87 |
88 | # HARDCODE EPISODE LENGTHS FOR THE ENVS USED IN THIS MB ASSIGNMENT
89 | if params['env_name']=='reacher-cs285-v0':
90 | params['ep_len']=200
91 | if params['env_name']=='cheetah-cs285-v0':
92 | params['ep_len']=500
93 | if params['env_name']=='obstacles-cs285-v0':
94 | params['ep_len']=100
95 |
96 | ##################################
97 | ### CREATE DIRECTORY FOR LOGGING
98 | ##################################
99 |
100 | logdir_prefix = 'hw4_' # keep for autograder
101 |
102 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data')
103 |
104 | if not (os.path.exists(data_path)):
105 | os.makedirs(data_path)
106 |
107 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
108 | logdir = os.path.join(data_path, logdir)
109 | params['logdir'] = logdir
110 | if not(os.path.exists(logdir)):
111 | os.makedirs(logdir)
112 |
113 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n")
114 |
115 | ###################
116 | ### RUN TRAINING
117 | ###################
118 |
119 | trainer = MB_Trainer(params)
120 | trainer.run_training_loop()
121 |
122 |
123 | if __name__ == "__main__":
124 | main()
125 |
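For reference, an illustrative local invocation (run from the hw4 directory; the experiment names and hyperparameters for the graded runs are given in the assignment PDF):

```
python cs285/scripts/run_hw4_mb.py --env_name obstacles-cs285-v0 --exp_name mb_obstacles_test \
    --add_sl_noise --n_iter 1 --video_log_freq -1
```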
--------------------------------------------------------------------------------
/hw4/cs285_hw4.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw4/cs285_hw4.pdf
--------------------------------------------------------------------------------
/hw4/requirements.txt:
--------------------------------------------------------------------------------
1 | gym[atari]==0.17.2
2 | mujoco-py==2.0.2.2
3 | tensorboard==2.3.0
4 | tensorboardX==1.8
5 | matplotlib==2.2.2
6 | ipython==6.4.0
7 | moviepy==1.0.0
8 | pyvirtualdisplay==1.3.2
9 | torch==1.5.1
10 | opencv-python==4.4.0.42
11 | ipdb==0.13.3
12 | box2d-py
13 |
--------------------------------------------------------------------------------
/hw4/requirements_colab.txt:
--------------------------------------------------------------------------------
1 | gym[atari]==0.17.2
2 | tensorboard==2.3.0
3 | tensorboardX==1.8
4 | matplotlib==2.2.2
5 | ipython==6.4.0
6 | moviepy==1.0.0
7 | pyvirtualdisplay==1.3.2
8 | torch==1.5.1
9 | opencv-python==4.4.0.42
10 | ipdb==0.13.3
11 | box2d-py
12 |
--------------------------------------------------------------------------------
/hw4/setup.py:
--------------------------------------------------------------------------------
1 | # setup.py
2 | from setuptools import setup
3 |
4 | setup(
5 | name='cs285',
6 | version='0.1.0',
7 | packages=['cs285'],
8 | )
--------------------------------------------------------------------------------
/hw5/README.md:
--------------------------------------------------------------------------------
1 | ## Setup
2 |
3 | You can run this code on your own machine or on Google Colab.
4 |
5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally.
6 |
7 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below:
8 |
9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw5/run_hw5_expl.ipynb)
10 |
11 | ## Complete the code
12 |
13 | The following files have blanks to be filled with your solutions from homework 1 and 3. The relevant sections are marked with `TODO: get this from Piazza`.
14 |
15 | - [infrastructure/utils.py](cs285/infrastructure/utils.py)
16 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py)
17 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py)
18 | - [policies/argmax_policy.py](cs285/policies/argmax_policy.py)
19 | - [critics/dqn_critic.py](cs285/critics/dqn_critic.py)
20 |
21 | You will then need to implement code in the following files:
22 | - [exploration/rnd_model.py](cs285/exploration/rnd_model.py)
23 | - [agents/explore_or_exploit_agent.py](cs285/agents/explore_or_exploit_agent.py)
24 | - [critics/cql_critic.py](cs285/critics/cql_critic.py)
25 |
26 | The relevant sections are marked with `TODO`.
27 |
28 | You may also want to look through [scripts/run_hw5_expl.py](cs285/scripts/run_hw5_expl.py) (if running locally) or [run_hw5_expl.ipynb](run_hw5_expl.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook.
29 |
30 | See the [assignment PDF](hw5.pdf) for more details on what files to edit.
31 |
32 | For this particular assignment, you will need to install `networkx==2.5`.
33 |
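That is, in the same environment used for the rest of the assignment:

```
pip install networkx==2.5
```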
--------------------------------------------------------------------------------
/hw5/cs285/agents/ac_agent.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | from cs285.critics.bootstrapped_continuous_critic import \
4 | BootstrappedContinuousCritic
5 | from cs285.infrastructure.replay_buffer import ReplayBuffer
6 | from cs285.infrastructure.utils import *
7 | from cs285.policies.MLP_policy import MLPPolicyAC
8 | from .base_agent import BaseAgent
9 |
10 |
11 | class ACAgent(BaseAgent):
12 | def __init__(self, env, agent_params):
13 | super(ACAgent, self).__init__()
14 |
15 | self.env = env
16 | self.agent_params = agent_params
17 |
18 | self.gamma = self.agent_params['gamma']
19 | self.standardize_advantages = self.agent_params['standardize_advantages']
20 |
21 | self.actor = MLPPolicyAC(
22 | self.agent_params['ac_dim'],
23 | self.agent_params['ob_dim'],
24 | self.agent_params['n_layers'],
25 | self.agent_params['size'],
26 | self.agent_params['discrete'],
27 | self.agent_params['learning_rate'],
28 | )
29 | self.critic = BootstrappedContinuousCritic(self.agent_params)
30 |
31 | self.replay_buffer = ReplayBuffer()
32 |
33 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
34 | raise NotImplementedError
35 | # Not needed for this homework
36 |
37 | ####################################
38 | ####################################
39 |
40 | def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n):
41 | raise NotImplementedError
42 | # Not needed for this homework
43 |
44 | ####################################
45 | ####################################
46 |
--------------------------------------------------------------------------------
/hw5/cs285/agents/base_agent.py:
--------------------------------------------------------------------------------
1 | class BaseAgent(object):
2 | def __init__(self, **kwargs):
3 | super(BaseAgent, self).__init__(**kwargs)
4 |
5 | def train(self) -> dict:
6 | """Return a dictionary of logging information."""
7 | raise NotImplementedError
8 |
9 | def add_to_replay_buffer(self, paths):
10 | raise NotImplementedError
11 |
12 | def sample(self, batch_size):
13 | raise NotImplementedError
14 |
15 | def save(self, path):
16 | raise NotImplementedError
--------------------------------------------------------------------------------
/hw5/cs285/agents/dqn_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pdb
3 |
4 | from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule
5 | from cs285.policies.argmax_policy import ArgMaxPolicy
6 | from cs285.critics.dqn_critic import DQNCritic
7 |
8 |
9 | class DQNAgent(object):
10 | def __init__(self, env, agent_params):
11 |
12 | self.env = env
13 | self.agent_params = agent_params
14 | self.batch_size = agent_params['batch_size']
15 | # import ipdb; ipdb.set_trace()
16 | self.last_obs = self.env.reset()
17 |
18 | self.num_actions = agent_params['ac_dim']
19 | self.learning_starts = agent_params['learning_starts']
20 | self.learning_freq = agent_params['learning_freq']
21 | self.target_update_freq = agent_params['target_update_freq']
22 |
23 | self.replay_buffer_idx = None
24 | self.exploration = agent_params['exploration_schedule']
25 | self.optimizer_spec = agent_params['optimizer_spec']
26 |
27 | self.critic = DQNCritic(agent_params, self.optimizer_spec)
28 | self.actor = ArgMaxPolicy(self.critic)
29 |
30 | lander = agent_params['env_name'].startswith('LunarLander')
31 | self.replay_buffer = MemoryOptimizedReplayBuffer(
32 | agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander)
33 | self.t = 0
34 | self.num_param_updates = 0
35 |
36 | def add_to_replay_buffer(self, paths):
37 | pass
38 |
39 | def step_env(self):
40 | """
41 | Step the env and store the transition
42 | At the end of this block of code, the simulator should have been
43 | advanced one step, and the replay buffer should contain one more transition.
44 | Note that self.last_obs must always point to the new latest observation.
45 | """
46 | raise NotImplementedError
47 | # Not needed for this homework
48 |
49 | ####################################
50 | ####################################
51 |
52 | def sample(self, batch_size):
53 | if self.replay_buffer.can_sample(self.batch_size):
54 | return self.replay_buffer.sample(batch_size)
55 | else:
56 | return [],[],[],[],[]
57 |
58 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n):
59 | raise NotImplementedError
60 | # Not needed for this homework
61 |
62 | ####################################
63 | ####################################
--------------------------------------------------------------------------------
/hw5/cs285/critics/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/hw5/cs285/critics/base_critic.py:
--------------------------------------------------------------------------------
1 | class BaseCritic(object):
2 | def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n):
3 | raise NotImplementedError
4 |
--------------------------------------------------------------------------------
/hw5/cs285/critics/bootstrapped_continuous_critic.py:
--------------------------------------------------------------------------------
1 | from .base_critic import BaseCritic
2 | from torch import nn
3 | from torch import optim
4 | import pdb
5 |
6 | from cs285.infrastructure import pytorch_util as ptu
7 |
8 |
9 | class BootstrappedContinuousCritic(nn.Module, BaseCritic):
10 | """
11 | Notes on notation:
12 |
13 | Prefixes and suffixes:
14 | ob - observation
15 | ac - action
16 | _no - this tensor should have shape (batch self.size /n/, observation dim)
17 | _na - this tensor should have shape (batch self.size /n/, action dim)
18 | _n - this tensor should have shape (batch self.size /n/)
19 |
20 | Note: batch self.size /n/ is defined at runtime.
21 |
22 | """
23 | def __init__(self, hparams):
24 | super().__init__()
25 | self.ob_dim = hparams['ob_dim']
26 | self.ac_dim = hparams['ac_dim']
27 | self.discrete = hparams['discrete']
28 | self.size = hparams['size']
29 | self.n_layers = hparams['n_layers']
30 | self.learning_rate = hparams['learning_rate']
31 |
32 | # critic parameters
33 | self.num_target_updates = hparams['num_target_updates']
34 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update']
35 | self.gamma = hparams['gamma']
36 | self.critic_network = ptu.build_mlp(
37 | self.ob_dim,
38 | 1,
39 | n_layers=self.n_layers,
40 | size=self.size,
41 | )
42 | self.critic_network.to(ptu.device)
43 | self.loss = nn.MSELoss()
44 | self.optimizer = optim.Adam(
45 | self.critic_network.parameters(),
46 | self.learning_rate,
47 | )
48 |
49 | def forward(self, obs):
50 | return self.critic_network(obs).squeeze(1)
51 |
52 | def forward_np(self, obs):
53 | obs = ptu.from_numpy(obs)
54 | predictions = self(obs)
55 | return ptu.to_numpy(predictions)
56 |
57 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
58 | """
59 | Update the parameters of the critic.
60 |
61 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from
62 | Agent.sample_trajectories
63 | let num_paths be the number of paths sampled from Agent.sample_trajectories
64 |
65 | arguments:
66 | ob_no: shape: (sum_of_path_lengths, ob_dim)
67 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
68 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
69 | the reward for each timestep
70 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
71 | at that timestep or 0 if the episode did not end
72 |
73 | returns:
74 | nothing
75 | """
76 | raise NotImplementedError
77 | # Not needed for this homework
78 |
79 | ####################################
80 | ####################################
81 |
--------------------------------------------------------------------------------
/hw5/cs285/critics/cql_critic.py:
--------------------------------------------------------------------------------
1 | from .base_critic import BaseCritic
2 | import torch
3 | import torch.optim as optim
4 | from torch.nn import utils
5 | from torch import nn
6 | import pdb
7 |
8 | from cs285.infrastructure import pytorch_util as ptu
9 |
10 |
11 | class CQLCritic(BaseCritic):
12 |
13 | def __init__(self, hparams, optimizer_spec, **kwargs):
14 | super().__init__(**kwargs)
15 | self.env_name = hparams['env_name']
16 | self.ob_dim = hparams['ob_dim']
17 |
18 | if isinstance(self.ob_dim, int):
19 | self.input_shape = (self.ob_dim,)
20 | else:
21 | self.input_shape = hparams['input_shape']
22 |
23 | self.ac_dim = hparams['ac_dim']
24 | self.double_q = hparams['double_q']
25 | self.grad_norm_clipping = hparams['grad_norm_clipping']
26 | self.gamma = hparams['gamma']
27 |
28 | self.optimizer_spec = optimizer_spec
29 | network_initializer = hparams['q_func']
30 | self.q_net = network_initializer(self.ob_dim, self.ac_dim)
31 | self.q_net_target = network_initializer(self.ob_dim, self.ac_dim)
32 | self.optimizer = self.optimizer_spec.constructor(
33 | self.q_net.parameters(),
34 | **self.optimizer_spec.optim_kwargs
35 | )
36 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR(
37 | self.optimizer,
38 | self.optimizer_spec.learning_rate_schedule,
39 | )
40 | self.loss = nn.MSELoss()
41 | self.q_net.to(ptu.device)
42 | self.q_net_target.to(ptu.device)
43 | self.cql_alpha = hparams['cql_alpha']
44 |
45 | def dqn_loss(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
46 | qa_t_values = self.q_net(ob_no)
47 | q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1)
48 | qa_tp1_values = self.q_net_target(next_ob_no)
49 |
50 | next_actions = self.q_net(next_ob_no).argmax(dim=1)
51 | q_tp1 = torch.gather(qa_tp1_values, 1, next_actions.unsqueeze(1)).squeeze(1)
52 |
53 | target = reward_n + self.gamma * q_tp1 * (1 - terminal_n)
54 | target = target.detach()
55 | loss = self.loss(q_t_values, target)
56 |
57 | return loss, qa_t_values, q_t_values
58 |
59 |
60 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
61 | """
62 | Update the parameters of the critic.
63 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from
64 | Agent.sample_trajectories
65 | let num_paths be the number of paths sampled from Agent.sample_trajectories
66 | arguments:
67 | ob_no: shape: (sum_of_path_lengths, ob_dim)
68 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
69 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
70 | the reward for each timestep
71 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
72 |                 at that timestep or 0 if the episode did not end
73 | returns:
74 | nothing
75 | """
76 | ob_no = ptu.from_numpy(ob_no)
77 | ac_na = ptu.from_numpy(ac_na).to(torch.long)
78 | next_ob_no = ptu.from_numpy(next_ob_no)
79 | reward_n = ptu.from_numpy(reward_n)
80 | terminal_n = ptu.from_numpy(terminal_n)
81 |
82 | loss, qa_t_values, q_t_values = self.dqn_loss(
83 | ob_no, ac_na, next_ob_no, reward_n, terminal_n
84 | )
85 |
86 | # CQL Implementation
87 | # TODO: Implement CQL as described in the pdf and paper
88 | # Hint: After calculating cql_loss, augment the loss appropriately
89 | cql_loss = None
90 |
91 | self.optimizer.zero_grad()
92 | loss.backward()
93 | self.optimizer.step()
94 |
95 | info = {'Training Loss': ptu.to_numpy(loss)}
96 |
97 | # TODO: Uncomment these lines after implementing CQL
98 | # info['CQL Loss'] = ptu.to_numpy(cql_loss)
99 | # info['Data q-values'] = ptu.to_numpy(q_t_values).mean()
100 | # info['OOD q-values'] = ptu.to_numpy(q_t_logsumexp).mean()
101 |
102 | return info
103 |
104 |
105 | def update_target_network(self):
106 | for target_param, param in zip(
107 | self.q_net_target.parameters(), self.q_net.parameters()
108 | ):
109 | target_param.data.copy_(param.data)
110 |
111 | def qa_values(self, obs):
112 | obs = ptu.from_numpy(obs)
113 | qa_values = self.q_net(obs)
114 | return ptu.to_numpy(qa_values)
115 |
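Note: the CQL TODO above leaves cql_loss unset and the extra info entries commented out. Below is a minimal sketch of one way to fill it in, following the hints in the comments (the names q_t_logsumexp and cql_loss come from those comments); it is an illustration, not the reference solution.

    # Sketch only: conservative Q-learning regularizer added on top of the DQN loss.
    # logsumexp over actions softly approximates the maximum Q-value over all actions;
    # penalizing it relative to the Q-values of the dataset actions keeps Q conservative.
    q_t_logsumexp = torch.logsumexp(qa_t_values, dim=1)
    cql_loss = (q_t_logsumexp - q_t_values).mean()
    loss = loss + self.cql_alpha * cql_loss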
--------------------------------------------------------------------------------
/hw5/cs285/critics/dqn_critic.py:
--------------------------------------------------------------------------------
1 | from .base_critic import BaseCritic
2 | import torch
3 | import torch.optim as optim
4 | from torch.nn import utils
5 | from torch import nn
6 | import pdb
7 |
8 | from cs285.infrastructure import pytorch_util as ptu
9 |
10 |
11 | class DQNCritic(BaseCritic):
12 |
13 | def __init__(self, hparams, optimizer_spec, **kwargs):
14 | super().__init__(**kwargs)
15 | self.env_name = hparams['env_name']
16 | self.ob_dim = hparams['ob_dim']
17 |
18 | if isinstance(self.ob_dim, int):
19 | self.input_shape = (self.ob_dim,)
20 | else:
21 | self.input_shape = hparams['input_shape']
22 |
23 | self.ac_dim = hparams['ac_dim']
24 | self.double_q = hparams['double_q']
25 | self.grad_norm_clipping = hparams['grad_norm_clipping']
26 | self.gamma = hparams['gamma']
27 |
28 | self.optimizer_spec = optimizer_spec
29 | network_initializer = hparams['q_func']
30 | self.q_net = network_initializer(self.ob_dim, self.ac_dim)
31 | self.q_net_target = network_initializer(self.ob_dim, self.ac_dim)
32 | self.optimizer = self.optimizer_spec.constructor(
33 | self.q_net.parameters(),
34 | **self.optimizer_spec.optim_kwargs
35 | )
36 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR(
37 | self.optimizer,
38 | self.optimizer_spec.learning_rate_schedule,
39 | )
40 | self.loss = nn.SmoothL1Loss() # AKA Huber loss
41 | self.q_net.to(ptu.device)
42 | self.q_net_target.to(ptu.device)
43 |
44 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n):
45 | """
46 | Update the parameters of the critic.
47 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from
48 | Agent.sample_trajectories
49 | let num_paths be the number of paths sampled from Agent.sample_trajectories
50 | arguments:
51 | ob_no: shape: (sum_of_path_lengths, ob_dim)
52 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward
53 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing
54 | the reward for each timestep
55 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended
56 |                 at that timestep or 0 if the episode did not end
57 | returns:
58 | nothing
59 | """
60 | raise NotImplementedError
61 | # TODO: Get this from homework 3
62 |
63 | ####################################
64 | ####################################
65 |
66 | def update_target_network(self):
67 | for target_param, param in zip(
68 | self.q_net_target.parameters(), self.q_net.parameters()
69 | ):
70 | target_param.data.copy_(param.data)
71 |
72 | def qa_values(self, obs):
73 | obs = ptu.from_numpy(obs)
74 | qa_values = self.q_net(obs)
75 | return ptu.to_numpy(qa_values)
76 |
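Note: update above raises NotImplementedError ("Get this from homework 3"). Below is a minimal sketch of what such an update could look like, reusing the same target construction as dqn_loss in cql_critic.py and the grad_norm_clipping hyperparameter stored in __init__; it is an illustration under those assumptions, not the reference hw3 solution.

    # Sketch only: one possible DQN critic update.
    ob_no = ptu.from_numpy(ob_no)
    ac_na = ptu.from_numpy(ac_na).to(torch.long)
    next_ob_no = ptu.from_numpy(next_ob_no)
    reward_n = ptu.from_numpy(reward_n)
    terminal_n = ptu.from_numpy(terminal_n)

    qa_t_values = self.q_net(ob_no)
    q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1)
    qa_tp1_values = self.q_net_target(next_ob_no)
    if self.double_q:
        # double DQN: select actions with the online net, evaluate with the target net
        next_actions = self.q_net(next_ob_no).argmax(dim=1)
    else:
        next_actions = qa_tp1_values.argmax(dim=1)
    q_tp1 = torch.gather(qa_tp1_values, 1, next_actions.unsqueeze(1)).squeeze(1)
    target = (reward_n + self.gamma * q_tp1 * (1 - terminal_n)).detach()

    loss = self.loss(q_t_values, target)
    self.optimizer.zero_grad()
    loss.backward()
    utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping)
    self.optimizer.step()
    return {'Training Loss': ptu.to_numpy(loss)}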
--------------------------------------------------------------------------------
/hw5/cs285/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from cs285.envs import ant
2 | from cs285.envs import cheetah
3 | from cs285.envs import obstacles
4 | from cs285.envs import reacher
--------------------------------------------------------------------------------
/hw5/cs285/envs/ant/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | register(
4 | id='ant-cs285-v0',
5 | entry_point='cs285.envs.ant:AntEnv',
6 | max_episode_steps=1000,
7 | )
8 | from cs285.envs.ant.ant import AntEnv
9 |
--------------------------------------------------------------------------------
/hw5/cs285/envs/box2d/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw5/cs285/envs/box2d/__init__.py
--------------------------------------------------------------------------------
/hw5/cs285/envs/cheetah/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | register(
4 | id='cheetah-cs285-v0',
5 | entry_point='cs285.envs.cheetah:HalfCheetahEnv',
6 | max_episode_steps=1000,
7 | )
8 | from cs285.envs.cheetah.cheetah import HalfCheetahEnv
9 |
--------------------------------------------------------------------------------
/hw5/cs285/envs/cheetah/cheetah.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import mujoco_py
3 | from gym import utils
4 | from gym.envs.mujoco import mujoco_env
5 |
6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
7 |
8 | def __init__(self):
9 |
10 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1)
11 | utils.EzPickle.__init__(self)
12 |
13 | self.skip = self.frame_skip
14 |
15 | self.action_dim = self.ac_dim = self.action_space.shape[0]
16 | self.observation_dim = self.obs_dim = self.observation_space.shape[0]
17 |
18 | def get_reward(self, observations, actions):
19 |
20 | """get reward/s of given (observations, actions) datapoint or datapoints
21 |
22 | Args:
23 | observations: (batchsize, obs_dim) or (obs_dim,)
24 | actions: (batchsize, ac_dim) or (ac_dim,)
25 |
26 | Return:
27 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,)
28 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,)
29 | """
30 |
31 | #initialize and reshape as needed, for batch mode
32 | self.reward_dict = {}
33 | if(len(observations.shape)==1):
34 | observations = np.expand_dims(observations, axis = 0)
35 | actions = np.expand_dims(actions, axis = 0)
36 | batch_mode = False
37 | else:
38 | batch_mode = True
39 |
40 | #get vars
41 | xvel = observations[:, 9].copy()
42 | body_angle = observations[:, 2].copy()
43 | front_leg = observations[:, 6].copy()
44 | front_shin = observations[:, 7].copy()
45 | front_foot = observations[:, 8].copy()
46 | zeros = np.zeros((observations.shape[0],)).copy()
47 |
48 | # ranges
49 | leg_range = 0.2
50 | shin_range = 0
51 | foot_range = 0
52 | penalty_factor = 10
53 |
54 | #calc rew
55 | self.reward_dict['run'] = xvel
56 |
57 | front_leg_rew = zeros.copy()
58 | front_leg_rew[front_leg>leg_range] = -penalty_factor
59 | self.reward_dict['leg'] = front_leg_rew
60 |
61 | front_shin_rew = zeros.copy()
62 | front_shin_rew[front_shin>shin_range] = -penalty_factor
63 | self.reward_dict['shin'] = front_shin_rew
64 |
65 | front_foot_rew = zeros.copy()
66 | front_foot_rew[front_foot>foot_range] = -penalty_factor
67 | self.reward_dict['foot'] = front_foot_rew
68 |
69 | # total reward
70 | self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot']
71 |
72 | #return
73 | dones = zeros.copy()
74 | if(not batch_mode):
75 | return self.reward_dict['r_total'][0], dones[0]
76 | return self.reward_dict['r_total'], dones
77 |
78 |
79 | def get_score(self, obs):
80 | xposafter = obs[0]
81 | return xposafter
82 |
83 | ##############################################
84 |
85 | def step(self, action):
86 |
87 | #step
88 | self.do_simulation(action, self.frame_skip)
89 |
90 | #obs/reward/done/score
91 | ob = self._get_obs()
92 | rew, done = self.get_reward(ob, action)
93 | score = self.get_score(ob)
94 |
95 | #return
96 | env_info = {'obs_dict': self.obs_dict,
97 | 'rewards': self.reward_dict,
98 | 'score': score}
99 | return ob, rew, done, env_info
100 |
101 | def _get_obs(self):
102 |
103 | self.obs_dict = {}
104 | self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy()
105 | self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy()
106 | self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy()
107 |
108 | return np.concatenate([
109 | self.obs_dict['joints_pos'], #9
110 | self.obs_dict['joints_vel'], #9
111 | self.obs_dict['com_torso'], #3
112 | ])
113 |
114 | ##############################################
115 |
116 | def reset_model(self, seed=None):
117 |
118 | # set reset pose/vel
119 | self.reset_pose = self.init_qpos + self.np_random.uniform(
120 | low=-.1, high=.1, size=self.model.nq)
121 | self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
122 |
123 | #reset the env to that pose/vel
124 | return self.do_reset(self.reset_pose.copy(), self.reset_vel.copy())
125 |
126 |
127 | def do_reset(self, reset_pose, reset_vel, reset_goal=None):
128 |
129 | #reset
130 | self.set_state(reset_pose, reset_vel)
131 |
132 | #return
133 | return self._get_obs()
134 |
--------------------------------------------------------------------------------
/hw5/cs285/envs/obstacles/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | register(
4 | id='obstacles-cs285-v0',
5 | entry_point='cs285.envs.obstacles:Obstacles',
6 | max_episode_steps=500,
7 | )
8 | from cs285.envs.obstacles.obstacles_env import Obstacles
9 |
--------------------------------------------------------------------------------
/hw5/cs285/envs/reacher/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 |
3 | register(
4 | id='reacher-cs285-v0',
5 | entry_point='cs285.envs.reacher:Reacher7DOFEnv',
6 | max_episode_steps=500,
7 | )
8 | from cs285.envs.reacher.reacher_env import Reacher7DOFEnv
9 |
--------------------------------------------------------------------------------
/hw5/cs285/envs/reacher/reacher_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from gym.envs.mujoco import mujoco_env
4 | from mujoco_py import MjViewer
5 | import os
6 |
7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle):
8 | def __init__(self):
9 |
10 | # placeholder
11 | self.hand_sid = -2
12 | self.target_sid = -1
13 |
14 | curr_dir = os.path.dirname(os.path.abspath(__file__))
15 | mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2)
16 | utils.EzPickle.__init__(self)
17 | self.observation_dim = 26
18 | self.action_dim = 7
19 |
20 | self.hand_sid = self.model.site_name2id("finger")
21 | self.target_sid = self.model.site_name2id("target")
22 | self.skip = self.frame_skip
23 |
24 |
25 | def _get_obs(self):
26 | return np.concatenate([
27 | self.data.qpos.flat, #[7]
28 | self.data.qvel.flatten() / 10., #[7]
29 | self.data.site_xpos[self.hand_sid], #[3]
30 | self.model.site_pos[self.target_sid], #[3]
31 | ])
32 |
33 | def step(self, a):
34 |
35 | self.do_simulation(a, self.frame_skip)
36 | ob = self._get_obs()
37 | reward, done = self.get_reward(ob, a)
38 |
39 | score = self.get_score(ob)
40 |
41 | # finalize step
42 | env_info = {'ob': ob,
43 | 'rewards': self.reward_dict,
44 | 'score': score}
45 |
46 | return ob, reward, done, env_info
47 |
48 | def get_score(self, obs):
49 | hand_pos = obs[-6:-3]
50 | target_pos = obs[-3:]
51 | score = -1*np.abs(hand_pos-target_pos)
52 | return score
53 |
54 | def get_reward(self, observations, actions):
55 |
56 | """get reward/s of given (observations, actions) datapoint or datapoints
57 |
58 | Args:
59 | observations: (batchsize, obs_dim) or (obs_dim,)
60 | actions: (batchsize, ac_dim) or (ac_dim,)
61 |
62 | Return:
63 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,)
64 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,)
65 | """
66 |
67 | #initialize and reshape as needed, for batch mode
68 | self.reward_dict = {}
69 | if(len(observations.shape)==1):
70 | observations = np.expand_dims(observations, axis = 0)
71 | actions = np.expand_dims(actions, axis = 0)
72 | batch_mode = False
73 | else:
74 | batch_mode = True
75 |
76 | #get vars
77 | hand_pos = observations[:, -6:-3]
78 | target_pos = observations[:, -3:]
79 |
80 | #calc rew
81 | dist = np.linalg.norm(hand_pos - target_pos, axis=1)
82 | self.reward_dict['r_total'] = -10*dist
83 |
84 | #done is always false for this env
85 | dones = np.zeros((observations.shape[0],))
86 |
87 | #return
88 | if(not batch_mode):
89 | return self.reward_dict['r_total'][0], dones[0]
90 | return self.reward_dict['r_total'], dones
91 |
92 | def reset(self):
93 | _ = self.reset_model()
94 |
95 | self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1]
96 |
97 | observation, _reward, done, _info = self.step(np.zeros(7))
98 | ob = self._get_obs()
99 |
100 | return ob
101 |
102 | def reset_model(self, seed=None):
103 | if seed is not None:
104 | self.seed(seed)
105 |
106 | self.reset_pose = self.init_qpos.copy()
107 | self.reset_vel = self.init_qvel.copy()
108 |
109 | self.reset_goal = np.zeros(3)
110 | self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3)
111 | self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2)
112 | self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25)
113 |
114 | return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal)
115 |
116 | def do_reset(self, reset_pose, reset_vel, reset_goal):
117 |
118 | self.set_state(reset_pose, reset_vel)
119 |
120 | #reset target
121 | self.reset_goal = reset_goal.copy()
122 | self.model.site_pos[self.target_sid] = self.reset_goal
123 | self.sim.forward()
124 |
125 | #return
126 | return self._get_obs()
--------------------------------------------------------------------------------
/hw5/cs285/exploration/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw5/cs285/exploration/__init__.py
--------------------------------------------------------------------------------
/hw5/cs285/exploration/base_exploration_model.py:
--------------------------------------------------------------------------------
1 | class BaseExplorationModel(object):
2 | def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n):
3 | raise NotImplementedError
--------------------------------------------------------------------------------
/hw5/cs285/exploration/rnd_model.py:
--------------------------------------------------------------------------------
1 | from cs285.infrastructure import pytorch_util as ptu
2 | from .base_exploration_model import BaseExplorationModel
3 | import torch.optim as optim
4 | from torch import nn
5 | import torch
6 |
7 | def init_method_1(model):
8 | model.weight.data.uniform_()
9 | model.bias.data.uniform_()
10 |
11 | def init_method_2(model):
12 | model.weight.data.normal_()
13 | model.bias.data.normal_()
14 |
15 |
16 | class RNDModel(nn.Module, BaseExplorationModel):
17 | def __init__(self, hparams, optimizer_spec, **kwargs):
18 | super().__init__(**kwargs)
19 | self.ob_dim = hparams['ob_dim']
20 | self.output_size = hparams['rnd_output_size']
21 | self.n_layers = hparams['rnd_n_layers']
22 | self.size = hparams['rnd_size']
23 | self.optimizer_spec = optimizer_spec
24 |
25 | # TODO: Create two neural networks:
26 | # 1) f, the random function we are trying to learn
27 | # 2) f_hat, the function we are using to learn f
28 | # WARNING: Make sure you use different types of weight
29 | # initializations for these two functions
30 |
31 | # HINT 1) Check out the method ptu.build_mlp
32 | # HINT 2) There are two weight init methods defined above
33 |
34 | self.f = None
35 | self.f_hat = None
36 |
37 | self.optimizer = self.optimizer_spec.constructor(
38 | self.f_hat.parameters(),
39 | **self.optimizer_spec.optim_kwargs
40 | )
41 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR(
42 | self.optimizer,
43 | self.optimizer_spec.learning_rate_schedule,
44 | )
45 |
46 | self.f.to(ptu.device)
47 | self.f_hat.to(ptu.device)
48 |
49 | def forward(self, ob_no):
50 | # TODO: Get the prediction error for ob_no
51 | # HINT: Remember to detach the output of self.f!
52 | error = None
53 | return error
54 |
55 | def forward_np(self, ob_no):
56 | ob_no = ptu.from_numpy(ob_no)
57 | error = self(ob_no)
58 | return ptu.to_numpy(error)
59 |
60 | def update(self, ob_no):
61 | # TODO: Update f_hat using ob_no
62 | # Hint: Take the mean prediction error across the batch
63 | loss = None
64 | return loss.item()
65 |
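Note: the two networks and the forward/update TODOs above are left for the student. Below is a minimal sketch consistent with the hints in the comments (ptu.build_mlp plus the two init methods defined at the top of the file); it is an illustration, not the reference solution.

    # Sketch only. In __init__: a fixed random target f and a trainable predictor f_hat,
    # built with different weight initializations so their outputs differ on unseen states.
    self.f = ptu.build_mlp(self.ob_dim, self.output_size,
                           n_layers=self.n_layers, size=self.size,
                           init_method=init_method_1)
    self.f_hat = ptu.build_mlp(self.ob_dim, self.output_size,
                               n_layers=self.n_layers, size=self.size,
                               init_method=init_method_2)

    # In forward(ob_no): per-state prediction error, detaching the random target
    # so gradients only flow into f_hat.
    error = torch.norm(self.f_hat(ob_no) - self.f(ob_no).detach(), dim=1)
    return error

    # In update(ob_no): regress f_hat toward f by minimizing the mean prediction error.
    loss = self(ptu.from_numpy(ob_no)).mean()
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return loss.item()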
--------------------------------------------------------------------------------
/hw5/cs285/infrastructure/colab_utils.py:
--------------------------------------------------------------------------------
1 | from gym.wrappers import Monitor
2 | import glob
3 | import io
4 | import base64
5 | from IPython.display import HTML
6 | from IPython import display as ipythondisplay
7 |
8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI
9 |
10 | def show_video():
11 | mp4list = glob.glob('/content/video/*.mp4')
12 | if len(mp4list) > 0:
13 | mp4 = mp4list[0]
14 | video = io.open(mp4, 'r+b').read()
15 | encoded = base64.b64encode(video)
16 |         ipythondisplay.display(HTML(data='''<video alt="test" autoplay
17 |                     loop controls style="height: 400px;">
18 |                     <source src="data:video/mp4;base64,{0}" type="video/mp4" />
19 |                 </video>'''.format(encoded.decode('ascii'))))
20 | else:
21 | print("Could not find video")
22 |
23 |
24 | def wrap_env(env):
25 | env = Monitor(env, '/content/video', force=True)
26 | return env
27 |
--------------------------------------------------------------------------------
/hw5/cs285/infrastructure/logger.py:
--------------------------------------------------------------------------------
1 | import os
2 | from tensorboardX import SummaryWriter
3 | import numpy as np
4 |
5 | class Logger:
6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None):
7 | self._log_dir = log_dir
8 | print('########################')
9 | print('logging outputs to ', log_dir)
10 | print('########################')
11 | self._n_logged_samples = n_logged_samples
12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1)
13 |
14 | def log_scalar(self, scalar, name, step_):
15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_)
16 |
17 | def log_scalars(self, scalar_dict, group_name, step, phase):
18 | """Will log all scalars in the same plot."""
19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step)
20 |
21 | def log_image(self, image, name, step):
22 | assert(len(image.shape) == 3) # [C, H, W]
23 | self._summ_writer.add_image('{}'.format(name), image, step)
24 |
25 | def log_video(self, video_frames, name, step, fps=10):
26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps)
28 |
29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'):
30 |
31 | # reshape the rollouts
32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths]
33 |
34 | # max rollout length
35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)])
36 | max_length = videos[0].shape[0]
37 | for i in range(max_videos_to_save):
38 | if videos[i].shape[0]>max_length:
39 | max_length = videos[i].shape[0]
40 |
41 | # pad rollouts to all be same length
42 | for i in range(max_videos_to_save):
43 |             if videos[i].shape[0]<max_length:
44 |                 padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1))
45 |                 videos[i] = np.concatenate([videos[i], padding], 0)
46 |
47 |         # log videos to tensorboard event file
48 |         videos = np.stack(videos[:max_videos_to_save], 0)
49 |         self.log_video(videos, video_title, step, fps=fps)
50 |
51 |     def log_figures(self, figure, name, step, phase):
52 |         """figure: matplotlib.pyplot figure handle"""
53 |         assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
55 |
56 | def log_figure(self, figure, name, step, phase):
57 | """figure: matplotlib.pyplot figure handle"""
58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step)
59 |
60 | def log_graph(self, array, name, step, phase):
61 | """figure: matplotlib.pyplot figure handle"""
62 | im = plot_graph(array)
63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step)
64 |
65 | def dump_scalars(self, log_path=None):
66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path
67 | self._summ_writer.export_scalars_to_json(log_path)
68 |
69 | def flush(self):
70 | self._summ_writer.flush()
71 |
72 |
73 |
74 |
75 |
--------------------------------------------------------------------------------
/hw5/cs285/infrastructure/pytorch_util.py:
--------------------------------------------------------------------------------
1 | from typing import Union
2 |
3 | import torch
4 | from torch import nn
5 |
6 | Activation = Union[str, nn.Module]
7 |
8 |
9 | _str_to_activation = {
10 | 'relu': nn.ReLU(),
11 | 'tanh': nn.Tanh(),
12 | 'leaky_relu': nn.LeakyReLU(),
13 | 'sigmoid': nn.Sigmoid(),
14 | 'selu': nn.SELU(),
15 | 'softplus': nn.Softplus(),
16 | 'identity': nn.Identity(),
17 | }
18 |
19 |
20 | def build_mlp(
21 | input_size: int,
22 | output_size: int,
23 | n_layers: int,
24 | size: int,
25 | activation: Activation = 'tanh',
26 | output_activation: Activation = 'identity',
27 | init_method=None,
28 | ):
29 | """
30 | Builds a feedforward neural network
31 | arguments:
32 |         input_size: size of the input layer
33 |         output_size: size of the output layer
34 |         n_layers: number of hidden layers
35 |         size: dimension of each hidden layer
36 |         activation: activation of each hidden layer
37 |         output_activation: activation of the output layer
38 |         init_method: optional weight-initialization function applied to each
39 |             linear layer (default: PyTorch's standard initialization)
40 |     returns:
41 |         an nn.Sequential module: the hidden layers followed by the output layer
42 | """
43 | if isinstance(activation, str):
44 | activation = _str_to_activation[activation]
45 | if isinstance(output_activation, str):
46 | output_activation = _str_to_activation[output_activation]
47 | layers = []
48 | in_size = input_size
49 | for _ in range(n_layers):
50 | curr_layer = nn.Linear(in_size, size)
51 | if init_method is not None:
52 | curr_layer.apply(init_method)
53 | layers.append(curr_layer)
54 | layers.append(activation)
55 | in_size = size
56 |
57 | last_layer = nn.Linear(in_size, output_size)
58 | if init_method is not None:
59 | last_layer.apply(init_method)
60 |
61 | layers.append(last_layer)
62 | layers.append(output_activation)
63 |
64 | return nn.Sequential(*layers)
65 |
66 |
67 | device = None
68 |
69 |
70 | def init_gpu(use_gpu=True, gpu_id=0):
71 | global device
72 | if torch.cuda.is_available() and use_gpu:
73 | device = torch.device("cuda:" + str(gpu_id))
74 | print("Using GPU id {}".format(gpu_id))
75 | else:
76 | device = torch.device("cpu")
77 | print("GPU not detected. Defaulting to CPU.")
78 |
79 |
80 | def set_device(gpu_id):
81 | torch.cuda.set_device(gpu_id)
82 |
83 |
84 | def from_numpy(*args, **kwargs):
85 | return torch.from_numpy(*args, **kwargs).float().to(device)
86 |
87 |
88 | def to_numpy(tensor):
89 | return tensor.to('cpu').detach().numpy()
90 |
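Note: a short usage example of build_mlp as defined above (dimensions are illustrative):

    # Two hidden layers of width 64 mapping 17-dim observations to 6-dim outputs.
    net = build_mlp(input_size=17, output_size=6, n_layers=2, size=64,
                    activation='relu')
    x = torch.zeros(32, 17)   # batch of 32 observations
    y = net(x)                # y.shape == (32, 6)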
--------------------------------------------------------------------------------
/hw5/cs285/infrastructure/replay_buffer.py:
--------------------------------------------------------------------------------
1 | from cs285.infrastructure.utils import *
2 |
3 |
4 | class ReplayBuffer(object):
5 |
6 | def __init__(self, max_size=1000000):
7 |
8 | self.max_size = max_size
9 | self.paths = []
10 | self.obs = None
11 | self.acs = None
12 | self.concatenated_rews = None
13 | self.unconcatenated_rews = None
14 | self.next_obs = None
15 | self.terminals = None
16 |
17 | def add_rollouts(self, paths, noised=False):
18 |
19 | # add new rollouts into our list of rollouts
20 | for path in paths:
21 | tpath = dict()
22 | # print (path.keys())
23 | tpath['observation'] = path['observations']
24 | tpath['next_observation'] = path['next_observations']
25 | tpath['reward'] = path['rewards']
26 | tpath['action'] = path['actions']
27 | tpath['terminal'] = path['terminals']
28 | self.paths.append(tpath)
29 |
30 | # convert new rollouts into their component arrays, and append them onto our arrays
31 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(self.paths)
32 |
33 | if noised:
34 | observations = add_noise(observations)
35 | next_observations = add_noise(next_observations)
36 |
37 | if self.obs is None:
38 | self.obs = observations[-self.max_size:]
39 | self.acs = actions[-self.max_size:]
40 | self.next_obs = next_observations[-self.max_size:]
41 | self.terminals = terminals[-self.max_size:]
42 | self.concatenated_rews = concatenated_rews[-self.max_size:]
43 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:]
44 | else:
45 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:]
46 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:]
47 | self.next_obs = np.concatenate(
48 | [self.next_obs, next_observations]
49 | )[-self.max_size:]
50 | self.terminals = np.concatenate(
51 | [self.terminals, terminals]
52 | )[-self.max_size:]
53 | self.concatenated_rews = np.concatenate(
54 | [self.concatenated_rews, concatenated_rews]
55 | )[-self.max_size:]
56 | if isinstance(unconcatenated_rews, list):
57 | self.unconcatenated_rews += unconcatenated_rews # TODO keep only latest max_size around
58 | else:
59 | self.unconcatenated_rews.append(unconcatenated_rews) # TODO keep only latest max_size around
60 |
61 | print (self.terminals.sum())
62 | ########################################
63 | ########################################
64 |
65 | def sample_random_rollouts(self, num_rollouts):
66 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts]
67 |         return [self.paths[i] for i in rand_indices]  # self.paths is a list, so index elementwise
68 |
69 | def sample_recent_rollouts(self, num_rollouts=1):
70 | return self.paths[-num_rollouts:]
71 |
72 | def can_sample(self, batch_size):
73 | # print (self.obs.shape[0])
74 | if self.obs.shape[0] > batch_size:
75 | return True
76 | else:
77 | return False
78 |
79 | ########################################
80 | ########################################
81 |
82 | def sample_random_data(self, batch_size):
83 |
84 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0]
85 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size]
86 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices]
87 |
88 | def sample(self, batch_size):
89 | return self.sample_random_data(batch_size)
90 |
91 | def sample_recent_data(self, batch_size=1, concat_rew=True):
92 |
93 | if concat_rew:
94 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:]
95 | else:
96 | num_recent_rollouts_to_return = 0
97 | num_datapoints_so_far = 0
98 | index = -1
99 | while num_datapoints_so_far < batch_size:
100 | recent_rollout = self.paths[index]
101 | index -=1
102 | num_recent_rollouts_to_return +=1
103 | num_datapoints_so_far += get_pathlength(recent_rollout)
104 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:]
105 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return)
106 | return observations, actions, unconcatenated_rews, next_observations, terminals
107 |
--------------------------------------------------------------------------------
/hw5/cs285/policies/MLP_policy.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import itertools
3 | from torch import nn
4 | from torch.nn import functional as F
5 | from torch import optim
6 |
7 | import numpy as np
8 | import torch
9 | from torch import distributions
10 |
11 | from cs285.infrastructure import pytorch_util as ptu
12 | from cs285.policies.base_policy import BasePolicy
13 |
14 |
15 | class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta):
16 |
17 | def __init__(self,
18 | ac_dim,
19 | ob_dim,
20 | n_layers,
21 | size,
22 | discrete=False,
23 | learning_rate=1e-4,
24 | training=True,
25 | nn_baseline=False,
26 | **kwargs
27 | ):
28 | super().__init__(**kwargs)
29 |
30 | # init vars
31 | self.ac_dim = ac_dim
32 | self.ob_dim = ob_dim
33 | self.n_layers = n_layers
34 | self.discrete = discrete
35 | self.size = size
36 | self.learning_rate = learning_rate
37 | self.training = training
38 | self.nn_baseline = nn_baseline
39 |
40 | if self.discrete:
41 | self.logits_na = ptu.build_mlp(input_size=self.ob_dim,
42 | output_size=self.ac_dim,
43 | n_layers=self.n_layers,
44 | size=self.size)
45 | self.logits_na.to(ptu.device)
46 | self.mean_net = None
47 | self.logstd = None
48 | self.optimizer = optim.Adam(self.logits_na.parameters(),
49 | self.learning_rate)
50 | else:
51 | self.logits_na = None
52 | self.mean_net = ptu.build_mlp(input_size=self.ob_dim,
53 | output_size=self.ac_dim,
54 | n_layers=self.n_layers, size=self.size)
55 | self.logstd = nn.Parameter(
56 | torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device)
57 | )
58 | self.mean_net.to(ptu.device)
59 | self.logstd.to(ptu.device)
60 | self.optimizer = optim.Adam(
61 | itertools.chain([self.logstd], self.mean_net.parameters()),
62 | self.learning_rate
63 | )
64 |
65 | if nn_baseline:
66 | self.baseline = ptu.build_mlp(
67 | input_size=self.ob_dim,
68 | output_size=1,
69 | n_layers=self.n_layers,
70 | size=self.size,
71 | )
72 | self.baseline.to(ptu.device)
73 | self.baseline_optimizer = optim.Adam(
74 | self.baseline.parameters(),
75 | self.learning_rate,
76 | )
77 | else:
78 | self.baseline = None
79 |
80 | ##################################
81 |
82 | def save(self, filepath):
83 | torch.save(self.state_dict(), filepath)
84 |
85 | ##################################
86 |
87 | # query the policy with observation(s) to get selected action(s)
88 | def get_action(self, obs: np.ndarray) -> np.ndarray:
89 | raise NotImplementedError
90 | # TODO: get this from hw1
91 |
92 | ####################################
93 | ####################################
94 |
95 | # update/train this policy
96 | def update(self, observations, actions, **kwargs):
97 | raise NotImplementedError
98 |
99 | # This function defines the forward pass of the network.
100 | # You can return anything you want, but you should be able to differentiate
101 | # through it. For example, you can return a torch.FloatTensor. You can also
102 | # return more flexible objects, such as a
103 | # `torch.distributions.Distribution` object. It's up to you!
104 | def forward(self, observation: torch.FloatTensor):
105 | raise NotImplementedError
106 | # TODO: get this from hw1
107 |
108 | ####################################
109 | ####################################
110 |
111 |
112 | #####################################################
113 | #####################################################
114 |
115 |
116 | class MLPPolicyAC(MLPPolicy):
117 | # MJ: cut acs_labels_na and qvals from the signature if they are not used
118 | def update(
119 | self, observations, actions,
120 | adv_n=None, acs_labels_na=None, qvals=None
121 | ):
122 | raise NotImplementedError
123 | # Not needed for this homework
124 |
125 | ####################################
126 | ####################################
127 |
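Note: get_action and forward above are left as hw1 TODOs. Below is a minimal sketch of one possible pair of method bodies, using the imports already at the top of the file; it is an illustration, not the reference solution.

    # Sketch only: sample an action from the policy distribution.
    def get_action(self, obs: np.ndarray) -> np.ndarray:
        observation = obs if len(obs.shape) > 1 else obs[None]
        action_distribution = self(ptu.from_numpy(observation))
        return ptu.to_numpy(action_distribution.sample())

    # Sketch only: return a torch.distributions.Distribution over actions.
    def forward(self, observation: torch.FloatTensor):
        if self.discrete:
            return distributions.Categorical(logits=self.logits_na(observation))
        mean = self.mean_net(observation)
        return distributions.Normal(mean, torch.exp(self.logstd)[None])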
--------------------------------------------------------------------------------
/hw5/cs285/policies/argmax_policy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pdb
3 |
4 |
5 | class ArgMaxPolicy(object):
6 |
7 | def __init__(self, critic):
8 | self.critic = critic
9 |
10 | def set_critic(self, critic):
11 | self.critic = critic
12 |
13 | def get_action(self, obs):
14 | # MJ: changed the dimension check to a 3
15 | if len(obs.shape) > 3:
16 | observation = obs
17 | else:
18 | observation = obs[None]
19 |
20 | raise NotImplementedError
21 | # TODO: get this from hw3
22 |
23 | ####################################
24 | ####################################
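Note: the TODO above ("get this from hw3") amounts to acting greedily with respect to the critic's Q-values. A minimal sketch, assuming the critic exposes qa_values as in dqn_critic.py (an illustration, not the reference solution):

    # Sketch only: greedy action selection from the critic's Q-values.
    qa_values = self.critic.qa_values(observation)  # numpy array, shape (batch, num_actions)
    actions = qa_values.argmax(axis=1)
    return actions.squeeze()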
--------------------------------------------------------------------------------
/hw5/cs285/policies/base_policy.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import numpy as np
3 |
4 |
5 | class BasePolicy(object, metaclass=abc.ABCMeta):
6 | def get_action(self, obs: np.ndarray) -> np.ndarray:
7 | raise NotImplementedError
8 |
9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict:
10 | """Return a dictionary of logging information."""
11 | raise NotImplementedError
12 |
13 | def save(self, filepath: str):
14 | raise NotImplementedError
15 |
--------------------------------------------------------------------------------
/hw5/cs285/scripts/read_results.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import tensorflow as tf
3 |
4 | def get_section_results(file):
5 | """
6 | requires tensorflow==1.12.0
7 | """
8 | X = []
9 | Y = []
10 | for e in tf.train.summary_iterator(file):
11 | for v in e.summary.value:
12 | if v.tag == 'Train_EnvstepsSoFar':
13 | X.append(v.simple_value)
14 | elif v.tag == 'Eval_AverageReturn':
15 | Y.append(v.simple_value)
16 | return X, Y
17 |
18 | if __name__ == '__main__':
19 | import glob
20 |
21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*'
22 | eventfile = glob.glob(logdir)[0]
23 |
24 | X, Y = get_section_results(eventfile)
25 | for i, (x, y) in enumerate(zip(X, Y)):
26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y))
--------------------------------------------------------------------------------
/hw5/cs285/scripts/run_hw5_expl.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 |
4 | from cs285.infrastructure.rl_trainer import RL_Trainer
5 | from cs285.agents.explore_or_exploit_agent import ExplorationOrExploitationAgent
6 | from cs285.infrastructure.dqn_utils import get_env_kwargs, PiecewiseSchedule, ConstantSchedule
7 |
8 |
9 | class Q_Trainer(object):
10 |
11 | def __init__(self, params):
12 | self.params = params
13 |
14 | train_args = {
15 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'],
16 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'],
17 | 'train_batch_size': params['batch_size'],
18 | 'double_q': params['double_q'],
19 | }
20 |
21 | env_args = get_env_kwargs(params['env_name'])
22 |
23 | self.agent_params = {**train_args, **env_args, **params}
24 |
25 | self.params['agent_class'] = ExplorationOrExploitationAgent
26 | self.params['agent_params'] = self.agent_params
27 | self.params['train_batch_size'] = params['batch_size']
28 | self.params['env_wrappers'] = self.agent_params['env_wrappers']
29 |
30 | self.rl_trainer = RL_Trainer(self.params)
31 |
32 | def run_training_loop(self):
33 | self.rl_trainer.run_training_loop(
34 | self.agent_params['num_timesteps'],
35 | collect_policy = self.rl_trainer.agent.actor,
36 | eval_policy = self.rl_trainer.agent.actor,
37 | )
38 |
39 | def main():
40 |
41 | import argparse
42 | parser = argparse.ArgumentParser()
43 | parser.add_argument(
44 | '--env_name',
45 | default='PointmassHard-v0',
46 | choices=('PointmassEasy-v0', 'PointmassMedium-v0', 'PointmassHard-v0', 'PointmassVeryHard-v0')
47 | )
48 |
49 | parser.add_argument('--exp_name', type=str, default='todo')
50 |
51 | parser.add_argument('--eval_batch_size', type=int, default=1000)
52 | parser.add_argument('--batch_size', type=int, default=256)
53 |
54 | parser.add_argument('--use_rnd', action='store_true')
55 | parser.add_argument('--num_exploration_steps', type=int, default=10000)
56 | parser.add_argument('--unsupervised_exploration', action='store_true')
57 |
58 | parser.add_argument('--offline_exploitation', action='store_true')
59 | parser.add_argument('--cql_alpha', type=float, default=0.0)
60 |
61 | parser.add_argument('--exploit_rew_shift', type=float, default=0.0)
62 | parser.add_argument('--exploit_rew_scale', type=float, default=1.0)
63 |
64 | parser.add_argument('--rnd_output_size', type=int, default=5)
65 | parser.add_argument('--rnd_n_layers', type=int, default=2)
66 | parser.add_argument('--rnd_size', type=int, default=400)
67 |
68 | parser.add_argument('--seed', type=int, default=2)
69 | parser.add_argument('--no_gpu', '-ngpu', action='store_true')
70 | parser.add_argument('--which_gpu', '-gpu_id', default=0)
71 | parser.add_argument('--scalar_log_freq', type=int, default=int(1e3))
72 | parser.add_argument('--save_params', action='store_true')
73 |
74 | args = parser.parse_args()
75 |
76 | # convert to dictionary
77 | params = vars(args)
78 | params['double_q'] = True
79 | params['num_agent_train_steps_per_iter'] = 1
80 | params['num_critic_updates_per_agent_update'] = 1
81 | params['exploit_weight_schedule'] = ConstantSchedule(1.0)
82 | params['video_log_freq'] = -1 # This param is not used for DQN
83 | params['num_timesteps'] = 50000
84 | params['learning_starts'] = 2000
85 | params['eps'] = 0.2
86 | ##################################
87 | ### CREATE DIRECTORY FOR LOGGING
88 | ##################################
89 |
90 | if params['env_name']=='PointmassEasy-v0':
91 | params['ep_len']=50
92 | if params['env_name']=='PointmassMedium-v0':
93 | params['ep_len']=150
94 | if params['env_name']=='PointmassHard-v0':
95 | params['ep_len']=100
96 | if params['env_name']=='PointmassVeryHard-v0':
97 | params['ep_len']=200
98 |
99 | if params['use_rnd']:
100 | params['explore_weight_schedule'] = PiecewiseSchedule([(0,1), (params['num_exploration_steps'], 0)], outside_value=0.0)
101 | else:
102 | params['explore_weight_schedule'] = ConstantSchedule(0.0)
103 |
104 | if params['unsupervised_exploration']:
105 | params['explore_weight_schedule'] = ConstantSchedule(1.0)
106 | params['exploit_weight_schedule'] = ConstantSchedule(0.0)
107 |
108 | if not params['use_rnd']:
109 | params['learning_starts'] = params['num_exploration_steps']
110 |
111 |
112 | logdir_prefix = 'hw5_expl_' # keep for autograder
113 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data')
114 |
115 | if not (os.path.exists(data_path)):
116 | os.makedirs(data_path)
117 |
118 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
119 | logdir = os.path.join(data_path, logdir)
120 | params['logdir'] = logdir
121 | if not(os.path.exists(logdir)):
122 | os.makedirs(logdir)
123 |
124 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n")
125 |
126 | trainer = Q_Trainer(params)
127 | trainer.run_training_loop()
128 |
129 |
130 | if __name__ == "__main__":
131 | main()
132 |
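Note: an example invocation using the flags defined above (the exp_name and environment choice are arbitrary):

    python cs285/scripts/run_hw5_expl.py --env_name PointmassEasy-v0 \
        --use_rnd --unsupervised_exploration --exp_name q1_env1_rnd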
--------------------------------------------------------------------------------
/hw5/hw5.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw5/hw5.pdf
--------------------------------------------------------------------------------
/hw5/requirements.txt:
--------------------------------------------------------------------------------
1 | gym==0.17.2
2 | mujoco-py==2.0.2.2
3 | tensorboard==2.3.0
4 | tensorboardX==1.8
5 | matplotlib==2.2.2
6 | ipython==6.4.0
7 | moviepy==1.0.0
8 | pyvirtualdisplay==1.3.2
9 | torch==1.5.1
10 | opencv-python==4.4.0.42
11 | networkx==2.5
12 | ipdb==0.13.3
13 | box2d-py
14 |
--------------------------------------------------------------------------------
/hw5/requirements_colab.txt:
--------------------------------------------------------------------------------
1 | gym==0.17.2
2 | tensorboard==2.3.0
3 | tensorboardX==1.8
4 | matplotlib==2.2.2
5 | ipython==6.4.0
6 | moviepy==1.0.0
7 | pyvirtualdisplay==1.3.2
8 | torch==1.5.1
9 | opencv-python==4.4.0.42
10 | networkx==2.5
11 | ipdb==0.13.3
12 | box2d-py
13 |
--------------------------------------------------------------------------------
/hw5/setup.py:
--------------------------------------------------------------------------------
1 | # setup.py
2 | from setuptools import setup
3 |
4 | setup(
5 | name='cs285',
6 | version='0.1.0',
7 | packages=['cs285'],
8 | )
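Note: as with the other homework packages in this repo, the package is typically installed in editable mode from the hw5 directory so that the cs285 imports resolve (assumed workflow, based on this setup.py and the requirements.txt above):

    pip install -r requirements.txt
    pip install -e .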
--------------------------------------------------------------------------------