├── .gitignore ├── README.md ├── hw1 ├── README.md ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── base_agent.py │ │ └── bc_agent.py │ ├── expert_data │ │ ├── expert_data_Ant-v2.pkl │ │ ├── expert_data_HalfCheetah-v2.pkl │ │ ├── expert_data_Hopper-v2.pkl │ │ ├── expert_data_Humanoid-v2.pkl │ │ └── expert_data_Walker2d-v2.pkl │ ├── infrastructure │ │ ├── __init__.py │ │ ├── colab_utils.py │ │ ├── logger.py │ │ ├── pytorch_util.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ ├── base_policy.py │ │ ├── experts │ │ │ ├── Ant.pkl │ │ │ ├── HalfCheetah.pkl │ │ │ ├── Hopper.pkl │ │ │ ├── Humanoid.pkl │ │ │ └── Walker2d.pkl │ │ └── loaded_gaussian_policy.py │ └── scripts │ │ ├── run_hw1.ipynb │ │ └── run_hw1.py ├── cs285_hw1.pdf ├── installation.md ├── requirements.txt ├── requirements_colab.txt └── setup.py ├── hw2 ├── README.md ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── base_agent.py │ │ └── pg_agent.py │ ├── infrastructure │ │ ├── __init__.py │ │ ├── colab_utils.py │ │ ├── logger.py │ │ ├── pytorch_util.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ └── base_policy.py │ └── scripts │ │ ├── read_results.py │ │ ├── run_hw2.ipynb │ │ └── run_hw2.py ├── cs285_hw2.pdf ├── requirements.txt ├── requirements_colab.txt └── setup.py ├── hw3 ├── README.md ├── cs285 │ ├── agents │ │ ├── ac_agent.py │ │ ├── base_agent.py │ │ └── dqn_agent.py │ ├── critics │ │ ├── __init__.py │ │ ├── base_critic.py │ │ ├── bootstrapped_continuous_critic.py │ │ └── dqn_critic.py │ ├── envs │ │ ├── __init__.py │ │ └── box2d │ │ │ ├── __init__.py │ │ │ └── lunar_lander.py │ ├── infrastructure │ │ ├── atari_wrappers.py │ │ ├── colab_utils.py │ │ ├── dqn_utils.py │ │ ├── logger.py │ │ ├── pytorch_util.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── argmax_policy.py │ │ └── base_policy.py │ └── scripts │ │ ├── read_results.py │ │ ├── run_hw3_actor_critic.ipynb │ │ ├── run_hw3_actor_critic.py │ │ ├── run_hw3_dqn.ipynb │ │ └── run_hw3_dqn.py ├── cs285_hw3.pdf ├── requirements.txt ├── requirements_colab.txt └── setup.py ├── hw4 ├── README.md ├── cs285 │ ├── agents │ │ ├── base_agent.py │ │ └── mb_agent.py │ ├── envs │ │ ├── __init__.py │ │ ├── cheetah │ │ │ ├── __init__.py │ │ │ └── cheetah.py │ │ ├── obstacles │ │ │ ├── __init__.py │ │ │ └── obstacles_env.py │ │ └── reacher │ │ │ ├── __init__.py │ │ │ ├── assets │ │ │ └── sawyer.xml │ │ │ └── reacher_env.py │ ├── infrastructure │ │ ├── colab_utils.py │ │ ├── logger.py │ │ ├── pytorch_util.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── models │ │ ├── base_model.py │ │ └── ff_model.py │ ├── policies │ │ ├── MPC_policy.py │ │ └── base_policy.py │ └── scripts │ │ ├── filter_events.py │ │ ├── read_results.py │ │ ├── run_hw4_mb.ipynb │ │ └── run_hw4_mb.py ├── cs285_hw4.pdf ├── requirements.txt ├── requirements_colab.txt └── setup.py └── hw5 ├── README.md ├── cs285 ├── agents │ ├── ac_agent.py │ ├── base_agent.py │ ├── dqn_agent.py │ └── explore_or_exploit_agent.py ├── critics │ ├── __init__.py │ ├── base_critic.py │ ├── bootstrapped_continuous_critic.py │ ├── cql_critic.py │ └── dqn_critic.py ├── envs │ ├── __init__.py │ ├── ant │ │ ├── __init__.py │ │ └── ant.py │ ├── box2d │ │ ├── __init__.py │ │ └── lunar_lander.py │ ├── cheetah │ │ ├── __init__.py │ │ └── cheetah.py │ ├── obstacles │ │ ├── __init__.py │ │ └── obstacles_env.py │ 
├── pointmass │ │ └── pointmass.py │ └── reacher │ │ ├── __init__.py │ │ ├── assets │ │ └── sawyer.xml │ │ └── reacher_env.py ├── exploration │ ├── __init__.py │ ├── base_exploration_model.py │ └── rnd_model.py ├── infrastructure │ ├── atari_wrappers.py │ ├── colab_utils.py │ ├── dqn_utils.py │ ├── logger.py │ ├── pytorch_util.py │ ├── replay_buffer.py │ ├── rl_trainer.py │ └── utils.py ├── policies │ ├── MLP_policy.py │ ├── argmax_policy.py │ └── base_policy.py └── scripts │ ├── read_results.py │ └── run_hw5_expl.py ├── hw5.pdf ├── requirements.txt ├── requirements_colab.txt ├── run_hw5_expl.ipynb └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | data/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Assignments for [Berkeley CS 285: Deep Reinforcement Learning, Decision Making, and Control](http://rail.eecs.berkeley.edu/deeprlcourse/). 2 | -------------------------------------------------------------------------------- /hw1/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](installation.md) for instructions. 6 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below: 7 | 8 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw1/cs285/scripts/run_hw1.ipynb) 9 | 10 | ## Complete the code 11 | 12 | Fill in sections marked with `TODO`. In particular, see 13 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 14 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 15 | - [infrastructure/replay_buffer.py](cs285/infrastructure/replay_buffer.py) 16 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 17 | - [infrastructure/pytorch_util.py](cs285/infrastructure/pytorch_util.py) 18 | 19 | Look for sections marked with `HW1` to see how the edits you make will be used. 20 | Some other files that you may find relevant: 21 | - [scripts/run_hw1.py](cs285/scripts/run_hw1.py) (if running locally) or [scripts/run_hw1.ipynb](cs285/scripts/run_hw1.ipynb) (if running on Colab) 22 | - [agents/bc_agent.py](cs285/agents/bc_agent.py) 23 | 24 | See the homework PDF for more details. 25 | 26 | ## Run the code 27 | 28 | Tip: While debugging, you probably want to keep the flag `--video_log_freq -1`, which will disable video logging and speed up the experiment. However, feel free to remove it to save videos of your awesome policy! 29 | 30 | If running on Colab, adjust the `#@params` in the `Args` class according to the command line arguments above. 31 | 32 | ### Section 1 (Behavior Cloning) 33 | Command for problem 1: 34 | 35 | ``` 36 | python cs285/scripts/run_hw1.py \ 37 | --expert_policy_file cs285/policies/experts/Ant.pkl \ 38 | --env_name Ant-v2 --exp_name bc_ant --n_iter 1 \ 39 | --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ 40 | --video_log_freq -1 41 | ``` 42 | 43 | Make sure to also try another environment. 44 | See the homework PDF for more details on what else you need to run.
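If you want to sanity-check an expert dataset before training, a few lines of Python are enough. The snippet below is illustrative only (it is not part of the starter code) and assumes the `.pkl` file stores a pickled list of rollout dictionaries with the keys produced by `Path()` in `cs285/infrastructure/utils.py`:

```python
# Hypothetical sanity check for an expert dataset (not part of the assignment code).
# Assumes the .pkl file stores a list of rollout dicts with keys such as
# "observation", "action", and "reward".
import pickle

with open("cs285/expert_data/expert_data_Ant-v2.pkl", "rb") as f:
    paths = pickle.load(f)

print(len(paths), "expert rollouts")
# Print the array shape stored under each key of the first rollout.
print({key: value.shape for key, value in paths[0].items() if hasattr(value, "shape")})
```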
45 | To generate videos of the policy, remove the `--video_log_freq -1` flag. 46 | 47 | ### Section 2 (DAgger) 48 | Command for section 2: 49 | (Note the `--do_dagger` flag, and the higher value for `n_iter`) 50 | 51 | ``` 52 | python cs285/scripts/run_hw1.py \ 53 | --expert_policy_file cs285/policies/experts/Ant.pkl \ 54 | --env_name Ant-v2 --exp_name dagger_ant --n_iter 10 \ 55 | --do_dagger --expert_data cs285/expert_data/expert_data_Ant-v2.pkl \ 56 | --video_log_freq -1 57 | ``` 58 | 59 | Make sure to also try another environment. 60 | See the homework PDF for more details on what else you need to run. 61 | 62 | ## Visualizing the saved tensorboard event file 63 | 64 | You can visualize your runs using tensorboard: 65 | ``` 66 | tensorboard --logdir data 67 | ``` 68 | 69 | You will see scalar summaries as well as videos of your trained policies (in the 'images' tab). 70 | 71 | You can choose to visualize specific runs with a comma-separated list: 72 | ``` 73 | tensorboard --logdir data/run1,data/run2,data/run3... 74 | ``` 75 | 76 | If running on Colab, you will be using the `%tensorboard` [line magic](https://ipython.readthedocs.io/en/stable/interactive/magics.html) to do the same thing; see the [notebook](cs285/scripts/run_hw1.ipynb) for more details. 77 | 78 | -------------------------------------------------------------------------------- /hw1/cs285/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/agents/__init__.py -------------------------------------------------------------------------------- /hw1/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | 2 | class BaseAgent(object): 3 | def __init__(self, **kwargs): 4 | super(BaseAgent, self).__init__(**kwargs) 5 | 6 | def train(self) -> dict: 7 | """Return a dictionary of logging information.""" 8 | raise NotImplementedError 9 | 10 | def add_to_replay_buffer(self, paths): 11 | raise NotImplementedError 12 | 13 | def sample(self, batch_size): 14 | raise NotImplementedError 15 | 16 | def save(self, path): 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /hw1/cs285/agents/bc_agent.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.replay_buffer import ReplayBuffer 2 | from cs285.policies.MLP_policy import MLPPolicySL 3 | from .base_agent import BaseAgent 4 | 5 | 6 | class BCAgent(BaseAgent): 7 | def __init__(self, env, agent_params): 8 | super(BCAgent, self).__init__() 9 | 10 | # init vars 11 | self.env = env 12 | self.agent_params = agent_params 13 | 14 | # actor/policy 15 | self.actor = MLPPolicySL( 16 | self.agent_params['ac_dim'], 17 | self.agent_params['ob_dim'], 18 | self.agent_params['n_layers'], 19 | self.agent_params['size'], 20 | discrete=self.agent_params['discrete'], 21 | learning_rate=self.agent_params['learning_rate'], 22 | ) 23 | 24 | # replay buffer 25 | self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size']) 26 | 27 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 28 | # training a BC agent refers to updating its actor using 29 | # the given observations and corresponding action labels 30 | log = self.actor.update(ob_no, ac_na) # HW1: you will modify this 31 | return log 32 | 33 | def add_to_replay_buffer(self, paths):
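# Paths collected by the RL trainer (the loaded expert rollouts for behavior cloning, or the relabeled on-policy rollouts for DAgger) are appended to the replay buffer here; `sample` below then draws training batches from it.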
34 | self.replay_buffer.add_rollouts(paths) 35 | 36 | def sample(self, batch_size): 37 | return self.replay_buffer.sample_random_data(batch_size) # HW1: you will modify this 38 | 39 | def save(self, path): 40 | return self.actor.save(path) -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Ant-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Ant-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Hopper-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Hopper-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/infrastructure/__init__.py -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 | ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/logger.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | ) -> nn.Module: 28 | """ 29 | Builds a feedforward neural network 30 | 31 | arguments: 32 | n_layers: number of hidden layers 33 | size: dimension of each hidden layer 34 | activation: activation of each hidden layer 35 | 36 | input_size: size of the input layer 37 | output_size: size of the output layer 38 | output_activation: activation of the output layer 39 | 40 | returns: 41 | MLP (nn.Module) 42 | """ 43 | if isinstance(activation, str): 44 | activation = _str_to_activation[activation] 45 | if isinstance(output_activation, str): 46 | output_activation = _str_to_activation[output_activation] 47 | 48 | # TODO: return a MLP. This should be an instance of nn.Module 49 | # Note: nn.Sequential is an instance of nn.Module. 50 | raise NotImplementedError 51 | 52 | 53 | device = None 54 | 55 | 56 | def init_gpu(use_gpu=True, gpu_id=0): 57 | global device 58 | if torch.cuda.is_available() and use_gpu: 59 | device = torch.device("cuda:" + str(gpu_id)) 60 | print("Using GPU id {}".format(gpu_id)) 61 | else: 62 | device = torch.device("cpu") 63 | print("GPU not detected. 
Defaulting to CPU.") 64 | 65 | 66 | def set_device(gpu_id): 67 | torch.cuda.set_device(gpu_id) 68 | 69 | 70 | def from_numpy(*args, **kwargs): 71 | return torch.from_numpy(*args, **kwargs).float().to(device) 72 | 73 | 74 | def to_numpy(tensor): 75 | return tensor.to('cpu').detach().numpy() 76 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | 10 | # store each rollout 11 | self.paths = [] 12 | 13 | # store (concatenated) component arrays from each rollout 14 | self.obs = None 15 | self.acs = None 16 | self.rews = None 17 | self.next_obs = None 18 | self.terminals = None 19 | 20 | def __len__(self): 21 | if self.obs: 22 | return self.obs.shape[0] 23 | else: 24 | return 0 25 | 26 | def add_rollouts(self, paths, concat_rew=True): 27 | 28 | # add new rollouts into our list of rollouts 29 | for path in paths: 30 | self.paths.append(path) 31 | 32 | # convert new rollouts into their component arrays, and append them onto 33 | # our arrays 34 | observations, actions, rewards, next_observations, terminals = ( 35 | convert_listofrollouts(paths, concat_rew)) 36 | 37 | if self.obs is None: 38 | self.obs = observations[-self.max_size:] 39 | self.acs = actions[-self.max_size:] 40 | self.rews = rewards[-self.max_size:] 41 | self.next_obs = next_observations[-self.max_size:] 42 | self.terminals = terminals[-self.max_size:] 43 | else: 44 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 45 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 46 | if concat_rew: 47 | self.rews = np.concatenate( 48 | [self.rews, rewards] 49 | )[-self.max_size:] 50 | else: 51 | if isinstance(rewards, list): 52 | self.rews += rewards 53 | else: 54 | self.rews.append(rewards) 55 | self.rews = self.rews[-self.max_size:] 56 | self.next_obs = np.concatenate( 57 | [self.next_obs, next_observations] 58 | )[-self.max_size:] 59 | self.terminals = np.concatenate( 60 | [self.terminals, terminals] 61 | )[-self.max_size:] 62 | 63 | ######################################## 64 | ######################################## 65 | 66 | def sample_random_data(self, batch_size): 67 | assert ( 68 | self.obs.shape[0] 69 | == self.acs.shape[0] 70 | == self.rews.shape[0] 71 | == self.next_obs.shape[0] 72 | == self.terminals.shape[0] 73 | ) 74 | 75 | ## TODO return batch_size number of random entries from each of the 5 component arrays above 76 | ## HINT 1: use np.random.permutation to sample random indices 77 | ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array) 78 | ## HINT 3: look at the sample_recent_data function below 79 | 80 | return TODO, TODO, TODO, TODO, TODO 81 | 82 | def sample_recent_data(self, batch_size=1): 83 | return ( 84 | self.obs[-batch_size:], 85 | self.acs[-batch_size:], 86 | self.rews[-batch_size:], 87 | self.next_obs[-batch_size:], 88 | self.terminals[-batch_size:], 89 | ) 90 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | ############################################ 5 | ############################################ 6 | 7 | def 
sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 8 | 9 | # initialize env for the beginning of a new rollout 10 | ob = TODO # HINT: should be the output of resetting the env 11 | 12 | # init vars 13 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 14 | steps = 0 15 | while True: 16 | 17 | # render image of the simulated env 18 | if render: 19 | if 'rgb_array' in render_mode: 20 | if hasattr(env, 'sim'): 21 | image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) 22 | else: 23 | image_obs.append(env.render(mode=render_mode)) 24 | if 'human' in render_mode: 25 | env.render(mode=render_mode) 26 | time.sleep(env.model.opt.timestep) 27 | 28 | # use the most recent ob to decide what to do 29 | obs.append(ob) 30 | ac = TODO # HINT: query the policy's get_action function 31 | ac = ac[0] 32 | acs.append(ac) 33 | 34 | # take that action and record results 35 | ob, rew, done, _ = env.step(ac) 36 | 37 | # record result of taking that action 38 | steps += 1 39 | next_obs.append(ob) 40 | rewards.append(rew) 41 | 42 | # TODO end the rollout if the rollout ended 43 | # HINT: rollout can end due to done, or due to max_path_length 44 | rollout_done = TODO # HINT: this is either 0 or 1 45 | terminals.append(rollout_done) 46 | 47 | if rollout_done: 48 | break 49 | 50 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 51 | 52 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 53 | """ 54 | Collect rollouts until we have collected min_timesteps_per_batch steps. 55 | 56 | TODO implement this function 57 | Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths 58 | Hint2: use get_pathlength to count the timesteps collected in each path 59 | """ 60 | timesteps_this_batch = 0 61 | paths = [] 62 | while timesteps_this_batch < min_timesteps_per_batch: 63 | 64 | TODO 65 | 66 | return paths, timesteps_this_batch 67 | 68 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 69 | """ 70 | Collect ntraj rollouts. 71 | 72 | TODO implement this function 73 | Hint1: use sample_trajectory to get each path (i.e. 
rollout) that goes into paths 74 | """ 75 | paths = [] 76 | 77 | TODO 78 | 79 | return paths 80 | 81 | ############################################ 82 | ############################################ 83 | 84 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 85 | """ 86 | Take info (separate arrays) from a single rollout 87 | and return it in a single dictionary 88 | """ 89 | if image_obs != []: 90 | image_obs = np.stack(image_obs, axis=0) 91 | return {"observation" : np.array(obs, dtype=np.float32), 92 | "image_obs" : np.array(image_obs, dtype=np.uint8), 93 | "reward" : np.array(rewards, dtype=np.float32), 94 | "action" : np.array(acs, dtype=np.float32), 95 | "next_observation": np.array(next_obs, dtype=np.float32), 96 | "terminal": np.array(terminals, dtype=np.float32)} 97 | 98 | 99 | def convert_listofrollouts(paths, concat_rew=True): 100 | """ 101 | Take a list of rollout dictionaries 102 | and return separate arrays, 103 | where each array is a concatenation of that array from across the rollouts 104 | """ 105 | observations = np.concatenate([path["observation"] for path in paths]) 106 | actions = np.concatenate([path["action"] for path in paths]) 107 | if concat_rew: 108 | rewards = np.concatenate([path["reward"] for path in paths]) 109 | else: 110 | rewards = [path["reward"] for path in paths] 111 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 112 | terminals = np.concatenate([path["terminal"] for path in paths]) 113 | return observations, actions, rewards, next_observations, terminals 114 | 115 | ############################################ 116 | ############################################ 117 | 118 | def get_pathlength(path): 119 | return len(path["reward"]) -------------------------------------------------------------------------------- /hw1/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import itertools 3 | from typing import Any 4 | from torch import nn 5 | from torch.nn import functional as F 6 | from torch import optim 7 | 8 | import numpy as np 9 | import torch 10 | from torch import distributions 11 | 12 | from cs285.infrastructure import pytorch_util as ptu 13 | from cs285.policies.base_policy import BasePolicy 14 | 15 | 16 | class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): 17 | 18 | def __init__(self, 19 | ac_dim, 20 | ob_dim, 21 | n_layers, 22 | size, 23 | discrete=False, 24 | learning_rate=1e-4, 25 | training=True, 26 | nn_baseline=False, 27 | **kwargs 28 | ): 29 | super().__init__(**kwargs) 30 | 31 | # init vars 32 | self.ac_dim = ac_dim 33 | self.ob_dim = ob_dim 34 | self.n_layers = n_layers 35 | self.discrete = discrete 36 | self.size = size 37 | self.learning_rate = learning_rate 38 | self.training = training 39 | self.nn_baseline = nn_baseline 40 | 41 | if self.discrete: 42 | self.logits_na = ptu.build_mlp( 43 | input_size=self.ob_dim, 44 | output_size=self.ac_dim, 45 | n_layers=self.n_layers, 46 | size=self.size, 47 | ) 48 | self.logits_na.to(ptu.device) 49 | self.mean_net = None 50 | self.logstd = None 51 | self.optimizer = optim.Adam(self.logits_na.parameters(), 52 | self.learning_rate) 53 | else: 54 | self.logits_na = None 55 | self.mean_net = ptu.build_mlp( 56 | input_size=self.ob_dim, 57 | output_size=self.ac_dim, 58 | n_layers=self.n_layers, size=self.size, 59 | ) 60 | self.mean_net.to(ptu.device) 61 | self.logstd = nn.Parameter( 62 | torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) 63 | ) 64 | 
self.logstd.to(ptu.device) 65 | self.optimizer = optim.Adam( 66 | itertools.chain([self.logstd], self.mean_net.parameters()), 67 | self.learning_rate 68 | ) 69 | 70 | ################################## 71 | 72 | def save(self, filepath): 73 | torch.save(self.state_dict(), filepath) 74 | 75 | ################################## 76 | 77 | def get_action(self, obs: np.ndarray) -> np.ndarray: 78 | if len(obs.shape) > 1: 79 | observation = obs 80 | else: 81 | observation = obs[None] 82 | 83 | # TODO return the action that the policy prescribes 84 | raise NotImplementedError 85 | 86 | # update/train this policy 87 | def update(self, observations, actions, **kwargs): 88 | raise NotImplementedError 89 | 90 | # This function defines the forward pass of the network. 91 | # You can return anything you want, but you should be able to differentiate 92 | # through it. For example, you can return a torch.FloatTensor. You can also 93 | # return more flexible objects, such as a 94 | # `torch.distributions.Distribution` object. It's up to you! 95 | def forward(self, observation: torch.FloatTensor) -> Any: 96 | raise NotImplementedError 97 | 98 | 99 | ##################################################### 100 | ##################################################### 101 | 102 | class MLPPolicySL(MLPPolicy): 103 | def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs): 104 | super().__init__(ac_dim, ob_dim, n_layers, size, **kwargs) 105 | self.loss = nn.MSELoss() 106 | 107 | def update( 108 | self, observations, actions, 109 | adv_n=None, acs_labels_na=None, qvals=None 110 | ): 111 | # TODO: update the policy and return the loss 112 | loss = TODO 113 | return { 114 | # You can add extra logging information here, but keep this line 115 | 'Training Loss': ptu.to_numpy(loss), 116 | } 117 | -------------------------------------------------------------------------------- /hw1/cs285/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/__init__.py -------------------------------------------------------------------------------- /hw1/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Ant.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Ant.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/HalfCheetah.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/HalfCheetah.pkl 
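Referring back to the `TODO`s in `cs285/policies/MLP_policy.py` above (`forward`, `get_action`, and `MLPPolicySL.update`): the following is only a hedged sketch of one way those methods could be filled in, relying on the attributes already defined in `__init__` (`logits_na`, `mean_net`, `logstd`, `self.loss`, `self.optimizer`) and on the imports at the top of that file. It is not the official solution, and other designs (for example, returning a plain tensor from `forward` instead of a distribution) are equally valid.

```python
# Illustrative sketch only -- one possible completion of the MLPPolicy TODOs.

def forward(self, observation: torch.FloatTensor) -> Any:
    if self.discrete:
        # Categorical over discrete actions, parameterized by network logits.
        return distributions.Categorical(logits=self.logits_na(observation))
    else:
        # Diagonal Gaussian: state-dependent mean, learned state-independent std.
        mean = self.mean_net(observation)
        return distributions.Normal(mean, torch.exp(self.logstd))

def get_action(self, obs: np.ndarray) -> np.ndarray:
    observation = obs if len(obs.shape) > 1 else obs[None]
    dist = self(ptu.from_numpy(observation.astype(np.float32)))
    return ptu.to_numpy(dist.sample())

# MLPPolicySL.update: behavior cloning regresses predicted actions onto expert actions.
def update(self, observations, actions, adv_n=None, acs_labels_na=None, qvals=None):
    observations = ptu.from_numpy(observations)
    actions = ptu.from_numpy(actions)
    predicted_actions = self.mean_net(observations)  # hw1 tasks are continuous control
    loss = self.loss(predicted_actions, actions)     # nn.MSELoss from __init__
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    return {'Training Loss': ptu.to_numpy(loss)}
```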
-------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Hopper.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Hopper.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Humanoid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Humanoid.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Walker2d.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285/policies/experts/Walker2d.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/loaded_gaussian_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure import pytorch_util as ptu 4 | from .base_policy import BasePolicy 5 | from torch import nn 6 | import torch 7 | import pickle 8 | 9 | 10 | def create_linear_layer(W, b) -> nn.Linear: 11 | out_features, in_features = W.shape 12 | linear_layer = nn.Linear( 13 | in_features, 14 | out_features, 15 | ) 16 | linear_layer.weight.data = ptu.from_numpy(W.T) 17 | linear_layer.bias.data = ptu.from_numpy(b[0]) 18 | return linear_layer 19 | 20 | 21 | def read_layer(l): 22 | assert list(l.keys()) == ['AffineLayer'] 23 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 24 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer'][ 25 | 'b'].astype(np.float32) 26 | 27 | 28 | class LoadedGaussianPolicy(BasePolicy, nn.Module): 29 | def __init__(self, filename, **kwargs): 30 | super().__init__(**kwargs) 31 | 32 | with open(filename, 'rb') as f: 33 | data = pickle.loads(f.read()) 34 | 35 | self.nonlin_type = data['nonlin_type'] 36 | if self.nonlin_type == 'lrelu': 37 | self.non_lin = nn.LeakyReLU(0.01) 38 | elif self.nonlin_type == 'tanh': 39 | self.non_lin = nn.Tanh() 40 | else: 41 | raise NotImplementedError() 42 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 43 | 44 | assert policy_type == 'GaussianPolicy', ( 45 | 'Policy type {} not supported'.format(policy_type) 46 | ) 47 | self.policy_params = data[policy_type] 48 | 49 | assert set(self.policy_params.keys()) == { 50 | 'logstdevs_1_Da', 'hidden', 'obsnorm', 'out' 51 | } 52 | 53 | # Build the policy. First, observation normalization. 
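# The expert checkpoint's Standardizer stores the observation mean and mean-square; the code below recovers the standard deviation as sqrt(max(0, E[x^2] - E[x]^2)) and registers both statistics as module parameters.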
54 | assert list(self.policy_params['obsnorm'].keys()) == ['Standardizer'] 55 | obsnorm_mean = self.policy_params['obsnorm']['Standardizer']['mean_1_D'] 56 | obsnorm_meansq = self.policy_params['obsnorm']['Standardizer'][ 57 | 'meansq_1_D'] 58 | obsnorm_stdev = np.sqrt( 59 | np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 60 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 61 | 62 | self.obs_norm_mean = nn.Parameter(ptu.from_numpy(obsnorm_mean)) 63 | self.obs_norm_std = nn.Parameter(ptu.from_numpy(obsnorm_stdev)) 64 | self.hidden_layers = nn.ModuleList() 65 | 66 | # Hidden layers next 67 | assert list(self.policy_params['hidden'].keys()) == ['FeedforwardNet'] 68 | layer_params = self.policy_params['hidden']['FeedforwardNet'] 69 | for layer_name in sorted(layer_params.keys()): 70 | l = layer_params[layer_name] 71 | W, b = read_layer(l) 72 | linear_layer = create_linear_layer(W, b) 73 | self.hidden_layers.append(linear_layer) 74 | 75 | # Output layer 76 | W, b = read_layer(self.policy_params['out']) 77 | self.output_layer = create_linear_layer(W, b) 78 | 79 | def forward(self, obs): 80 | normed_obs = (obs - self.obs_norm_mean) / (self.obs_norm_std + 1e-6) 81 | h = normed_obs 82 | for layer in self.hidden_layers: 83 | h = layer(h) 84 | h = self.non_lin(h) 85 | return self.output_layer(h) 86 | 87 | ################################## 88 | 89 | def update(self, obs_no, acs_na, adv_n=None, acs_labels_na=None): 90 | raise NotImplementedError(""" 91 | This policy class simply loads in a particular type of policy and 92 | queries it. Do not try to train it. 93 | """) 94 | 95 | def get_action(self, obs): 96 | if len(obs.shape) > 1: 97 | observation = obs 98 | else: 99 | observation = obs[None, :] 100 | observation = ptu.from_numpy(observation.astype(np.float32)) 101 | action = self(observation) 102 | return ptu.to_numpy(action) 103 | 104 | def save(self, filepath): 105 | torch.save(self.state_dict(), filepath) 106 | -------------------------------------------------------------------------------- /hw1/cs285_hw1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw1/cs285_hw1.pdf -------------------------------------------------------------------------------- /hw1/installation.md: -------------------------------------------------------------------------------- 1 | ## Install mujoco: 2 | ``` 3 | mkdir ~/.mujoco 4 | cd ~/.mujoco 5 | wget https://www.roboti.us/download/mujoco200_linux.zip 6 | unzip mujoco200_linux.zip 7 | mv mujoco200_linux mujoco200 8 | rm mujoco200_linux.zip 9 | cp . 10 | ``` 11 | The above instructions download MuJoCo for Linux. If you are on Mac or Windows, you will need to change the `wget` address to either 12 | `https://www.roboti.us/download/mujoco200_macos.zip` or `https://www.roboti.us/download/mujoco200_win64.zip`. 13 | 14 | Finally, add the following to bottom of your bashrc: 15 | ``` 16 | export LD_LIBRARY_PATH=~/.mujoco/mujoco200/bin/ 17 | ``` 18 | 19 | ## Install other dependencies 20 | 21 | 22 | There are two options: 23 | 24 | A. (Recommended) Install with conda: 25 | 26 | 1. Install conda, if you don't already have it, by following the instructions at [this link](https://docs.conda.io/projects/conda/en/latest/user-guide/install/) 27 | 28 | ``` 29 | 30 | This install will modify the `PATH` variable in your bashrc. 
31 | You need to open a new terminal for that path change to take place (to be able to find 'conda' in the next step). 32 | 33 | 2. Create a conda environment that will contain python 3: 34 | ``` 35 | conda create -n cs285 python=3.6 36 | ``` 37 | 38 | 3. activate the environment (do this every time you open a new terminal and want to run code): 39 | ``` 40 | source activate cs285 41 | ``` 42 | 43 | 4. Install the requirements into this conda environment 44 | ``` 45 | pip install --user -r requirements.txt 46 | ``` 47 | 48 | 5. Allow your code to be able to see 'cs285' 49 | ``` 50 | cd 51 | $ pip install -e . 52 | ``` 53 | 54 | This conda environment requires activating it every time you open a new terminal (in order to run code), but the benefit is that the required dependencies for this codebase will not affect existing/other versions of things on your computer. This stand-alone environment will have everything that is necessary. 55 | 56 | 57 | B. Install on system Python: 58 | ``` 59 | pip install -r requirements.txt 60 | ``` -------------------------------------------------------------------------------- /hw1/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.17.2 2 | mujoco-py==2.0.2.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | torch==1.6.0 10 | opencv-python==4.4.0.42 11 | ipdb==0.13.3 12 | box2d-py 13 | -------------------------------------------------------------------------------- /hw1/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | gym==0.17.2 2 | tensorboard==2.3.0 3 | tensorboardX==1.8 4 | matplotlib==2.2.2 5 | ipython==6.4.0 6 | moviepy==1.0.0 7 | pyvirtualdisplay==1.3.2 8 | torch==1.6.0 9 | opencv-python==4.4.0.42 10 | ipdb==0.13.3 11 | box2d-py 12 | -------------------------------------------------------------------------------- /hw1/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw2/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. If you completed this installation for homework 1, you do not need to repeat it. 6 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below: 7 | 8 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw2/cs285/scripts/run_hw2.ipynb) 9 | 10 | ## Complete the code 11 | 12 | The following files have blanks to be filled with your solutions from homework 1. The relevant sections are marked with "TODO: get this from hw1". 
13 | 14 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 15 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 16 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 17 | 18 | You will then need to complete the following new files for homework 2. The relevant sections are marked with "TODO". 19 | - [agents/pg_agent.py](cs285/agents/pg_agent.py) 20 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 21 | 22 | You will also want to look through [scripts/run_hw2.py](cs285/scripts/run_hw2.py) (if running locally) or [scripts/run_hw2.ipynb](cs285/scripts/run_hw2.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook. 23 | 24 | You will be running your policy gradients implementation in four experiments total, investigating the effects of design decisions like reward-to-go estimators, neural network baselines for variance reduction, and advantage normalization. See the [assignment PDF](cs285_hw2.pdf) for more details. 25 | 26 | ## Plotting your results 27 | 28 | We have provided a snippet that may be used for reading your Tensorboard eventfiles in [scripts/read_results.py](cs285/scripts/read_results.py). Reading these eventfiles and plotting them with [matplotlib](https://matplotlib.org/) or [seaborn](https://seaborn.pydata.org/) will produce the cleanest results for your submission. For debugging purposes, we recommend visualizing the Tensorboard logs using `tensorboard --logdir data`. 29 | -------------------------------------------------------------------------------- /hw2/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_agent import BaseAgent 2 | from .pg_agent import PGAgent 3 | 4 | -------------------------------------------------------------------------------- /hw2/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | class BaseAgent(object): 2 | def __init__(self, **kwargs): 3 | super(BaseAgent, self).__init__(**kwargs) 4 | 5 | def train(self) -> dict: 6 | """Return a dictionary of logging information.""" 7 | raise NotImplementedError 8 | 9 | def add_to_replay_buffer(self, paths): 10 | raise NotImplementedError 11 | 12 | def sample(self, batch_size): 13 | raise NotImplementedError 14 | 15 | def save(self, path): 16 | raise NotImplementedError -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw2/cs285/infrastructure/__init__.py -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 |
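# Display the recorded rollout inline: the base64-encoded mp4 is embedded in an HTML5 video tag and rendered in the notebook output.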
ipythondisplay.display(HTML(data=''''''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | ): 28 | """ 29 | Builds a feedforward neural network 30 | 31 | arguments: 32 | input_placeholder: placeholder variable for the state (batch_size, input_size) 33 | scope: variable scope of the network 34 | 35 | n_layers: number of hidden layers 36 | size: dimension of each hidden layer 37 | activation: activation of each hidden layer 38 | 39 | input_size: size of the input layer 40 | output_size: size of the output layer 41 | output_activation: activation of the output layer 42 | 43 | returns: 44 | output_placeholder: the result of a forward pass through the hidden layers + the output layer 45 | """ 46 | if isinstance(activation, str): 47 | activation = _str_to_activation[activation] 48 | if isinstance(output_activation, str): 49 | output_activation = _str_to_activation[output_activation] 50 | layers = [] 51 | in_size = input_size 52 | for _ in range(n_layers): 53 | layers.append(nn.Linear(in_size, size)) 54 | layers.append(activation) 55 | in_size = size 56 | layers.append(nn.Linear(in_size, output_size)) 57 | layers.append(output_activation) 58 | return nn.Sequential(*layers) 59 | 60 | 61 | device = None 62 | 63 | 64 | def init_gpu(use_gpu=True, gpu_id=0): 65 | global device 66 | if torch.cuda.is_available() and use_gpu: 67 | device = torch.device("cuda:" + str(gpu_id)) 68 | print("Using GPU id {}".format(gpu_id)) 69 | else: 70 | device = torch.device("cpu") 71 | print("GPU not detected. 
Defaulting to CPU.") 72 | 73 | 74 | def set_device(gpu_id): 75 | torch.cuda.set_device(gpu_id) 76 | 77 | 78 | def from_numpy(*args, **kwargs): 79 | return torch.from_numpy(*args, **kwargs).float().to(device) 80 | 81 | 82 | def to_numpy(tensor): 83 | return tensor.to('cpu').detach().numpy() 84 | -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | self.paths = [] 10 | self.obs = None 11 | self.acs = None 12 | self.concatenated_rews = None 13 | self.unconcatenated_rews = None 14 | self.next_obs = None 15 | self.terminals = None 16 | 17 | def add_rollouts(self, paths, noised=False): 18 | 19 | # add new rollouts into our list of rollouts 20 | for path in paths: 21 | self.paths.append(path) 22 | 23 | # convert new rollouts into their component arrays, and append them onto our arrays 24 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 25 | 26 | if noised: 27 | observations = add_noise(observations) 28 | next_observations = add_noise(next_observations) 29 | 30 | if self.obs is None: 31 | self.obs = observations[-self.max_size:] 32 | self.acs = actions[-self.max_size:] 33 | self.next_obs = next_observations[-self.max_size:] 34 | self.terminals = terminals[-self.max_size:] 35 | self.concatenated_rews = concatenated_rews[-self.max_size:] 36 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 37 | else: 38 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 39 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 40 | self.next_obs = np.concatenate( 41 | [self.next_obs, next_observations] 42 | )[-self.max_size:] 43 | self.terminals = np.concatenate( 44 | [self.terminals, terminals] 45 | )[-self.max_size:] 46 | self.concatenated_rews = np.concatenate( 47 | [self.concatenated_rews, concatenated_rews] 48 | )[-self.max_size:] 49 | if isinstance(unconcatenated_rews, list): 50 | self.unconcatenated_rews += unconcatenated_rews # TODO keep only latest max_size around 51 | else: 52 | self.unconcatenated_rews.append(unconcatenated_rews) # TODO keep only latest max_size around 53 | 54 | ######################################## 55 | ######################################## 56 | 57 | def sample_random_rollouts(self, num_rollouts): 58 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 59 | return self.paths[rand_indices] 60 | 61 | def sample_recent_rollouts(self, num_rollouts=1): 62 | return self.paths[-num_rollouts:] 63 | 64 | ######################################## 65 | ######################################## 66 | 67 | def sample_random_data(self, batch_size): 68 | 69 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 70 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 71 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 72 | 73 | def sample_recent_data(self, batch_size=1, concat_rew=True): 74 | 75 | if concat_rew: 76 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], 
self.terminals[-batch_size:] 77 | else: 78 | num_recent_rollouts_to_return = 0 79 | num_datapoints_so_far = 0 80 | index = -1 81 | while num_datapoints_so_far < batch_size: 82 | recent_rollout = self.paths[index] 83 | index -=1 84 | num_recent_rollouts_to_return +=1 85 | num_datapoints_so_far += get_pathlength(recent_rollout) 86 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 87 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 88 | return observations, actions, unconcatenated_rews, next_observations, terminals -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import copy 4 | 5 | ############################################ 6 | ############################################ 7 | 8 | def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): 9 | 10 | model = models[0] 11 | 12 | # true 13 | true_states = perform_actions(env, action_sequence)['observation'] 14 | 15 | # predicted 16 | ob = np.expand_dims(true_states[0],0) 17 | pred_states = [] 18 | for ac in action_sequence: 19 | pred_states.append(ob) 20 | action = np.expand_dims(ac,0) 21 | ob = model.get_prediction(ob, action, data_statistics) 22 | pred_states = np.squeeze(pred_states) 23 | 24 | # mpe 25 | mpe = mean_squared_error(pred_states, true_states) 26 | 27 | return mpe, true_states, pred_states 28 | 29 | def perform_actions(env, actions): 30 | ob = env.reset() 31 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 32 | steps = 0 33 | for ac in actions: 34 | obs.append(ob) 35 | acs.append(ac) 36 | ob, rew, done, _ = env.step(ac) 37 | # add the observation after taking a step to next_obs 38 | next_obs.append(ob) 39 | rewards.append(rew) 40 | steps += 1 41 | # If the episode ended, the corresponding terminal value is 1 42 | # otherwise, it is 0 43 | if done: 44 | terminals.append(1) 45 | break 46 | else: 47 | terminals.append(0) 48 | 49 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 50 | 51 | def mean_squared_error(a, b): 52 | return np.mean((a-b)**2) 53 | 54 | ############################################ 55 | ############################################ 56 | 57 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 58 | # TODO: get this from hw1 59 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 60 | 61 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 62 | # TODO: get this from hw1 63 | return paths, timesteps_this_batch 64 | 65 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 66 | # TODO: get this from hw1 67 | return paths 68 | 69 | ############################################ 70 | ############################################ 71 | 72 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 73 | """ 74 | Take info (separate arrays) from a single rollout 75 | and return it in a single dictionary 76 | """ 77 | if image_obs != []: 78 | image_obs = np.stack(image_obs, axis=0) 79 | return {"observation" : np.array(obs, dtype=np.float32), 80 | "image_obs" : np.array(image_obs, dtype=np.uint8), 81 | "reward" : np.array(rewards, dtype=np.float32), 82 | "action" : np.array(acs, 
dtype=np.float32), 83 | "next_observation": np.array(next_obs, dtype=np.float32), 84 | "terminal": np.array(terminals, dtype=np.float32)} 85 | 86 | 87 | def convert_listofrollouts(paths): 88 | """ 89 | Take a list of rollout dictionaries 90 | and return separate arrays, 91 | where each array is a concatenation of that array from across the rollouts 92 | """ 93 | observations = np.concatenate([path["observation"] for path in paths]) 94 | actions = np.concatenate([path["action"] for path in paths]) 95 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 96 | terminals = np.concatenate([path["terminal"] for path in paths]) 97 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 98 | unconcatenated_rewards = [path["reward"] for path in paths] 99 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 100 | 101 | ############################################ 102 | ############################################ 103 | 104 | def get_pathlength(path): 105 | return len(path["reward"]) 106 | 107 | def normalize(data, mean, std, eps=1e-8): 108 | return (data-mean)/(std+eps) 109 | 110 | def unnormalize(data, mean, std): 111 | return data*std+mean 112 | 113 | def add_noise(data_inp, noiseToSignal=0.01): 114 | 115 | data = copy.deepcopy(data_inp) #(num data points, dim) 116 | 117 | #mean of data 118 | mean_data = np.mean(data, axis=0) 119 | 120 | #if mean is 0, 121 | #make it 0.001 to avoid 0 issues later for dividing by std 122 | mean_data[mean_data == 0] = 0.000001 123 | 124 | #width of normal distribution to sample noise from 125 | #larger magnitude number = could have larger magnitude noise 126 | std_of_noise = mean_data * noiseToSignal 127 | for j in range(mean_data.shape[0]): 128 | data[:, j] = np.copy(data[:, j] + np.random.normal( 129 | 0, np.absolute(std_of_noise[j]), (data.shape[0],))) 130 | 131 | return data -------------------------------------------------------------------------------- /hw2/cs285/policies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw2/cs285/policies/__init__.py -------------------------------------------------------------------------------- /hw2/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw2/cs285/scripts/read_results.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import tensorflow as tf 3 | 4 | def get_section_results(file): 5 | """ 6 | requires tensorflow==1.12.0 7 | """ 8 | X = [] 9 | Y = [] 10 | for e in tf.train.summary_iterator(file): 11 | for v in e.summary.value: 12 | if v.tag == 'Train_EnvstepsSoFar': 13 | X.append(v.simple_value) 14 | elif v.tag == 'Eval_AverageReturn': 15 | Y.append(v.simple_value) 16 | return X, Y 17 | 18 | if __name__ == '__main__': 19 | import glob 20 | 21 | 
logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' 22 | eventfile = glob.glob(logdir)[0] 23 | 24 | X, Y = get_section_results(eventfile) 25 | for i, (x, y) in enumerate(zip(X, Y)): 26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) -------------------------------------------------------------------------------- /hw2/cs285/scripts/run_hw2.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.infrastructure.rl_trainer import RL_Trainer 5 | from cs285.agents.pg_agent import PGAgent 6 | 7 | class PG_Trainer(object): 8 | 9 | def __init__(self, params): 10 | 11 | ##################### 12 | ## SET AGENT PARAMS 13 | ##################### 14 | 15 | computation_graph_args = { 16 | 'n_layers': params['n_layers'], 17 | 'size': params['size'], 18 | 'learning_rate': params['learning_rate'], 19 | } 20 | 21 | estimate_advantage_args = { 22 | 'gamma': params['discount'], 23 | 'standardize_advantages': not(params['dont_standardize_advantages']), 24 | 'reward_to_go': params['reward_to_go'], 25 | 'nn_baseline': params['nn_baseline'], 26 | } 27 | 28 | train_args = { 29 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 30 | } 31 | 32 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} 33 | 34 | self.params = params 35 | self.params['agent_class'] = PGAgent 36 | self.params['agent_params'] = agent_params 37 | self.params['batch_size_initial'] = self.params['batch_size'] 38 | 39 | ################ 40 | ## RL TRAINER 41 | ################ 42 | 43 | self.rl_trainer = RL_Trainer(self.params) 44 | 45 | def run_training_loop(self): 46 | 47 | self.rl_trainer.run_training_loop( 48 | self.params['n_iter'], 49 | collect_policy = self.rl_trainer.agent.actor, 50 | eval_policy = self.rl_trainer.agent.actor, 51 | ) 52 | 53 | 54 | def main(): 55 | 56 | import argparse 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument('--env_name', type=str) 59 | parser.add_argument('--exp_name', type=str, default='todo') 60 | parser.add_argument('--n_iter', '-n', type=int, default=200) 61 | 62 | parser.add_argument('--reward_to_go', '-rtg', action='store_true') 63 | parser.add_argument('--nn_baseline', action='store_true') 64 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true') 65 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration 66 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 67 | 68 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 69 | parser.add_argument('--discount', type=float, default=1.0) 70 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) 71 | parser.add_argument('--n_layers', '-l', type=int, default=2) 72 | parser.add_argument('--size', '-s', type=int, default=64) 73 | 74 | parser.add_argument('--ep_len', type=int) #students shouldn't change this away from env's default 75 | parser.add_argument('--seed', type=int, default=1) 76 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 77 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 78 | parser.add_argument('--video_log_freq', type=int, default=-1) 79 | parser.add_argument('--scalar_log_freq', type=int, default=1) 80 | 81 | parser.add_argument('--save_params', action='store_true') 82 | 83 | args = parser.parse_args() 84 | 85 | # convert to dictionary 86 | params 
= vars(args) 87 | 88 | ## ensure compatibility with hw1 code 89 | params['train_batch_size'] = params['batch_size'] 90 | 91 | ################################## 92 | ### CREATE DIRECTORY FOR LOGGING 93 | ################################## 94 | 95 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') 96 | 97 | if not (os.path.exists(data_path)): 98 | os.makedirs(data_path) 99 | 100 | logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 101 | logdir = os.path.join(data_path, logdir) 102 | params['logdir'] = logdir 103 | if not(os.path.exists(logdir)): 104 | os.makedirs(logdir) 105 | 106 | ################### 107 | ### RUN TRAINING 108 | ################### 109 | 110 | trainer = PG_Trainer(params) 111 | trainer.run_training_loop() 112 | 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /hw2/cs285_hw2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw2/cs285_hw2.pdf -------------------------------------------------------------------------------- /hw2/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.5.1 2 | gym==0.17.2 3 | mujoco-py==2.0.2.2 4 | tensorboard==2.3.0 5 | tensorboardX==1.8 6 | matplotlib==2.2.2 7 | ipython==6.4.0 8 | moviepy==1.0.0 9 | pyvirtualdisplay==1.3.2 10 | ipdb==0.13.3 11 | box2d-py 12 | tensorflow==1.12.0 -------------------------------------------------------------------------------- /hw2/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | torch==1.5.1+cu101 2 | gym==0.17.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | ipdb==0.13.3 10 | box2d-py 11 | tensorflow==2.3.0 -------------------------------------------------------------------------------- /hw2/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw3/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally. 6 | 7 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. 
You can try out the Colab option by clicking the badges below: 8 | 9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw3/cs285/scripts/run_hw3_dqn.ipynb) **Part I (Q-learning)** 10 | 11 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw3/cs285/scripts/run_hw3_actor_critic.ipynb) **Part II (Actor-critic)** 12 | 13 | ## Complete the code 14 | 15 | The following files have blanks to be filled with your solutions from homeworks 1 and 2. The relevant sections are marked with `TODO: get this from hw1 or hw2`. 16 | 17 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 18 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 19 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 20 | 21 | You will then need to implement new routines in the following files for homework 3 part 1 (Q-learning): 22 | - [agents/dqn_agent.py](cs285/agents/dqn_agent.py) 23 | - [critics/dqn_critic.py](cs285/critics/dqn_critic.py) 24 | - [policies/argmax_policy.py](cs285/policies/argmax_policy.py) 25 | 26 | and in the following files for part 2 (actor-critic): 27 | - [agents/ac_agent.py](cs285/agents/ac_agent.py) 28 | - [critics/bootstrapped_continuous_critic.py](cs285/critics/bootstrapped_continuous_critic.py) 29 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 30 | 31 | The relevant sections are marked with `TODO`. 32 | 33 | You may also want to look through [run_hw3_dqn.py](cs285/scripts/run_hw3_dqn.py) and [run_hw3_actor_critic.py](cs285/scripts/run_hw3_actor_critic.py) (if running locally) or [run_hw3_dqn.ipynb](cs285/scripts/run_hw3_dqn.ipynb) and [run_hw3_actor_critic.ipynb](cs285/scripts/run_hw3_actor_critic.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook. 34 | 35 | See the [assignment PDF](cs285_hw3.pdf) for more details on what files to edit.
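As a quick orientation for Part II, the quantity that `agents/ac_agent.py` asks you to estimate is the critic-based advantage A(s, a) = r(s, a) + gamma * V(s') - V(s), with the bootstrapped V(s') term cut off at terminal states. The snippet below is only an illustrative NumPy sketch of that arithmetic; the function name and arguments are placeholders, not the graded interface:

```python
import numpy as np

def bootstrapped_advantage(v_s, v_sp1, rewards, terminals, gamma, standardize=True):
    """A(s, a) = r(s, a) + gamma * V(s') * (1 - done) - V(s); illustrative only."""
    q_sa = rewards + gamma * v_sp1 * (1.0 - terminals)   # cut off V(s') at terminal states
    adv = q_sa - v_s
    if standardize:                                      # optional advantage standardization
        adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8)
    return adv
```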
36 | 37 | -------------------------------------------------------------------------------- /hw3/cs285/agents/ac_agent.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from cs285.critics.bootstrapped_continuous_critic import \ 4 | BootstrappedContinuousCritic 5 | from cs285.infrastructure.replay_buffer import ReplayBuffer 6 | from cs285.infrastructure.utils import * 7 | from cs285.policies.MLP_policy import MLPPolicyAC 8 | from .base_agent import BaseAgent 9 | 10 | 11 | class ACAgent(BaseAgent): 12 | def __init__(self, env, agent_params): 13 | super(ACAgent, self).__init__() 14 | 15 | self.env = env 16 | self.agent_params = agent_params 17 | 18 | self.gamma = self.agent_params['gamma'] 19 | self.standardize_advantages = self.agent_params['standardize_advantages'] 20 | 21 | self.actor = MLPPolicyAC( 22 | self.agent_params['ac_dim'], 23 | self.agent_params['ob_dim'], 24 | self.agent_params['n_layers'], 25 | self.agent_params['size'], 26 | self.agent_params['discrete'], 27 | self.agent_params['learning_rate'], 28 | ) 29 | self.critic = BootstrappedContinuousCritic(self.agent_params) 30 | 31 | self.replay_buffer = ReplayBuffer() 32 | 33 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 34 | # TODO Implement the following pseudocode: 35 | # for agent_params['num_critic_updates_per_agent_update'] steps, 36 | # update the critic 37 | 38 | # advantage = estimate_advantage(...) 39 | 40 | # for agent_params['num_actor_updates_per_agent_update'] steps, 41 | # update the actor 42 | 43 | loss = OrderedDict() 44 | loss['Critic_Loss'] = TODO 45 | loss['Actor_Loss'] = TODO 46 | 47 | return loss 48 | 49 | def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): 50 | # TODO Implement the following pseudocode: 51 | # 1) query the critic with ob_no, to get V(s) 52 | # 2) query the critic with next_ob_no, to get V(s') 53 | # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') 54 | # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) 55 | # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) 56 | adv_n = TODO 57 | 58 | if self.standardize_advantages: 59 | adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) 60 | return adv_n 61 | 62 | def add_to_replay_buffer(self, paths): 63 | self.replay_buffer.add_rollouts(paths) 64 | 65 | def sample(self, batch_size): 66 | return self.replay_buffer.sample_recent_data(batch_size) 67 | -------------------------------------------------------------------------------- /hw3/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | class BaseAgent(object): 2 | def __init__(self, **kwargs): 3 | super(BaseAgent, self).__init__(**kwargs) 4 | 5 | def train(self) -> dict: 6 | """Return a dictionary of logging information.""" 7 | raise NotImplementedError 8 | 9 | def add_to_replay_buffer(self, paths): 10 | raise NotImplementedError 11 | 12 | def sample(self, batch_size): 13 | raise NotImplementedError 14 | 15 | def save(self, path): 16 | raise NotImplementedError -------------------------------------------------------------------------------- /hw3/cs285/agents/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule 4 | from cs285.policies.argmax_policy import ArgMaxPolicy 5 | from cs285.critics.dqn_critic 
import DQNCritic 6 | 7 | 8 | class DQNAgent(object): 9 | def __init__(self, env, agent_params): 10 | 11 | self.env = env 12 | self.agent_params = agent_params 13 | self.batch_size = agent_params['batch_size'] 14 | # import ipdb; ipdb.set_trace() 15 | self.last_obs = self.env.reset() 16 | 17 | self.num_actions = agent_params['ac_dim'] 18 | self.learning_starts = agent_params['learning_starts'] 19 | self.learning_freq = agent_params['learning_freq'] 20 | self.target_update_freq = agent_params['target_update_freq'] 21 | 22 | self.replay_buffer_idx = None 23 | self.exploration = agent_params['exploration_schedule'] 24 | self.optimizer_spec = agent_params['optimizer_spec'] 25 | 26 | self.critic = DQNCritic(agent_params, self.optimizer_spec) 27 | self.actor = ArgMaxPolicy(self.critic) 28 | 29 | lander = agent_params['env_name'].startswith('LunarLander') 30 | self.replay_buffer = MemoryOptimizedReplayBuffer( 31 | agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander) 32 | self.t = 0 33 | self.num_param_updates = 0 34 | 35 | def add_to_replay_buffer(self, paths): 36 | pass 37 | 38 | def step_env(self): 39 | """ 40 | Step the env and store the transition 41 | At the end of this block of code, the simulator should have been 42 | advanced one step, and the replay buffer should contain one more transition. 43 | Note that self.last_obs must always point to the new latest observation. 44 | """ 45 | 46 | # TODO store the latest observation ("frame") into the replay buffer 47 | # HINT: the replay buffer used here is `MemoryOptimizedReplayBuffer` 48 | # in dqn_utils.py 49 | self.replay_buffer_idx = TODO 50 | 51 | eps = self.exploration.value(self.t) 52 | 53 | # TODO use epsilon greedy exploration when selecting action 54 | perform_random_action = TODO 55 | if perform_random_action: 56 | # HINT: take random action 57 | # with probability eps (see np.random.random()) 58 | # OR if your current step number (see self.t) is less that self.learning_starts 59 | action = TODO 60 | else: 61 | # HINT: Your actor will take in multiple previous observations ("frames") in order 62 | # to deal with the partial observability of the environment. Get the most recent 63 | # `frame_history_len` observations using functionality from the replay buffer, 64 | # and then use those observations as input to your actor. 
65 | action = TODO 66 | 67 | # TODO take a step in the environment using the action from the policy 68 | # HINT1: remember that self.last_obs must always point to the newest/latest observation 69 | # HINT2: remember the following useful function that you've seen before: 70 | #obs, reward, done, info = env.step(action) 71 | TODO 72 | 73 | # TODO store the result of taking this action into the replay buffer 74 | # HINT1: see your replay buffer's `store_effect` function 75 | # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above 76 | TODO 77 | 78 | # TODO if taking this step resulted in done, reset the env (and the latest observation) 79 | TODO 80 | 81 | def sample(self, batch_size): 82 | if self.replay_buffer.can_sample(self.batch_size): 83 | return self.replay_buffer.sample(batch_size) 84 | else: 85 | return [],[],[],[],[] 86 | 87 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 88 | log = {} 89 | if (self.t > self.learning_starts 90 | and self.t % self.learning_freq == 0 91 | and self.replay_buffer.can_sample(self.batch_size) 92 | ): 93 | 94 | # TODO fill in the call to the update function using the appropriate tensors 95 | log = self.critic.update( 96 | TODO 97 | ) 98 | 99 | # TODO update the target network periodically 100 | # HINT: your critic already has this functionality implemented 101 | if self.num_param_updates % self.target_update_freq == 0: 102 | TODO 103 | 104 | self.num_param_updates += 1 105 | 106 | self.t += 1 107 | return log 108 | -------------------------------------------------------------------------------- /hw3/cs285/critics/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hw3/cs285/critics/base_critic.py: -------------------------------------------------------------------------------- 1 | class BaseCritic(object): 2 | def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n): 3 | raise NotImplementedError 4 | -------------------------------------------------------------------------------- /hw3/cs285/critics/bootstrapped_continuous_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | from torch import nn 3 | from torch import optim 4 | 5 | from cs285.infrastructure import pytorch_util as ptu 6 | 7 | 8 | class BootstrappedContinuousCritic(nn.Module, BaseCritic): 9 | """ 10 | Notes on notation: 11 | 12 | Prefixes and suffixes: 13 | ob - observation 14 | ac - action 15 | _no - this tensor should have shape (batch self.size /n/, observation dim) 16 | _na - this tensor should have shape (batch self.size /n/, action dim) 17 | _n - this tensor should have shape (batch self.size /n/) 18 | 19 | Note: batch self.size /n/ is defined at runtime. 
20 | is None 21 | """ 22 | def __init__(self, hparams): 23 | super().__init__() 24 | self.ob_dim = hparams['ob_dim'] 25 | self.ac_dim = hparams['ac_dim'] 26 | self.discrete = hparams['discrete'] 27 | self.size = hparams['size'] 28 | self.n_layers = hparams['n_layers'] 29 | self.learning_rate = hparams['learning_rate'] 30 | 31 | # critic parameters 32 | self.num_target_updates = hparams['num_target_updates'] 33 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] 34 | self.gamma = hparams['gamma'] 35 | self.critic_network = ptu.build_mlp( 36 | self.ob_dim, 37 | 1, 38 | n_layers=self.n_layers, 39 | size=self.size, 40 | ) 41 | self.critic_network.to(ptu.device) 42 | self.loss = nn.MSELoss() 43 | self.optimizer = optim.Adam( 44 | self.critic_network.parameters(), 45 | self.learning_rate, 46 | ) 47 | 48 | def forward(self, obs): 49 | return self.critic_network(obs).squeeze(1) 50 | 51 | def forward_np(self, obs): 52 | obs = ptu.from_numpy(obs) 53 | predictions = self(obs) 54 | return ptu.to_numpy(predictions) 55 | 56 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 57 | """ 58 | Update the parameters of the critic. 59 | 60 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 61 | Agent.sample_trajectories 62 | let num_paths be the number of paths sampled from Agent.sample_trajectories 63 | 64 | arguments: 65 | ob_no: shape: (sum_of_path_lengths, ob_dim) 66 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 67 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 68 | the reward for each timestep 69 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 70 | at that timestep of 0 if the episode did not end 71 | 72 | returns: 73 | training loss 74 | """ 75 | # TODO: Implement the pseudocode below: do the following ( 76 | # self.num_grad_steps_per_target_update * self.num_target_updates) 77 | # times: 78 | # every self.num_grad_steps_per_target_update steps (which includes the 79 | # first step), recompute the target values by 80 | # a) calculating V(s') by querying the critic with next_ob_no 81 | # b) and computing the target values as r(s, a) + gamma * V(s') 82 | # every time, update this critic using the observations and targets 83 | # 84 | # HINT: don't forget to use terminal_n to cut off the V(s') (ie set it 85 | # to 0) when a terminal state is reached 86 | # HINT: make sure to squeeze the output of the critic_network to ensure 87 | # that its dimensions match the reward 88 | 89 | return loss.item() 90 | -------------------------------------------------------------------------------- /hw3/cs285/critics/dqn_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | import torch 3 | import torch.optim as optim 4 | from torch.nn import utils 5 | from torch import nn 6 | 7 | from cs285.infrastructure import pytorch_util as ptu 8 | 9 | 10 | class DQNCritic(BaseCritic): 11 | 12 | def __init__(self, hparams, optimizer_spec, **kwargs): 13 | super().__init__(**kwargs) 14 | self.env_name = hparams['env_name'] 15 | self.ob_dim = hparams['ob_dim'] 16 | 17 | if isinstance(self.ob_dim, int): 18 | self.input_shape = (self.ob_dim,) 19 | else: 20 | self.input_shape = hparams['input_shape'] 21 | 22 | self.ac_dim = hparams['ac_dim'] 23 | self.double_q = hparams['double_q'] 24 | self.grad_norm_clipping = hparams['grad_norm_clipping'] 
25 | self.gamma = hparams['gamma'] 26 | 27 | self.optimizer_spec = optimizer_spec 28 | network_initializer = hparams['q_func'] 29 | self.q_net = network_initializer(self.ob_dim, self.ac_dim) 30 | self.q_net_target = network_initializer(self.ob_dim, self.ac_dim) 31 | self.optimizer = self.optimizer_spec.constructor( 32 | self.q_net.parameters(), 33 | **self.optimizer_spec.optim_kwargs 34 | ) 35 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( 36 | self.optimizer, 37 | self.optimizer_spec.learning_rate_schedule, 38 | ) 39 | self.loss = nn.SmoothL1Loss() # AKA Huber loss 40 | self.q_net.to(ptu.device) 41 | self.q_net_target.to(ptu.device) 42 | 43 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 44 | """ 45 | Update the parameters of the critic. 46 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 47 | Agent.sample_trajectories 48 | let num_paths be the number of paths sampled from Agent.sample_trajectories 49 | arguments: 50 | ob_no: shape: (sum_of_path_lengths, ob_dim) 51 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 52 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 53 | the reward for each timestep 54 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 55 | at that timestep of 0 if the episode did not end 56 | returns: 57 | nothing 58 | """ 59 | ob_no = ptu.from_numpy(ob_no) 60 | ac_na = ptu.from_numpy(ac_na).to(torch.long) 61 | next_ob_no = ptu.from_numpy(next_ob_no) 62 | reward_n = ptu.from_numpy(reward_n) 63 | terminal_n = ptu.from_numpy(terminal_n) 64 | 65 | qa_t_values = self.q_net(ob_no) 66 | q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1) 67 | 68 | # TODO compute the Q-values from the target network 69 | qa_tp1_values = TODO 70 | 71 | if self.double_q: 72 | # You must fill this part for Q2 of the Q-learning portion of the homework. 73 | # In double Q-learning, the best action is selected using the Q-network that 74 | # is being updated, but the Q-value for this action is obtained from the 75 | # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details. 
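            # One possible shape for this branch, sketched as a comment rather than as the
            # reference solution (it assumes qa_tp1_values above holds the target network's
            # Q-values for next_ob_no):
            #   best_ac_tp1 = self.q_net(next_ob_no).argmax(dim=1)
            #   q_tp1 = torch.gather(qa_tp1_values, 1, best_ac_tp1.unsqueeze(1)).squeeze(1)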
76 | TODO 77 | else: 78 | q_tp1, _ = qa_tp1_values.max(dim=1) 79 | 80 | # TODO compute targets for minimizing Bellman error 81 | # HINT: as you saw in lecture, this would be: 82 | #currentReward + self.gamma * qValuesOfNextTimestep * (not terminal) 83 | target = TODO 84 | target = target.detach() 85 | 86 | assert q_t_values.shape == target.shape 87 | loss = self.loss(q_t_values, target) 88 | 89 | self.optimizer.zero_grad() 90 | loss.backward() 91 | utils.clip_grad_value_(self.q_net.parameters(), self.grad_norm_clipping) 92 | self.optimizer.step() 93 | 94 | return { 95 | 'Training Loss': ptu.to_numpy(loss), 96 | } 97 | 98 | def update_target_network(self): 99 | for target_param, param in zip( 100 | self.q_net_target.parameters(), self.q_net.parameters() 101 | ): 102 | target_param.data.copy_(param.data) 103 | 104 | def qa_values(self, obs): 105 | obs = ptu.from_numpy(obs) 106 | qa_values = self.q_net(obs) 107 | return ptu.to_numpy(qa_values) 108 | -------------------------------------------------------------------------------- /hw3/cs285/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw3/cs285/envs/__init__.py -------------------------------------------------------------------------------- /hw3/cs285/envs/box2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw3/cs285/envs/box2d/__init__.py -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 | ipythondisplay.display(HTML(data='''<video alt="test" autoplay 17 | loop controls style="height: 400px;"> 18 | <source src="data:video/mp4;base64,{0}" type="video/mp4" /> 19 | </video>'''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env 27 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21
| def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0]<max_length: 44 | padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1)) 45 | videos[i] = np.concatenate([videos[i], padding], 0) 46 | 47 | # log videos to tensorboard event file 48 | videos = np.stack(videos[:max_videos_to_save], 0) 49 | self.log_video(videos, video_title, step, fps=fps) 50 | 51 | def log_figures(self, figure, name, step, phase): 52 | """figure: matplotlib.pyplot figure handle""" 53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!" 54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | ): 28 | """ 29 | Builds a feedforward neural network 30 | arguments: 31 | input_placeholder: placeholder variable for the state (batch_size, input_size) 32 | scope: variable scope of the network 33 | n_layers: number of hidden layers 34 | size: dimension of each hidden layer 35 | activation: activation of each hidden layer 36 | input_size: size of the input layer 37 | output_size: size of the output layer 38 | output_activation: activation of the output layer 39 | returns: 40 | output_placeholder: the result of a forward pass through the hidden layers + the output layer 41 | """ 42 | if isinstance(activation, str): 43 | activation = _str_to_activation[activation] 44 | if isinstance(output_activation, str): 45 | output_activation = _str_to_activation[output_activation] 46 | layers = [] 47 | in_size = input_size 48 | for _ in range(n_layers): 49 | layers.append(nn.Linear(in_size, size)) 50 | layers.append(activation) 51 | in_size = size 52 | layers.append(nn.Linear(in_size,
output_size)) 53 | layers.append(output_activation) 54 | return nn.Sequential(*layers) 55 | 56 | 57 | device = None 58 | 59 | 60 | def init_gpu(use_gpu=True, gpu_id=0): 61 | global device 62 | if torch.cuda.is_available() and use_gpu: 63 | device = torch.device("cuda:" + str(gpu_id)) 64 | print("Using GPU id {}".format(gpu_id)) 65 | else: 66 | device = torch.device("cpu") 67 | print("GPU not detected. Defaulting to CPU.") 68 | 69 | 70 | def set_device(gpu_id): 71 | torch.cuda.set_device(gpu_id) 72 | 73 | 74 | def from_numpy(*args, **kwargs): 75 | return torch.from_numpy(*args, **kwargs).float().to(device) 76 | 77 | 78 | def to_numpy(tensor): 79 | return tensor.to('cpu').detach().numpy() 80 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | self.paths = [] 10 | self.obs = None 11 | self.acs = None 12 | self.concatenated_rews = None 13 | self.next_obs = None 14 | self.terminals = None 15 | 16 | def add_rollouts(self, paths, noised=False): 17 | 18 | # add new rollouts into our list of rollouts 19 | for path in paths: 20 | self.paths.append(path) 21 | 22 | # convert new rollouts into their component arrays, and append them onto our arrays 23 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 24 | 25 | if noised: 26 | observations = add_noise(observations) 27 | next_observations = add_noise(next_observations) 28 | 29 | if self.obs is None: 30 | self.obs = observations[-self.max_size:] 31 | self.acs = actions[-self.max_size:] 32 | self.next_obs = next_observations[-self.max_size:] 33 | self.terminals = terminals[-self.max_size:] 34 | self.concatenated_rews = concatenated_rews[-self.max_size:] 35 | else: 36 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 37 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 38 | self.next_obs = np.concatenate( 39 | [self.next_obs, next_observations] 40 | )[-self.max_size:] 41 | self.terminals = np.concatenate( 42 | [self.terminals, terminals] 43 | )[-self.max_size:] 44 | self.concatenated_rews = np.concatenate( 45 | [self.concatenated_rews, concatenated_rews] 46 | )[-self.max_size:] 47 | 48 | ######################################## 49 | ######################################## 50 | 51 | def sample_random_rollouts(self, num_rollouts): 52 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 53 | return self.paths[rand_indices] 54 | 55 | def sample_recent_rollouts(self, num_rollouts=1): 56 | return self.paths[-num_rollouts:] 57 | 58 | ######################################## 59 | ######################################## 60 | 61 | def sample_random_data(self, batch_size): 62 | 63 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 64 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 65 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 66 | 67 | def sample_recent_data(self, batch_size=1, concat_rew=True): 68 | 69 | if concat_rew: 70 | return self.obs[-batch_size:], self.acs[-batch_size:], 
self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 71 | else: 72 | num_recent_rollouts_to_return = 0 73 | num_datapoints_so_far = 0 74 | index = -1 75 | while num_datapoints_so_far < batch_size: 76 | recent_rollout = self.paths[index] 77 | index -=1 78 | num_recent_rollouts_to_return +=1 79 | num_datapoints_so_far += get_pathlength(recent_rollout) 80 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 81 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 82 | return observations, actions, unconcatenated_rews, next_observations, terminals 83 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import copy 4 | 5 | ############################################ 6 | ############################################ 7 | 8 | def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): 9 | 10 | model = models[0] 11 | 12 | # true 13 | true_states = perform_actions(env, action_sequence)['observation'] 14 | 15 | # predicted 16 | ob = np.expand_dims(true_states[0],0) 17 | pred_states = [] 18 | for ac in action_sequence: 19 | pred_states.append(ob) 20 | action = np.expand_dims(ac,0) 21 | ob = model.get_prediction(ob, action, data_statistics) 22 | pred_states = np.squeeze(pred_states) 23 | 24 | # mpe 25 | mpe = mean_squared_error(pred_states, true_states) 26 | 27 | return mpe, true_states, pred_states 28 | 29 | def perform_actions(env, actions): 30 | ob = env.reset() 31 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 32 | steps = 0 33 | for ac in actions: 34 | obs.append(ob) 35 | acs.append(ac) 36 | ob, rew, done, _ = env.step(ac) 37 | # add the observation after taking a step to next_obs 38 | next_obs.append(ob) 39 | rewards.append(rew) 40 | steps += 1 41 | # If the episode ended, the corresponding terminal value is 1 42 | # otherwise, it is 0 43 | if done: 44 | terminals.append(1) 45 | break 46 | else: 47 | terminals.append(0) 48 | 49 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 50 | 51 | def mean_squared_error(a, b): 52 | return np.mean((a-b)**2) 53 | 54 | ############################################ 55 | ############################################ 56 | 57 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 58 | # TODO: get this from Piazza 59 | 60 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 61 | """ 62 | Collect rollouts using policy 63 | until we have collected min_timesteps_per_batch steps 64 | """ 65 | # TODO: get this from Piazza 66 | 67 | return paths, timesteps_this_batch 68 | 69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 70 | """ 71 | Collect ntraj rollouts using policy 72 | """ 73 | # TODO: get this from Piazza 74 | 75 | return paths 76 | 77 | ############################################ 78 | ############################################ 79 | 80 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 81 | """ 82 | Take info (separate arrays) from a single rollout 83 | and return it in a single dictionary 84 | """ 85 | if image_obs != []: 86 | image_obs = np.stack(image_obs, axis=0) 87 | return 
{"observation" : np.array(obs, dtype=np.float32), 88 | "image_obs" : np.array(image_obs, dtype=np.uint8), 89 | "reward" : np.array(rewards, dtype=np.float32), 90 | "action" : np.array(acs, dtype=np.float32), 91 | "next_observation": np.array(next_obs, dtype=np.float32), 92 | "terminal": np.array(terminals, dtype=np.float32)} 93 | 94 | 95 | def convert_listofrollouts(paths): 96 | """ 97 | Take a list of rollout dictionaries 98 | and return separate arrays, 99 | where each array is a concatenation of that array from across the rollouts 100 | """ 101 | observations = np.concatenate([path["observation"] for path in paths]) 102 | actions = np.concatenate([path["action"] for path in paths]) 103 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 104 | terminals = np.concatenate([path["terminal"] for path in paths]) 105 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 106 | unconcatenated_rewards = [path["reward"] for path in paths] 107 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 108 | 109 | ############################################ 110 | ############################################ 111 | 112 | def get_pathlength(path): 113 | return len(path["reward"]) 114 | 115 | def normalize(data, mean, std, eps=1e-8): 116 | return (data-mean)/(std+eps) 117 | 118 | def unnormalize(data, mean, std): 119 | return data*std+mean 120 | 121 | def add_noise(data_inp, noiseToSignal=0.01): 122 | 123 | data = copy.deepcopy(data_inp) #(num data points, dim) 124 | 125 | #mean of data 126 | mean_data = np.mean(data, axis=0) 127 | 128 | #if mean is 0, 129 | #make it 0.001 to avoid 0 issues later for dividing by std 130 | mean_data[mean_data == 0] = 0.000001 131 | 132 | #width of normal distribution to sample noise from 133 | #larger magnitude number = could have larger magnitude noise 134 | std_of_noise = mean_data * noiseToSignal 135 | for j in range(mean_data.shape[0]): 136 | data[:, j] = np.copy(data[:, j] + np.random.normal( 137 | 0, np.absolute(std_of_noise[j]), (data.shape[0],))) 138 | 139 | return data 140 | -------------------------------------------------------------------------------- /hw3/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import itertools 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from torch import optim 6 | 7 | import numpy as np 8 | import torch 9 | from torch import distributions 10 | 11 | from cs285.infrastructure import pytorch_util as ptu 12 | from cs285.policies.base_policy import BasePolicy 13 | 14 | 15 | class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): 16 | 17 | def __init__(self, 18 | ac_dim, 19 | ob_dim, 20 | n_layers, 21 | size, 22 | discrete=False, 23 | learning_rate=1e-4, 24 | training=True, 25 | nn_baseline=False, 26 | **kwargs 27 | ): 28 | super().__init__(**kwargs) 29 | 30 | # init vars 31 | self.ac_dim = ac_dim 32 | self.ob_dim = ob_dim 33 | self.n_layers = n_layers 34 | self.discrete = discrete 35 | self.size = size 36 | self.learning_rate = learning_rate 37 | self.training = training 38 | self.nn_baseline = nn_baseline 39 | 40 | if self.discrete: 41 | self.logits_na = ptu.build_mlp(input_size=self.ob_dim, 42 | output_size=self.ac_dim, 43 | n_layers=self.n_layers, 44 | size=self.size) 45 | self.logits_na.to(ptu.device) 46 | self.mean_net = None 47 | self.logstd = None 48 | self.optimizer = optim.Adam(self.logits_na.parameters(), 49 | 
self.learning_rate) 50 | else: 51 | self.logits_na = None 52 | self.mean_net = ptu.build_mlp(input_size=self.ob_dim, 53 | output_size=self.ac_dim, 54 | n_layers=self.n_layers, size=self.size) 55 | self.logstd = nn.Parameter( 56 | torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) 57 | ) 58 | self.mean_net.to(ptu.device) 59 | self.logstd.to(ptu.device) 60 | self.optimizer = optim.Adam( 61 | itertools.chain([self.logstd], self.mean_net.parameters()), 62 | self.learning_rate 63 | ) 64 | 65 | if nn_baseline: 66 | self.baseline = ptu.build_mlp( 67 | input_size=self.ob_dim, 68 | output_size=1, 69 | n_layers=self.n_layers, 70 | size=self.size, 71 | ) 72 | self.baseline.to(ptu.device) 73 | self.baseline_optimizer = optim.Adam( 74 | self.baseline.parameters(), 75 | self.learning_rate, 76 | ) 77 | else: 78 | self.baseline = None 79 | 80 | ################################## 81 | 82 | def save(self, filepath): 83 | torch.save(self.state_dict(), filepath) 84 | 85 | ################################## 86 | 87 | # query the policy with observation(s) to get selected action(s) 88 | def get_action(self, obs: np.ndarray) -> np.ndarray: 89 | # TODO: get this from Piazza 90 | return action 91 | 92 | # update/train this policy 93 | def update(self, observations, actions, **kwargs): 94 | raise NotImplementedError 95 | 96 | # This function defines the forward pass of the network. 97 | # You can return anything you want, but you should be able to differentiate 98 | # through it. For example, you can return a torch.FloatTensor. You can also 99 | # return more flexible objects, such as a 100 | # `torch.distributions.Distribution` object. It's up to you! 101 | def forward(self, observation: torch.FloatTensor): 102 | # TODO: get this from Piazza 103 | return action_distribution 104 | 105 | 106 | ##################################################### 107 | ##################################################### 108 | 109 | 110 | class MLPPolicyAC(MLPPolicy): 111 | def update(self, observations, actions, adv_n=None): 112 | # TODO: update the policy and return the loss 113 | loss = TODO 114 | return loss.item() 115 | -------------------------------------------------------------------------------- /hw3/cs285/policies/argmax_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ArgMaxPolicy(object): 5 | 6 | def __init__(self, critic): 7 | self.critic = critic 8 | 9 | def get_action(self, obs): 10 | if len(obs.shape) > 3: 11 | observation = obs 12 | else: 13 | observation = obs[None] 14 | 15 | ## TODO return the action that maxinmizes the Q-value 16 | # at the current observation as the output 17 | actions = TODO 18 | 19 | return action.squeeze() -------------------------------------------------------------------------------- /hw3/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw3/cs285/scripts/read_results.py: 
-------------------------------------------------------------------------------- 1 | import glob 2 | import tensorflow as tf 3 | 4 | def get_section_results(file): 5 | """ 6 | requires tensorflow==1.12.0 7 | """ 8 | X = [] 9 | Y = [] 10 | for e in tf.train.summary_iterator(file): 11 | for v in e.summary.value: 12 | if v.tag == 'Train_EnvstepsSoFar': 13 | X.append(v.simple_value) 14 | elif v.tag == 'Eval_AverageReturn': 15 | Y.append(v.simple_value) 16 | return X, Y 17 | 18 | if __name__ == '__main__': 19 | import glob 20 | 21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' 22 | eventfile = glob.glob(logdir)[0] 23 | 24 | X, Y = get_section_results(eventfile) 25 | for i, (x, y) in enumerate(zip(X, Y)): 26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) -------------------------------------------------------------------------------- /hw3/cs285/scripts/run_hw3_actor_critic.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.agents.ac_agent import ACAgent 5 | from cs285.infrastructure.rl_trainer import RL_Trainer 6 | 7 | 8 | class AC_Trainer(object): 9 | 10 | def __init__(self, params): 11 | 12 | ##################### 13 | ## SET AGENT PARAMS 14 | ##################### 15 | 16 | computation_graph_args = { 17 | 'n_layers': params['n_layers'], 18 | 'size': params['size'], 19 | 'learning_rate': params['learning_rate'], 20 | 'num_target_updates': params['num_target_updates'], 21 | 'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'], 22 | } 23 | 24 | estimate_advantage_args = { 25 | 'gamma': params['discount'], 26 | 'standardize_advantages': not(params['dont_standardize_advantages']), 27 | } 28 | 29 | train_args = { 30 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 31 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 32 | 'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'], 33 | } 34 | 35 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} 36 | 37 | self.params = params 38 | self.params['agent_class'] = ACAgent 39 | self.params['agent_params'] = agent_params 40 | self.params['batch_size_initial'] = self.params['batch_size'] 41 | 42 | ################ 43 | ## RL TRAINER 44 | ################ 45 | 46 | self.rl_trainer = RL_Trainer(self.params) 47 | 48 | def run_training_loop(self): 49 | 50 | self.rl_trainer.run_training_loop( 51 | self.params['n_iter'], 52 | collect_policy = self.rl_trainer.agent.actor, 53 | eval_policy = self.rl_trainer.agent.actor, 54 | ) 55 | 56 | 57 | def main(): 58 | 59 | import argparse 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--env_name', type=str, default='CartPole-v0') 62 | parser.add_argument('--ep_len', type=int, default=200) 63 | parser.add_argument('--exp_name', type=str, default='todo') 64 | parser.add_argument('--n_iter', '-n', type=int, default=200) 65 | 66 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 67 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) 68 | parser.add_argument('--num_actor_updates_per_agent_update', type=int, default=1) 69 | 70 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration 71 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 72 | 
parser.add_argument('--train_batch_size', '-tb', type=int, default=1000) ##steps used per gradient step 73 | 74 | parser.add_argument('--discount', type=float, default=1.0) 75 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) 76 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true') 77 | parser.add_argument('--num_target_updates', '-ntu', type=int, default=10) 78 | parser.add_argument('--num_grad_steps_per_target_update', '-ngsptu', type=int, default=10) 79 | parser.add_argument('--n_layers', '-l', type=int, default=2) 80 | parser.add_argument('--size', '-s', type=int, default=64) 81 | 82 | parser.add_argument('--seed', type=int, default=1) 83 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 84 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 85 | parser.add_argument('--video_log_freq', type=int, default=-1) 86 | parser.add_argument('--scalar_log_freq', type=int, default=10) 87 | 88 | parser.add_argument('--save_params', action='store_true') 89 | 90 | args = parser.parse_args() 91 | 92 | # convert to dictionary 93 | params = vars(args) 94 | 95 | # for policy gradient, we made a design decision 96 | # to force batch_size = train_batch_size 97 | # note that, to avoid confusion, you don't even have a train_batch_size argument anymore (above) 98 | params['train_batch_size'] = params['batch_size'] 99 | 100 | ################################## 101 | ### CREATE DIRECTORY FOR LOGGING 102 | ################################## 103 | 104 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 105 | 106 | if not (os.path.exists(data_path)): 107 | os.makedirs(data_path) 108 | 109 | logdir = 'hw3_ ' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 110 | logdir = os.path.join(data_path, logdir) 111 | params['logdir'] = logdir 112 | if not(os.path.exists(logdir)): 113 | os.makedirs(logdir) 114 | 115 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 116 | 117 | ################### 118 | ### RUN TRAINING 119 | ################### 120 | 121 | trainer = AC_Trainer(params) 122 | trainer.run_training_loop() 123 | 124 | 125 | if __name__ == "__main__": 126 | main() 127 | -------------------------------------------------------------------------------- /hw3/cs285/scripts/run_hw3_dqn.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.infrastructure.rl_trainer import RL_Trainer 5 | from cs285.agents.dqn_agent import DQNAgent 6 | from cs285.infrastructure.dqn_utils import get_env_kwargs 7 | 8 | 9 | class Q_Trainer(object): 10 | 11 | def __init__(self, params): 12 | self.params = params 13 | 14 | train_args = { 15 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 16 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 17 | 'train_batch_size': params['batch_size'], 18 | 'double_q': params['double_q'], 19 | } 20 | 21 | env_args = get_env_kwargs(params['env_name']) 22 | 23 | self.agent_params = {**train_args, **env_args, **params} 24 | 25 | self.params['agent_class'] = DQNAgent 26 | self.params['agent_params'] = self.agent_params 27 | self.params['train_batch_size'] = params['batch_size'] 28 | self.params['env_wrappers'] = self.agent_params['env_wrappers'] 29 | 30 | self.rl_trainer = RL_Trainer(self.params) 31 | 32 | def run_training_loop(self): 33 | self.rl_trainer.run_training_loop( 34 | self.agent_params['num_timesteps'], 35 | collect_policy = 
self.rl_trainer.agent.actor, 36 | eval_policy = self.rl_trainer.agent.actor, 37 | ) 38 | 39 | def main(): 40 | 41 | import argparse 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | '--env_name', 45 | default='MsPacman-v0', 46 | choices=('PongNoFrameskip-v4', 'LunarLander-v3', 'MsPacman-v0') 47 | ) 48 | 49 | parser.add_argument('--ep_len', type=int, default=200) 50 | parser.add_argument('--exp_name', type=str, default='todo') 51 | 52 | parser.add_argument('--eval_batch_size', type=int, default=1000) 53 | 54 | parser.add_argument('--batch_size', type=int, default=32) 55 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 56 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) 57 | parser.add_argument('--double_q', action='store_true') 58 | 59 | parser.add_argument('--seed', type=int, default=1) 60 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 61 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 62 | parser.add_argument('--scalar_log_freq', type=int, default=int(1e4)) 63 | parser.add_argument('--video_log_freq', type=int, default=-1) 64 | 65 | parser.add_argument('--save_params', action='store_true') 66 | 67 | args = parser.parse_args() 68 | 69 | # convert to dictionary 70 | params = vars(args) 71 | params['video_log_freq'] = -1 # This param is not used for DQN 72 | ################################## 73 | ### CREATE DIRECTORY FOR LOGGING 74 | ################################## 75 | 76 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 77 | 78 | if not (os.path.exists(data_path)): 79 | os.makedirs(data_path) 80 | 81 | logdir = 'hw3_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 82 | logdir = os.path.join(data_path, logdir) 83 | params['logdir'] = logdir 84 | if not(os.path.exists(logdir)): 85 | os.makedirs(logdir) 86 | 87 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 88 | 89 | trainer = Q_Trainer(params) 90 | trainer.run_training_loop() 91 | 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /hw3/cs285_hw3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw3/cs285_hw3.pdf -------------------------------------------------------------------------------- /hw3/requirements.txt: -------------------------------------------------------------------------------- 1 | gym[atari]==0.17.2 2 | mujoco-py==2.0.2.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | torch==1.5.1 10 | opencv-python==4.4.0.42 11 | ipdb==0.13.3 12 | box2d-py 13 | -------------------------------------------------------------------------------- /hw3/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | gym[atari]==0.17.2 2 | tensorboard==2.3.0 3 | tensorboardX==1.8 4 | matplotlib==2.2.2 5 | ipython==6.4.0 6 | moviepy==1.0.0 7 | pyvirtualdisplay==1.3.2 8 | torch==1.5.1 9 | opencv-python==4.4.0.42 10 | ipdb==0.13.3 11 | box2d-py 12 | -------------------------------------------------------------------------------- /hw3/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | 
version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw4/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally. 6 | 7 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badge below: 8 | 9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw4/cs285/scripts/run_hw4_mb.ipynb) 10 | 11 | ## Complete the code 12 | 13 | The following files have blanks to be filled with your solutions from homework 1. The relevant sections are marked with `TODO: get this from Piazza`. 14 | 15 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 16 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 17 | 18 | You will then need to implement code in the following files: 19 | - [agents/mb_agent.py](cs285/agents/mb_agent.py) 20 | - [models/ff_model.py](cs285/models/ff_model.py) 21 | - [policies/MPC_policy.py](cs285/policies/MPC_policy.py) 22 | 23 | The relevant sections are marked with `TODO`. 24 | 25 | You may also want to look through [scripts/run_hw4_mb.py](cs285/scripts/run_hw4_mb.py) (if running locally) or [scripts/run_hw4_mb.ipynb](cs285/scripts/run_hw4_mb.ipynb) (if running on Colab), though you will not need to edit these files beyond changing runtime arguments in the Colab notebook. 26 | 27 | See the [assignment PDF](cs285_hw4.pdf) for more details on what files to edit.
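For orientation before diving into the TODOs: `mb_agent.py` fits an ensemble of dynamics models to observed transitions, and `MPC_policy.py` plans by sampling random action sequences, rolling them forward under each model, and executing the first action of the best-scoring sequence. The self-contained sketch below illustrates that random-shooting MPC idea on a toy one-dimensional system. It is only an editor's illustration, not the assignment's solution: `toy_dynamics`, `toy_reward`, `random_shooting_mpc`, the ensemble "biases", and all hyperparameter values here are invented for the example (the homework instead uses `FFModel.get_prediction` and `env.get_reward`).

```python
import numpy as np

# Toy stand-ins for a learned dynamics ensemble and the environment reward.
def toy_dynamics(obs, acs, bias):
    return obs + acs + bias                  # predicted next observation

def toy_reward(obs, acs):
    return -(obs ** 2) - 0.1 * (acs ** 2)    # prefer driving the state to zero cheaply

def random_shooting_mpc(ob, ensemble_biases, num_sequences=1000, horizon=10,
                        low=-1.0, high=1.0, seed=0):
    rng = np.random.default_rng(seed)
    # 1) Uniformly sample candidate action sequences, shape (N, H).
    actions = rng.uniform(low, high, size=(num_sequences, horizon))
    # 2) Score every sequence under every ensemble member, then average.
    returns = np.zeros(num_sequences)
    for bias in ensemble_biases:
        obs = np.full(num_sequences, ob, dtype=float)
        total = np.zeros(num_sequences)
        for t in range(horizon):
            total += toy_reward(obs, actions[:, t])
            obs = toy_dynamics(obs, actions[:, t], bias)
        returns += total / len(ensemble_biases)
    # 3) Execute only the first action of the highest-scoring sequence (replan next step).
    return actions[np.argmax(returns), 0]

print(random_shooting_mpc(ob=2.0, ensemble_biases=[-0.05, 0.0, 0.05]))
```

The assignment's version pushes whole batches of observations and actions through the learned model rather than looping in pure NumPy (see the hints in `MPC_policy.py`), but the control flow is the same.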
28 | 29 | -------------------------------------------------------------------------------- /hw4/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | class BaseAgent(object): 2 | def __init__(self, **kwargs): 3 | super(BaseAgent, self).__init__(**kwargs) 4 | 5 | def train(self) -> dict: 6 | """Return a dictionary of logging information.""" 7 | raise NotImplementedError 8 | 9 | def add_to_replay_buffer(self, paths): 10 | raise NotImplementedError 11 | 12 | def sample(self, batch_size): 13 | raise NotImplementedError 14 | 15 | def save(self, path): 16 | raise NotImplementedError -------------------------------------------------------------------------------- /hw4/cs285/agents/mb_agent.py: -------------------------------------------------------------------------------- 1 | from .base_agent import BaseAgent 2 | from cs285.models.ff_model import FFModel 3 | from cs285.policies.MPC_policy import MPCPolicy 4 | from cs285.infrastructure.replay_buffer import ReplayBuffer 5 | from cs285.infrastructure.utils import * 6 | 7 | 8 | class MBAgent(BaseAgent): 9 | def __init__(self, env, agent_params): 10 | super(MBAgent, self).__init__() 11 | 12 | self.env = env.unwrapped 13 | self.agent_params = agent_params 14 | self.ensemble_size = self.agent_params['ensemble_size'] 15 | 16 | self.dyn_models = [] 17 | for i in range(self.ensemble_size): 18 | model = FFModel( 19 | self.agent_params['ac_dim'], 20 | self.agent_params['ob_dim'], 21 | self.agent_params['n_layers'], 22 | self.agent_params['size'], 23 | self.agent_params['learning_rate'], 24 | ) 25 | self.dyn_models.append(model) 26 | 27 | self.actor = MPCPolicy( 28 | self.env, 29 | ac_dim=self.agent_params['ac_dim'], 30 | dyn_models=self.dyn_models, 31 | horizon=self.agent_params['mpc_horizon'], 32 | N=self.agent_params['mpc_num_action_sequences'], 33 | ) 34 | 35 | self.replay_buffer = ReplayBuffer() 36 | 37 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 38 | 39 | # training a MB agent refers to updating the predictive model using observed state transitions 40 | # NOTE: each model in the ensemble is trained on a different random batch of size batch_size 41 | losses = [] 42 | num_data = ob_no.shape[0] 43 | num_data_per_ens = int(num_data / self.ensemble_size) 44 | 45 | for i in range(self.ensemble_size): 46 | 47 | # select which datapoints to use for this model of the ensemble 48 | # you might find the num_data_per_env variable defined above useful 49 | 50 | observations = # TODO(Q1) 51 | actions = # TODO(Q1) 52 | next_observations = # TODO(Q1) 53 | 54 | # use datapoints to update one of the dyn_models 55 | model = # TODO(Q1) 56 | log = model.update(observations, actions, next_observations, 57 | self.data_statistics) 58 | loss = log['Training Loss'] 59 | losses.append(loss) 60 | 61 | avg_loss = np.mean(losses) 62 | return { 63 | 'Training Loss': avg_loss, 64 | } 65 | 66 | def add_to_replay_buffer(self, paths, add_sl_noise=False): 67 | 68 | # add data to replay buffer 69 | self.replay_buffer.add_rollouts(paths, noised=add_sl_noise) 70 | 71 | # get updated mean/std of the data in our replay buffer 72 | self.data_statistics = { 73 | 'obs_mean': np.mean(self.replay_buffer.obs, axis=0), 74 | 'obs_std': np.std(self.replay_buffer.obs, axis=0), 75 | 'acs_mean': np.mean(self.replay_buffer.acs, axis=0), 76 | 'acs_std': np.std(self.replay_buffer.acs, axis=0), 77 | 'delta_mean': np.mean( 78 | self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0), 79 | 'delta_std': np.std( 80 | 
self.replay_buffer.next_obs - self.replay_buffer.obs, axis=0), 81 | } 82 | 83 | # update the actor's data_statistics too, so actor.get_action can be calculated correctly 84 | self.actor.data_statistics = self.data_statistics 85 | 86 | def sample(self, batch_size): 87 | # NOTE: sampling batch_size * ensemble_size, 88 | # so each model in our ensemble can get trained on batch_size data 89 | return self.replay_buffer.sample_random_data( 90 | batch_size * self.ensemble_size) 91 | -------------------------------------------------------------------------------- /hw4/cs285/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | def register_envs(): 4 | register( 5 | id='cheetah-cs285-v0', 6 | entry_point='cs285.envs.cheetah:HalfCheetahEnv', 7 | max_episode_steps=1000, 8 | ) 9 | register( 10 | id='obstacles-cs285-v0', 11 | entry_point='cs285.envs.obstacles:Obstacles', 12 | max_episode_steps=500, 13 | ) 14 | register( 15 | id='reacher-cs285-v0', 16 | entry_point='cs285.envs.reacher:Reacher7DOFEnv', 17 | max_episode_steps=500, 18 | ) 19 | -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs.cheetah.cheetah import HalfCheetahEnv 2 | -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import mujoco_py 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | 8 | def __init__(self): 9 | 10 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 11 | utils.EzPickle.__init__(self) 12 | 13 | self.skip = self.frame_skip 14 | 15 | self.action_dim = self.ac_dim = self.action_space.shape[0] 16 | self.observation_dim = self.obs_dim = self.observation_space.shape[0] 17 | 18 | def get_reward(self, observations, actions): 19 | 20 | """get reward/s of given (observations, actions) datapoint or datapoints 21 | 22 | Args: 23 | observations: (batchsize, obs_dim) or (obs_dim,) 24 | actions: (batchsize, ac_dim) or (ac_dim,) 25 | 26 | Return: 27 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 28 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 29 | """ 30 | 31 | #initialize and reshape as needed, for batch mode 32 | self.reward_dict = {} 33 | if(len(observations.shape)==1): 34 | observations = np.expand_dims(observations, axis = 0) 35 | actions = np.expand_dims(actions, axis = 0) 36 | batch_mode = False 37 | else: 38 | batch_mode = True 39 | 40 | #get vars 41 | xvel = observations[:, 9].copy() 42 | body_angle = observations[:, 2].copy() 43 | front_leg = observations[:, 6].copy() 44 | front_shin = observations[:, 7].copy() 45 | front_foot = observations[:, 8].copy() 46 | zeros = np.zeros((observations.shape[0],)).copy() 47 | 48 | # ranges 49 | leg_range = 0.2 50 | shin_range = 0 51 | foot_range = 0 52 | penalty_factor = 10 53 | 54 | #calc rew 55 | self.reward_dict['run'] = xvel 56 | 57 | front_leg_rew = zeros.copy() 58 | front_leg_rew[front_leg>leg_range] = -penalty_factor 59 | self.reward_dict['leg'] = front_leg_rew 60 | 61 | front_shin_rew = zeros.copy() 62 | front_shin_rew[front_shin>shin_range] = -penalty_factor 63 | self.reward_dict['shin'] = 
front_shin_rew 64 | 65 | front_foot_rew = zeros.copy() 66 | front_foot_rew[front_foot>foot_range] = -penalty_factor 67 | self.reward_dict['foot'] = front_foot_rew 68 | 69 | # total reward 70 | self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot'] 71 | 72 | #return 73 | dones = zeros.copy() 74 | if(not batch_mode): 75 | return self.reward_dict['r_total'][0], dones[0] 76 | return self.reward_dict['r_total'], dones 77 | 78 | 79 | def get_score(self, obs): 80 | xposafter = obs[0] 81 | return xposafter 82 | 83 | ############################################## 84 | 85 | def step(self, action): 86 | 87 | #step 88 | self.do_simulation(action, self.frame_skip) 89 | 90 | #obs/reward/done/score 91 | ob = self._get_obs() 92 | rew, done = self.get_reward(ob, action) 93 | score = self.get_score(ob) 94 | 95 | #return 96 | env_info = {'obs_dict': self.obs_dict, 97 | 'rewards': self.reward_dict, 98 | 'score': score} 99 | return ob, rew, done, env_info 100 | 101 | def _get_obs(self): 102 | 103 | self.obs_dict = {} 104 | self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy() 105 | self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy() 106 | self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy() 107 | 108 | return np.concatenate([ 109 | self.obs_dict['joints_pos'], #9 110 | self.obs_dict['joints_vel'], #9 111 | self.obs_dict['com_torso'], #3 112 | ]) 113 | 114 | ############################################## 115 | 116 | def reset_model(self, seed=None): 117 | 118 | # set reset pose/vel 119 | self.reset_pose = self.init_qpos + self.np_random.uniform( 120 | low=-.1, high=.1, size=self.model.nq) 121 | self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 122 | 123 | #reset the env to that pose/vel 124 | return self.do_reset(self.reset_pose.copy(), self.reset_vel.copy()) 125 | 126 | 127 | def do_reset(self, reset_pose, reset_vel, reset_goal=None): 128 | 129 | #reset 130 | self.set_state(reset_pose, reset_vel) 131 | 132 | #return 133 | return self._get_obs() 134 | -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs.obstacles.obstacles_env import Obstacles 2 | -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs.reacher.reacher_env import Reacher7DOFEnv 2 | -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/reacher_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | from mujoco_py import MjViewer 5 | import os 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | 10 | # placeholder 11 | self.hand_sid = -2 12 | self.target_sid = -1 13 | 14 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 15 | mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2) 16 | utils.EzPickle.__init__(self) 17 | self.observation_dim = 26 18 | self.action_dim = 7 19 | 20 | self.hand_sid = self.model.site_name2id("finger") 21 | self.target_sid = self.model.site_name2id("target") 22 | self.skip = self.frame_skip 23 | 24 | 25 | def _get_obs(self): 
26 | return np.concatenate([ 27 | self.data.qpos.flat, #[7] 28 | self.data.qvel.flatten() / 10., #[7] 29 | self.data.site_xpos[self.hand_sid], #[3] 30 | self.model.site_pos[self.target_sid], #[3] 31 | ]) 32 | 33 | def step(self, a): 34 | 35 | self.do_simulation(a, self.frame_skip) 36 | ob = self._get_obs() 37 | reward, done = self.get_reward(ob, a) 38 | 39 | score = self.get_score(ob) 40 | 41 | # finalize step 42 | env_info = {'ob': ob, 43 | 'rewards': self.reward_dict, 44 | 'score': score} 45 | 46 | return ob, reward, done, env_info 47 | 48 | def get_score(self, obs): 49 | hand_pos = obs[-6:-3] 50 | target_pos = obs[-3:] 51 | score = -1*np.abs(hand_pos-target_pos) 52 | return score 53 | 54 | def get_reward(self, observations, actions): 55 | 56 | """get reward/s of given (observations, actions) datapoint or datapoints 57 | 58 | Args: 59 | observations: (batchsize, obs_dim) or (obs_dim,) 60 | actions: (batchsize, ac_dim) or (ac_dim,) 61 | 62 | Return: 63 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 64 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 65 | """ 66 | 67 | #initialize and reshape as needed, for batch mode 68 | self.reward_dict = {} 69 | if(len(observations.shape)==1): 70 | observations = np.expand_dims(observations, axis = 0) 71 | actions = np.expand_dims(actions, axis = 0) 72 | batch_mode = False 73 | else: 74 | batch_mode = True 75 | 76 | #get vars 77 | hand_pos = observations[:, -6:-3] 78 | target_pos = observations[:, -3:] 79 | 80 | #calc rew 81 | dist = np.linalg.norm(hand_pos - target_pos, axis=1) 82 | self.reward_dict['r_total'] = -10*dist 83 | 84 | #done is always false for this env 85 | dones = np.zeros((observations.shape[0],)) 86 | 87 | #return 88 | if(not batch_mode): 89 | return self.reward_dict['r_total'][0], dones[0] 90 | return self.reward_dict['r_total'], dones 91 | 92 | def reset(self): 93 | _ = self.reset_model() 94 | 95 | self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1] 96 | 97 | observation, _reward, done, _info = self.step(np.zeros(7)) 98 | ob = self._get_obs() 99 | 100 | return ob 101 | 102 | def reset_model(self, seed=None): 103 | if seed is not None: 104 | self.seed(seed) 105 | 106 | self.reset_pose = self.init_qpos.copy() 107 | self.reset_vel = self.init_qvel.copy() 108 | 109 | self.reset_goal = np.zeros(3) 110 | self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3) 111 | self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2) 112 | self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25) 113 | 114 | return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal) 115 | 116 | def do_reset(self, reset_pose, reset_vel, reset_goal): 117 | 118 | self.set_state(reset_pose, reset_vel) 119 | 120 | #reset target 121 | self.reset_goal = reset_goal.copy() 122 | self.model.site_pos[self.target_sid] = self.reset_goal 123 | self.sim.forward() 124 | 125 | #return 126 | return self._get_obs() -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if 
len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 | ipythondisplay.display(HTML(data='''<video alt="test" autoplay 17 | loop controls style="height: 400px;"> 18 | <source src="data:video/mp4;base64,{0}" type="video/mp4" /> 19 | </video>'''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env 27 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0]<max_length: 44 | padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1)) 45 | videos[i] = np.concatenate([videos[i], padding], 0) 46 | 47 | # log videos to tensorboard event file 48 | videos = np.stack(videos[:max_videos_to_save], 0) 49 | self.log_video(videos, video_title, step, fps=fps) 50 | 51 | def log_figures(self, figure, name, step, phase): 52 | """figure: matplotlib.pyplot figure handle""" 53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!"
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | ): 28 | """ 29 | Builds a feedforward neural network 30 | arguments: 31 | input_placeholder: placeholder variable for the state (batch_size, input_size) 32 | scope: variable scope of the network 33 | n_layers: number of hidden layers 34 | size: dimension of each hidden layer 35 | activation: activation of each hidden layer 36 | input_size: size of the input layer 37 | output_size: size of the output layer 38 | output_activation: activation of the output layer 39 | returns: 40 | output_placeholder: the result of a forward pass through the hidden layers + the output layer 41 | """ 42 | if isinstance(activation, str): 43 | activation = _str_to_activation[activation] 44 | if isinstance(output_activation, str): 45 | output_activation = _str_to_activation[output_activation] 46 | layers = [] 47 | in_size = input_size 48 | for _ in range(n_layers): 49 | layers.append(nn.Linear(in_size, size)) 50 | layers.append(activation) 51 | in_size = size 52 | layers.append(nn.Linear(in_size, output_size)) 53 | layers.append(output_activation) 54 | return nn.Sequential(*layers) 55 | 56 | 57 | device = None 58 | 59 | 60 | def init_gpu(use_gpu=True, gpu_id=0): 61 | global device 62 | if torch.cuda.is_available() and use_gpu: 63 | device = torch.device("cuda:" + str(gpu_id)) 64 | print("Using GPU id {}".format(gpu_id)) 65 | else: 66 | device = torch.device("cpu") 67 | print("GPU not detected. 
Defaulting to CPU.") 68 | 69 | 70 | def set_device(gpu_id): 71 | torch.cuda.set_device(gpu_id) 72 | 73 | 74 | def from_numpy(*args, **kwargs): 75 | return torch.from_numpy(*args, **kwargs).float().to(device) 76 | 77 | 78 | def to_numpy(tensor): 79 | return tensor.to('cpu').detach().numpy() 80 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | self.paths = [] 10 | self.obs = None 11 | self.acs = None 12 | self.concatenated_rews = None 13 | self.next_obs = None 14 | self.terminals = None 15 | 16 | def add_rollouts(self, paths, noised=False): 17 | 18 | # add new rollouts into our list of rollouts 19 | for path in paths: 20 | self.paths.append(path) 21 | 22 | # convert new rollouts into their component arrays, and append them onto our arrays 23 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 24 | 25 | if noised: 26 | observations = add_noise(observations) 27 | next_observations = add_noise(next_observations) 28 | 29 | if self.obs is None: 30 | self.obs = observations[-self.max_size:] 31 | self.acs = actions[-self.max_size:] 32 | self.next_obs = next_observations[-self.max_size:] 33 | self.terminals = terminals[-self.max_size:] 34 | self.concatenated_rews = concatenated_rews[-self.max_size:] 35 | else: 36 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 37 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 38 | self.next_obs = np.concatenate( 39 | [self.next_obs, next_observations] 40 | )[-self.max_size:] 41 | self.terminals = np.concatenate( 42 | [self.terminals, terminals] 43 | )[-self.max_size:] 44 | self.concatenated_rews = np.concatenate( 45 | [self.concatenated_rews, concatenated_rews] 46 | )[-self.max_size:] 47 | 48 | ######################################## 49 | ######################################## 50 | 51 | def sample_random_rollouts(self, num_rollouts): 52 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 53 | return self.paths[rand_indices] 54 | 55 | def sample_recent_rollouts(self, num_rollouts=1): 56 | return self.paths[-num_rollouts:] 57 | 58 | ######################################## 59 | ######################################## 60 | 61 | def sample_random_data(self, batch_size): 62 | 63 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 64 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 65 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 66 | 67 | def sample_recent_data(self, batch_size=1, concat_rew=True): 68 | 69 | if concat_rew: 70 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 71 | else: 72 | num_recent_rollouts_to_return = 0 73 | num_datapoints_so_far = 0 74 | index = -1 75 | while num_datapoints_so_far < batch_size: 76 | recent_rollout = self.paths[index] 77 | index -=1 78 | num_recent_rollouts_to_return +=1 79 | num_datapoints_so_far += get_pathlength(recent_rollout) 80 | rollouts_to_return = 
self.paths[-num_recent_rollouts_to_return:] 81 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 82 | return observations, actions, unconcatenated_rews, next_observations, terminals 83 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import copy 4 | 5 | ############################################ 6 | ############################################ 7 | 8 | def calculate_mean_prediction_error(env, action_sequence, models, data_statistics): 9 | 10 | model = models[0] 11 | 12 | # true 13 | true_states = perform_actions(env, action_sequence)['observation'] 14 | 15 | # predicted 16 | ob = np.expand_dims(true_states[0],0) 17 | pred_states = [] 18 | for ac in action_sequence: 19 | pred_states.append(ob) 20 | action = np.expand_dims(ac,0) 21 | ob = model.get_prediction(ob, action, data_statistics) 22 | pred_states = np.squeeze(pred_states) 23 | 24 | # mpe 25 | mpe = mean_squared_error(pred_states, true_states) 26 | 27 | return mpe, true_states, pred_states 28 | 29 | def perform_actions(env, actions): 30 | ob = env.reset() 31 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 32 | steps = 0 33 | for ac in actions: 34 | obs.append(ob) 35 | acs.append(ac) 36 | ob, rew, done, _ = env.step(ac) 37 | # add the observation after taking a step to next_obs 38 | next_obs.append(ob) 39 | rewards.append(rew) 40 | steps += 1 41 | # If the episode ended, the corresponding terminal value is 1 42 | # otherwise, it is 0 43 | if done: 44 | terminals.append(1) 45 | break 46 | else: 47 | terminals.append(0) 48 | 49 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 50 | 51 | def mean_squared_error(a, b): 52 | return np.mean((a-b)**2) 53 | 54 | ############################################ 55 | ############################################ 56 | 57 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 58 | # TODO: get this from Piazza 59 | 60 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 61 | """ 62 | Collect rollouts using policy 63 | until we have collected min_timesteps_per_batch steps 64 | """ 65 | # TODO: get this from Piazza 66 | 67 | return paths, timesteps_this_batch 68 | 69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 70 | """ 71 | Collect ntraj rollouts using policy 72 | """ 73 | # TODO: get this from Piazza 74 | 75 | return paths 76 | 77 | ############################################ 78 | ############################################ 79 | 80 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 81 | """ 82 | Take info (separate arrays) from a single rollout 83 | and return it in a single dictionary 84 | """ 85 | if image_obs != []: 86 | image_obs = np.stack(image_obs, axis=0) 87 | return {"observation" : np.array(obs, dtype=np.float32), 88 | "image_obs" : np.array(image_obs, dtype=np.uint8), 89 | "reward" : np.array(rewards, dtype=np.float32), 90 | "action" : np.array(acs, dtype=np.float32), 91 | "next_observation": np.array(next_obs, dtype=np.float32), 92 | "terminal": np.array(terminals, dtype=np.float32)} 93 | 94 | 95 | def convert_listofrollouts(paths): 96 | """ 97 | Take a list of rollout dictionaries 98 | 
and return separate arrays, 99 | where each array is a concatenation of that array from across the rollouts 100 | """ 101 | observations = np.concatenate([path["observation"] for path in paths]) 102 | actions = np.concatenate([path["action"] for path in paths]) 103 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 104 | terminals = np.concatenate([path["terminal"] for path in paths]) 105 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 106 | unconcatenated_rewards = [path["reward"] for path in paths] 107 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 108 | 109 | ############################################ 110 | ############################################ 111 | 112 | def get_pathlength(path): 113 | return len(path["reward"]) 114 | 115 | def normalize(data, mean, std, eps=1e-8): 116 | return (data-mean)/(std+eps) 117 | 118 | def unnormalize(data, mean, std): 119 | return data*std+mean 120 | 121 | def add_noise(data_inp, noiseToSignal=0.01): 122 | 123 | data = copy.deepcopy(data_inp) #(num data points, dim) 124 | 125 | #mean of data 126 | mean_data = np.mean(data, axis=0) 127 | 128 | #if mean is 0, 129 | #make it 0.001 to avoid 0 issues later for dividing by std 130 | mean_data[mean_data == 0] = 0.000001 131 | 132 | #width of normal distribution to sample noise from 133 | #larger magnitude number = could have larger magnitude noise 134 | std_of_noise = mean_data * noiseToSignal 135 | for j in range(mean_data.shape[0]): 136 | data[:, j] = np.copy(data[:, j] + np.random.normal( 137 | 0, np.absolute(std_of_noise[j]), (data.shape[0],))) 138 | 139 | return data 140 | -------------------------------------------------------------------------------- /hw4/cs285/models/base_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import Any 3 | 4 | 5 | Prediction = Any 6 | 7 | 8 | class BaseModel(object): 9 | def update(self, ob_no, next_ob_no, re_n, terminal_n) -> dict: 10 | raise NotImplementedError 11 | 12 | def get_prediction(self, ob_no, ac_na, data_statistics) -> Prediction: 13 | raise NotImplementedError 14 | 15 | def convert_prediction_to_numpy(self, pred: Prediction) -> np.ndarray: 16 | """Allow caller to be pytorch-agnostic.""" 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /hw4/cs285/policies/MPC_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .base_policy import BasePolicy 4 | 5 | 6 | class MPCPolicy(BasePolicy): 7 | 8 | def __init__(self, 9 | env, 10 | ac_dim, 11 | dyn_models, 12 | horizon, 13 | N, 14 | **kwargs 15 | ): 16 | super().__init__(**kwargs) 17 | 18 | # init vars 19 | self.env = env 20 | self.dyn_models = dyn_models 21 | self.horizon = horizon 22 | self.N = N 23 | self.data_statistics = None # NOTE must be updated from elsewhere 24 | 25 | self.ob_dim = self.env.observation_space.shape[0] 26 | 27 | # action space 28 | self.ac_space = self.env.action_space 29 | self.ac_dim = ac_dim 30 | self.low = self.ac_space.low 31 | self.high = self.ac_space.high 32 | 33 | def sample_action_sequences(self, num_sequences, horizon): 34 | # TODO(Q1) uniformly sample trajectories and return an array of 35 | # dimensions (num_sequences, horizon, self.ac_dim) in the range 36 | # [self.low, self.high] 37 | return random_action_sequences 38 | 39 | def 
get_action(self, obs): 40 | 41 | if self.data_statistics is None: 42 | # print("WARNING: performing random actions.") 43 | return self.sample_action_sequences(num_sequences=1, horizon=1)[0] 44 | 45 | # sample random actions (N x horizon) 46 | candidate_action_sequences = self.sample_action_sequences( 47 | num_sequences=self.N, horizon=self.horizon) 48 | 49 | # for each model in ensemble: 50 | predicted_sum_of_rewards_per_model = [] 51 | for model in self.dyn_models: 52 | sum_of_rewards = self.calculate_sum_of_rewards( 53 | obs, candidate_action_sequences, model) 54 | predicted_sum_of_rewards_per_model.append(sum_of_rewards) 55 | 56 | # calculate mean_across_ensembles(predicted rewards) 57 | predicted_rewards = np.mean( 58 | predicted_sum_of_rewards_per_model, axis=0) # [ens, N] --> N 59 | 60 | # pick the action sequence and return the 1st element of that sequence 61 | best_action_sequence = None # TODO (Q2) 62 | action_to_take = None # TODO (Q2) 63 | return action_to_take[None] # Unsqueeze the first index 64 | 65 | def calculate_sum_of_rewards(self, obs, candidate_action_sequences, model): 66 | """ 67 | 68 | :param obs: numpy array with the current observation. Shape [D_obs] 69 | :param candidate_action_sequences: numpy array with the candidate action 70 | sequences. Shape [N, H, D_action] where 71 | - N is the number of action sequences considered 72 | - H is the horizon 73 | - D_action is the dimension of the action 74 | :param model: The current dynamics model. 75 | :return: numpy array with the sum of rewards for each action sequence. 76 | The array should have shape [N]. 77 | """ 78 | sum_of_rewards = None # TODO (Q2) 79 | # For each candidate action sequence, predict a sequence of 80 | # states for each dynamics model in your ensemble. 81 | # Once you have a sequence of predicted states from each model in 82 | # your ensemble, calculate the sum of rewards for each sequence 83 | # using `self.env.get_reward(predicted_obs)` 84 | # You should sum across `self.horizon` time steps. 85 | # Hint: you should use model.get_prediction and you shouldn't need 86 | # to import pytorch in this file. 87 | # Hint: Remember that the model can process observations and actions 88 | # in batch, which can be much faster than looping through each 89 | # action sequence. 90 | return sum_of_rewards 91 | -------------------------------------------------------------------------------- /hw4/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw4/cs285/scripts/filter_events.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Usage: 4 | 5 | Run the command 6 | ``` 7 | python filter_events.py --event PATH/TO/EVENT_FILE 8 | ``` 9 | 10 | and it will write the events to a directory named after the event file's 11 | directory with `_filtered` appended, with the video events removed. 
12 | """ 13 | from __future__ import print_function 14 | import os 15 | import sys 16 | import argparse 17 | import tqdm 18 | 19 | # Adapted from 20 | # https://gist.github.com/serycjon/c9ad58ecc3176d87c49b69b598f4d6c6 21 | 22 | import tensorflow as tf 23 | 24 | 25 | def parse_arguments(): 26 | parser = argparse.ArgumentParser(description='') 27 | parser.add_argument('--event', help='event file', required=True) 28 | 29 | return parser.parse_args() 30 | 31 | 32 | def main(args): 33 | out_path = os.path.dirname(args.event) + '_filtered' 34 | writer = tf.summary.FileWriter(out_path) 35 | 36 | total = None 37 | for event in tqdm.tqdm(tf.train.summary_iterator(args.event), total=total): 38 | event_type = event.WhichOneof('what') 39 | if event_type != 'summary': 40 | writer.add_event(event) 41 | else: 42 | wall_time = event.wall_time 43 | step = event.step 44 | filtered_values = [value for value in event.summary.value if 45 | 'rollouts' not in value.tag] 46 | summary = tf.Summary(value=filtered_values) 47 | 48 | filtered_event = tf.summary.Event(summary=summary, 49 | wall_time=wall_time, 50 | step=step) 51 | writer.add_event(filtered_event) 52 | writer.close() 53 | return 0 54 | 55 | 56 | if __name__ == '__main__': 57 | args = parse_arguments() 58 | sys.exit(main(args)) 59 | -------------------------------------------------------------------------------- /hw4/cs285/scripts/read_results.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import tensorflow as tf 3 | 4 | def get_section_results(file): 5 | """ 6 | requires tensorflow==1.12.0 7 | """ 8 | X = [] 9 | Y = [] 10 | for e in tf.train.summary_iterator(file): 11 | for v in e.summary.value: 12 | if v.tag == 'Train_EnvstepsSoFar': 13 | X.append(v.simple_value) 14 | elif v.tag == 'Eval_AverageReturn': 15 | Y.append(v.simple_value) 16 | return X, Y 17 | 18 | if __name__ == '__main__': 19 | import glob 20 | 21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' 22 | eventfile = glob.glob(logdir)[0] 23 | 24 | X, Y = get_section_results(eventfile) 25 | for i, (x, y) in enumerate(zip(X, Y)): 26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) -------------------------------------------------------------------------------- /hw4/cs285/scripts/run_hw4_mb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.infrastructure.rl_trainer import RL_Trainer 5 | from cs285.agents.mb_agent import MBAgent 6 | 7 | 8 | class MB_Trainer(object): 9 | 10 | def __init__(self, params): 11 | 12 | ##################### 13 | ## SET AGENT PARAMS 14 | ##################### 15 | 16 | computation_graph_args = { 17 | 'ensemble_size': params['ensemble_size'], 18 | 'n_layers': params['n_layers'], 19 | 'size': params['size'], 20 | 'learning_rate': params['learning_rate'], 21 | } 22 | 23 | train_args = { 24 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 25 | } 26 | 27 | controller_args = { 28 | 'mpc_horizon': params['mpc_horizon'], 29 | 'mpc_num_action_sequences': params['mpc_num_action_sequences'], 30 | } 31 | 32 | agent_params = {**computation_graph_args, **train_args, **controller_args} 33 | 34 | self.params = params 35 | self.params['agent_class'] = MBAgent 36 | self.params['agent_params'] = agent_params 37 | 38 | ################ 39 | ## RL TRAINER 40 | ################ 41 | 42 | self.rl_trainer = RL_Trainer(self.params) 43 | 44 | def run_training_loop(self): 45 | 
46 | self.rl_trainer.run_training_loop( 47 | self.params['n_iter'], 48 | collect_policy = self.rl_trainer.agent.actor, 49 | eval_policy = self.rl_trainer.agent.actor, 50 | ) 51 | 52 | 53 | def main(): 54 | 55 | import argparse 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('--env_name', type=str) #reacher-cs285-v0, ant-cs285-v0, cheetah-cs285-v0, obstacles-cs285-v0 58 | parser.add_argument('--ep_len', type=int, default=200) 59 | parser.add_argument('--exp_name', type=str, default='todo') 60 | parser.add_argument('--n_iter', '-n', type=int, default=20) 61 | 62 | parser.add_argument('--ensemble_size', '-e', type=int, default=3) 63 | parser.add_argument('--mpc_horizon', type=int, default=10) 64 | parser.add_argument('--mpc_num_action_sequences', type=int, default=1000) 65 | 66 | parser.add_argument('--add_sl_noise', '-noise', action='store_true') 67 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1000) 68 | parser.add_argument('--batch_size_initial', type=int, default=20000) #(random) steps collected on 1st iteration (put into replay buffer) 69 | parser.add_argument('--batch_size', '-b', type=int, default=8000) #steps collected per train iteration (put into replay buffer) 70 | parser.add_argument('--train_batch_size', '-tb', type=int, default=512) ##steps used per gradient step (used for training) 71 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 72 | 73 | parser.add_argument('--learning_rate', '-lr', type=float, default=0.001) 74 | parser.add_argument('--n_layers', '-l', type=int, default=2) 75 | parser.add_argument('--size', '-s', type=int, default=250) 76 | 77 | parser.add_argument('--seed', type=int, default=1) 78 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 79 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 80 | parser.add_argument('--video_log_freq', type=int, default=1) #-1 to disable 81 | parser.add_argument('--scalar_log_freq', type=int, default=1) #-1 to disable 82 | parser.add_argument('--save_params', action='store_true') 83 | args = parser.parse_args() 84 | 85 | # convert to dictionary 86 | params = vars(args) 87 | 88 | # HARDCODE EPISODE LENGTHS FOR THE ENVS USED IN THIS MB ASSIGNMENT 89 | if params['env_name']=='reacher-cs285-v0': 90 | params['ep_len']=200 91 | if params['env_name']=='cheetah-cs285-v0': 92 | params['ep_len']=500 93 | if params['env_name']=='obstacles-cs285-v0': 94 | params['ep_len']=100 95 | 96 | ################################## 97 | ### CREATE DIRECTORY FOR LOGGING 98 | ################################## 99 | 100 | logdir_prefix = 'hw4_' # keep for autograder 101 | 102 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') 103 | 104 | if not (os.path.exists(data_path)): 105 | os.makedirs(data_path) 106 | 107 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 108 | logdir = os.path.join(data_path, logdir) 109 | params['logdir'] = logdir 110 | if not(os.path.exists(logdir)): 111 | os.makedirs(logdir) 112 | 113 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 114 | 115 | ################### 116 | ### RUN TRAINING 117 | ################### 118 | 119 | trainer = MB_Trainer(params) 120 | trainer.run_training_loop() 121 | 122 | 123 | if __name__ == "__main__": 124 | main() 125 | -------------------------------------------------------------------------------- /hw4/cs285_hw4.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw4/cs285_hw4.pdf -------------------------------------------------------------------------------- /hw4/requirements.txt: -------------------------------------------------------------------------------- 1 | gym[atari]==0.17.2 2 | mujoco-py==2.0.2.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | torch==1.5.1 10 | opencv-python==4.4.0.42 11 | ipdb==0.13.3 12 | box2d-py 13 | -------------------------------------------------------------------------------- /hw4/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | gym[atari]==0.17.2 2 | tensorboard==2.3.0 3 | tensorboardX==1.8 4 | matplotlib==2.2.2 5 | ipython==6.4.0 6 | moviepy==1.0.0 7 | pyvirtualdisplay==1.3.2 8 | torch==1.5.1 9 | opencv-python==4.4.0.42 10 | ipdb==0.13.3 11 | box2d-py 12 | -------------------------------------------------------------------------------- /hw4/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw5/README.md: -------------------------------------------------------------------------------- 1 | ## Setup 2 | 3 | You can run this code on your own machine or on Google Colab. 4 | 5 | 1. **Local option:** If you choose to run locally, you will need to install MuJoCo and some Python packages; see [installation.md](../hw1/installation.md) from homework 1 for instructions. There are two new package requirements (`opencv-python` and `gym[atari]`) beyond what was used in the previous assignments; make sure to install these with `pip install -r requirements.txt` if you are running the assignment locally. 6 | 7 | 2. **Colab:** The first few sections of the notebook will install all required dependencies. You can try out the Colab option by clicking the badges below: 8 | 9 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/berkeleydeeprlcourse/homework_fall2020/blob/master/hw5/run_hw5_expl.ipynb) 10 | 11 | ## Complete the code 12 | 13 | The following files have blanks to be filled with your solutions from homework 1 and 3. The relevant sections are marked with `TODO: get this from Piazza'. 14 | 15 | - [infrastructure/utils.py](cs285/infrastructure/utils.py) 16 | - [infrastructure/rl_trainer.py](cs285/infrastructure/rl_trainer.py) 17 | - [policies/MLP_policy.py](cs285/policies/MLP_policy.py) 18 | - [policies/argmax_policy.py](cs285/policies/argmax_policy.py) 19 | - [critics/dqn_critic.py](cs285/critics/dqn_critic.py) 20 | 21 | You will then need to implement code in the following files: 22 | - [exploration/rnd_model.py](cs285/exploration/rnd_model.py) 23 | - [agents/explore_or_exploit_agent.py](cs285/agents/explore_or_exploit_agent.py) 24 | - [critics/cql_critic.py](cs285/critics/cql_critic.py) 25 | 26 | The relevant sections are marked with `TODO`. 
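For the offline-RL portion in `critics/cql_critic.py`, the extra term added to the standard TD loss is, per batch, roughly `alpha * (logsumexp_a Q(s, a) - Q(s, a_data))`, as described in the conservative Q-learning (CQL) paper. The snippet below is an editor's minimal PyTorch sketch of just that penalty for a discrete-action Q-network; `cql_penalty` and its tensor names are invented for the example, the `0.1` alpha is an arbitrary placeholder (the assignment reads it from `hparams['cql_alpha']`), and this is not the assignment's code.

```python
import torch

def cql_penalty(qa_values, data_actions, cql_alpha=0.1):
    """Conservative Q-learning regularizer for a discrete-action Q-network.

    qa_values:    (batch, num_actions) tensor of Q(s, a) for every action
    data_actions: (batch,) long tensor of the actions actually taken in the data
    """
    # Soft maximum over all actions: pushes Q-values down everywhere...
    q_logsumexp = torch.logsumexp(qa_values, dim=1)
    # ...while the Q-values of in-distribution (dataset) actions are pushed back up.
    q_data = torch.gather(qa_values, 1, data_actions.unsqueeze(1)).squeeze(1)
    return cql_alpha * (q_logsumexp - q_data).mean()

# Tiny usage example: a batch of 4 states with 3 discrete actions.
qa = torch.randn(4, 3)
acs = torch.tensor([0, 2, 1, 0])
print(cql_penalty(qa, acs))
```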
27 | 28 | You may also want to look through [scripts/run_hw5_expl.py](cs285/scripts/run_hw5_expl.py) (if running locally) or [run_hw5_expl.ipynb](run_hw5_expl.ipynb) (if running on Colab), though you will not need to edit this files beyond changing runtime arguments in the Colab notebook. 29 | 30 | See the [assignment PDF](hw5.pdf) for more details on what files to edit. 31 | 32 | For this particular assignment, you will need to install networkx==2.5 33 | -------------------------------------------------------------------------------- /hw5/cs285/agents/ac_agent.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from cs285.critics.bootstrapped_continuous_critic import \ 4 | BootstrappedContinuousCritic 5 | from cs285.infrastructure.replay_buffer import ReplayBuffer 6 | from cs285.infrastructure.utils import * 7 | from cs285.policies.MLP_policy import MLPPolicyAC 8 | from .base_agent import BaseAgent 9 | 10 | 11 | class ACAgent(BaseAgent): 12 | def __init__(self, env, agent_params): 13 | super(ACAgent, self).__init__() 14 | 15 | self.env = env 16 | self.agent_params = agent_params 17 | 18 | self.gamma = self.agent_params['gamma'] 19 | self.standardize_advantages = self.agent_params['standardize_advantages'] 20 | 21 | self.actor = MLPPolicyAC( 22 | self.agent_params['ac_dim'], 23 | self.agent_params['ob_dim'], 24 | self.agent_params['n_layers'], 25 | self.agent_params['size'], 26 | self.agent_params['discrete'], 27 | self.agent_params['learning_rate'], 28 | ) 29 | self.critic = BootstrappedContinuousCritic(self.agent_params) 30 | 31 | self.replay_buffer = ReplayBuffer() 32 | 33 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 34 | raise NotImplementedError 35 | # Not needed for this homework 36 | 37 | #################################### 38 | #################################### 39 | 40 | def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): 41 | raise NotImplementedError 42 | # Not needed for this homework 43 | 44 | #################################### 45 | #################################### 46 | -------------------------------------------------------------------------------- /hw5/cs285/agents/base_agent.py: -------------------------------------------------------------------------------- 1 | class BaseAgent(object): 2 | def __init__(self, **kwargs): 3 | super(BaseAgent, self).__init__(**kwargs) 4 | 5 | def train(self) -> dict: 6 | """Return a dictionary of logging information.""" 7 | raise NotImplementedError 8 | 9 | def add_to_replay_buffer(self, paths): 10 | raise NotImplementedError 11 | 12 | def sample(self, batch_size): 13 | raise NotImplementedError 14 | 15 | def save(self, path): 16 | raise NotImplementedError -------------------------------------------------------------------------------- /hw5/cs285/agents/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pdb 3 | 4 | from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule 5 | from cs285.policies.argmax_policy import ArgMaxPolicy 6 | from cs285.critics.dqn_critic import DQNCritic 7 | 8 | 9 | class DQNAgent(object): 10 | def __init__(self, env, agent_params): 11 | 12 | self.env = env 13 | self.agent_params = agent_params 14 | self.batch_size = agent_params['batch_size'] 15 | # import ipdb; ipdb.set_trace() 16 | self.last_obs = self.env.reset() 17 | 18 | self.num_actions = agent_params['ac_dim'] 19 | 
self.learning_starts = agent_params['learning_starts'] 20 | self.learning_freq = agent_params['learning_freq'] 21 | self.target_update_freq = agent_params['target_update_freq'] 22 | 23 | self.replay_buffer_idx = None 24 | self.exploration = agent_params['exploration_schedule'] 25 | self.optimizer_spec = agent_params['optimizer_spec'] 26 | 27 | self.critic = DQNCritic(agent_params, self.optimizer_spec) 28 | self.actor = ArgMaxPolicy(self.critic) 29 | 30 | lander = agent_params['env_name'].startswith('LunarLander') 31 | self.replay_buffer = MemoryOptimizedReplayBuffer( 32 | agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander) 33 | self.t = 0 34 | self.num_param_updates = 0 35 | 36 | def add_to_replay_buffer(self, paths): 37 | pass 38 | 39 | def step_env(self): 40 | """ 41 | Step the env and store the transition 42 | At the end of this block of code, the simulator should have been 43 | advanced one step, and the replay buffer should contain one more transition. 44 | Note that self.last_obs must always point to the new latest observation. 45 | """ 46 | raise NotImplementedError 47 | # Not needed for this homework 48 | 49 | #################################### 50 | #################################### 51 | 52 | def sample(self, batch_size): 53 | if self.replay_buffer.can_sample(self.batch_size): 54 | return self.replay_buffer.sample(batch_size) 55 | else: 56 | return [],[],[],[],[] 57 | 58 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 59 | raise NotImplementedError 60 | # Not needed for this homework 61 | 62 | #################################### 63 | #################################### -------------------------------------------------------------------------------- /hw5/cs285/critics/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /hw5/cs285/critics/base_critic.py: -------------------------------------------------------------------------------- 1 | class BaseCritic(object): 2 | def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n): 3 | raise NotImplementedError 4 | -------------------------------------------------------------------------------- /hw5/cs285/critics/bootstrapped_continuous_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | from torch import nn 3 | from torch import optim 4 | import pdb 5 | 6 | from cs285.infrastructure import pytorch_util as ptu 7 | 8 | 9 | class BootstrappedContinuousCritic(nn.Module, BaseCritic): 10 | """ 11 | Notes on notation: 12 | 13 | Prefixes and suffixes: 14 | ob - observation 15 | ac - action 16 | _no - this tensor should have shape (batch self.size /n/, observation dim) 17 | _na - this tensor should have shape (batch self.size /n/, action dim) 18 | _n - this tensor should have shape (batch self.size /n/) 19 | 20 | Note: batch self.size /n/ is defined at runtime. 
21 | is None 22 | """ 23 | def __init__(self, hparams): 24 | super().__init__() 25 | self.ob_dim = hparams['ob_dim'] 26 | self.ac_dim = hparams['ac_dim'] 27 | self.discrete = hparams['discrete'] 28 | self.size = hparams['size'] 29 | self.n_layers = hparams['n_layers'] 30 | self.learning_rate = hparams['learning_rate'] 31 | 32 | # critic parameters 33 | self.num_target_updates = hparams['num_target_updates'] 34 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] 35 | self.gamma = hparams['gamma'] 36 | self.critic_network = ptu.build_mlp( 37 | self.ob_dim, 38 | 1, 39 | n_layers=self.n_layers, 40 | size=self.size, 41 | ) 42 | self.critic_network.to(ptu.device) 43 | self.loss = nn.MSELoss() 44 | self.optimizer = optim.Adam( 45 | self.critic_network.parameters(), 46 | self.learning_rate, 47 | ) 48 | 49 | def forward(self, obs): 50 | return self.critic_network(obs).squeeze(1) 51 | 52 | def forward_np(self, obs): 53 | obs = ptu.from_numpy(obs) 54 | predictions = self(obs) 55 | return ptu.to_numpy(predictions) 56 | 57 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 58 | """ 59 | Update the parameters of the critic. 60 | 61 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 62 | Agent.sample_trajectories 63 | let num_paths be the number of paths sampled from Agent.sample_trajectories 64 | 65 | arguments: 66 | ob_no: shape: (sum_of_path_lengths, ob_dim) 67 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 68 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 69 | the reward for each timestep 70 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 71 | at that timestep of 0 if the episode did not end 72 | 73 | returns: 74 | nothing 75 | """ 76 | raise NotImplementedError 77 | # Not needed for this homework 78 | 79 | #################################### 80 | #################################### 81 | -------------------------------------------------------------------------------- /hw5/cs285/critics/cql_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | import torch 3 | import torch.optim as optim 4 | from torch.nn import utils 5 | from torch import nn 6 | import pdb 7 | 8 | from cs285.infrastructure import pytorch_util as ptu 9 | 10 | 11 | class CQLCritic(BaseCritic): 12 | 13 | def __init__(self, hparams, optimizer_spec, **kwargs): 14 | super().__init__(**kwargs) 15 | self.env_name = hparams['env_name'] 16 | self.ob_dim = hparams['ob_dim'] 17 | 18 | if isinstance(self.ob_dim, int): 19 | self.input_shape = (self.ob_dim,) 20 | else: 21 | self.input_shape = hparams['input_shape'] 22 | 23 | self.ac_dim = hparams['ac_dim'] 24 | self.double_q = hparams['double_q'] 25 | self.grad_norm_clipping = hparams['grad_norm_clipping'] 26 | self.gamma = hparams['gamma'] 27 | 28 | self.optimizer_spec = optimizer_spec 29 | network_initializer = hparams['q_func'] 30 | self.q_net = network_initializer(self.ob_dim, self.ac_dim) 31 | self.q_net_target = network_initializer(self.ob_dim, self.ac_dim) 32 | self.optimizer = self.optimizer_spec.constructor( 33 | self.q_net.parameters(), 34 | **self.optimizer_spec.optim_kwargs 35 | ) 36 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( 37 | self.optimizer, 38 | self.optimizer_spec.learning_rate_schedule, 39 | ) 40 | self.loss = nn.MSELoss() 41 | self.q_net.to(ptu.device) 42 
| self.q_net_target.to(ptu.device) 43 | self.cql_alpha = hparams['cql_alpha'] 44 | 45 | def dqn_loss(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 46 | qa_t_values = self.q_net(ob_no) 47 | q_t_values = torch.gather(qa_t_values, 1, ac_na.unsqueeze(1)).squeeze(1) 48 | qa_tp1_values = self.q_net_target(next_ob_no) 49 | 50 | next_actions = self.q_net(next_ob_no).argmax(dim=1) 51 | q_tp1 = torch.gather(qa_tp1_values, 1, next_actions.unsqueeze(1)).squeeze(1) 52 | 53 | target = reward_n + self.gamma * q_tp1 * (1 - terminal_n) 54 | target = target.detach() 55 | loss = self.loss(q_t_values, target) 56 | 57 | return loss, qa_t_values, q_t_values 58 | 59 | 60 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 61 | """ 62 | Update the parameters of the critic. 63 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 64 | Agent.sample_trajectories 65 | let num_paths be the number of paths sampled from Agent.sample_trajectories 66 | arguments: 67 | ob_no: shape: (sum_of_path_lengths, ob_dim) 68 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 69 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 70 | the reward for each timestep 71 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 72 | at that timestep of 0 if the episode did not end 73 | returns: 74 | nothing 75 | """ 76 | ob_no = ptu.from_numpy(ob_no) 77 | ac_na = ptu.from_numpy(ac_na).to(torch.long) 78 | next_ob_no = ptu.from_numpy(next_ob_no) 79 | reward_n = ptu.from_numpy(reward_n) 80 | terminal_n = ptu.from_numpy(terminal_n) 81 | 82 | loss, qa_t_values, q_t_values = self.dqn_loss( 83 | ob_no, ac_na, next_ob_no, reward_n, terminal_n 84 | ) 85 | 86 | # CQL Implementation 87 | # TODO: Implement CQL as described in the pdf and paper 88 | # Hint: After calculating cql_loss, augment the loss appropriately 89 | cql_loss = None 90 | 91 | self.optimizer.zero_grad() 92 | loss.backward() 93 | self.optimizer.step() 94 | 95 | info = {'Training Loss': ptu.to_numpy(loss)} 96 | 97 | # TODO: Uncomment these lines after implementing CQL 98 | # info['CQL Loss'] = ptu.to_numpy(cql_loss) 99 | # info['Data q-values'] = ptu.to_numpy(q_t_values).mean() 100 | # info['OOD q-values'] = ptu.to_numpy(q_t_logsumexp).mean() 101 | 102 | return info 103 | 104 | 105 | def update_target_network(self): 106 | for target_param, param in zip( 107 | self.q_net_target.parameters(), self.q_net.parameters() 108 | ): 109 | target_param.data.copy_(param.data) 110 | 111 | def qa_values(self, obs): 112 | obs = ptu.from_numpy(obs) 113 | qa_values = self.q_net(obs) 114 | return ptu.to_numpy(qa_values) 115 | -------------------------------------------------------------------------------- /hw5/cs285/critics/dqn_critic.py: -------------------------------------------------------------------------------- 1 | from .base_critic import BaseCritic 2 | import torch 3 | import torch.optim as optim 4 | from torch.nn import utils 5 | from torch import nn 6 | import pdb 7 | 8 | from cs285.infrastructure import pytorch_util as ptu 9 | 10 | 11 | class DQNCritic(BaseCritic): 12 | 13 | def __init__(self, hparams, optimizer_spec, **kwargs): 14 | super().__init__(**kwargs) 15 | self.env_name = hparams['env_name'] 16 | self.ob_dim = hparams['ob_dim'] 17 | 18 | if isinstance(self.ob_dim, int): 19 | self.input_shape = (self.ob_dim,) 20 | else: 21 | self.input_shape = hparams['input_shape'] 22 | 23 | self.ac_dim = hparams['ac_dim'] 
24 | self.double_q = hparams['double_q'] 25 | self.grad_norm_clipping = hparams['grad_norm_clipping'] 26 | self.gamma = hparams['gamma'] 27 | 28 | self.optimizer_spec = optimizer_spec 29 | network_initializer = hparams['q_func'] 30 | self.q_net = network_initializer(self.ob_dim, self.ac_dim) 31 | self.q_net_target = network_initializer(self.ob_dim, self.ac_dim) 32 | self.optimizer = self.optimizer_spec.constructor( 33 | self.q_net.parameters(), 34 | **self.optimizer_spec.optim_kwargs 35 | ) 36 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( 37 | self.optimizer, 38 | self.optimizer_spec.learning_rate_schedule, 39 | ) 40 | self.loss = nn.SmoothL1Loss() # AKA Huber loss 41 | self.q_net.to(ptu.device) 42 | self.q_net_target.to(ptu.device) 43 | 44 | def update(self, ob_no, ac_na, next_ob_no, reward_n, terminal_n): 45 | """ 46 | Update the parameters of the critic. 47 | let sum_of_path_lengths be the sum of the lengths of the paths sampled from 48 | Agent.sample_trajectories 49 | let num_paths be the number of paths sampled from Agent.sample_trajectories 50 | arguments: 51 | ob_no: shape: (sum_of_path_lengths, ob_dim) 52 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 53 | reward_n: length: sum_of_path_lengths. Each element in reward_n is a scalar containing 54 | the reward for each timestep 55 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 56 | at that timestep of 0 if the episode did not end 57 | returns: 58 | nothing 59 | """ 60 | raise NotImplementedError 61 | # TODO: Get this from homework 3 62 | 63 | #################################### 64 | #################################### 65 | 66 | def update_target_network(self): 67 | for target_param, param in zip( 68 | self.q_net_target.parameters(), self.q_net.parameters() 69 | ): 70 | target_param.data.copy_(param.data) 71 | 72 | def qa_values(self, obs): 73 | obs = ptu.from_numpy(obs) 74 | qa_values = self.q_net(obs) 75 | return ptu.to_numpy(qa_values) 76 | -------------------------------------------------------------------------------- /hw5/cs285/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs import ant 2 | from cs285.envs import cheetah 3 | from cs285.envs import obstacles 4 | from cs285.envs import reacher -------------------------------------------------------------------------------- /hw5/cs285/envs/ant/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='ant-cs285-v0', 5 | entry_point='cs285.envs.ant:AntEnv', 6 | max_episode_steps=1000, 7 | ) 8 | from cs285.envs.ant.ant import AntEnv 9 | -------------------------------------------------------------------------------- /hw5/cs285/envs/box2d/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw5/cs285/envs/box2d/__init__.py -------------------------------------------------------------------------------- /hw5/cs285/envs/cheetah/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='cheetah-cs285-v0', 5 | entry_point='cs285.envs.cheetah:HalfCheetahEnv', 6 | max_episode_steps=1000, 7 | ) 8 | from cs285.envs.cheetah.cheetah import 
HalfCheetahEnv 9 | -------------------------------------------------------------------------------- /hw5/cs285/envs/cheetah/cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import mujoco_py 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | 8 | def __init__(self): 9 | 10 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 11 | utils.EzPickle.__init__(self) 12 | 13 | self.skip = self.frame_skip 14 | 15 | self.action_dim = self.ac_dim = self.action_space.shape[0] 16 | self.observation_dim = self.obs_dim = self.observation_space.shape[0] 17 | 18 | def get_reward(self, observations, actions): 19 | 20 | """get reward/s of given (observations, actions) datapoint or datapoints 21 | 22 | Args: 23 | observations: (batchsize, obs_dim) or (obs_dim,) 24 | actions: (batchsize, ac_dim) or (ac_dim,) 25 | 26 | Return: 27 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 28 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 29 | """ 30 | 31 | #initialize and reshape as needed, for batch mode 32 | self.reward_dict = {} 33 | if(len(observations.shape)==1): 34 | observations = np.expand_dims(observations, axis = 0) 35 | actions = np.expand_dims(actions, axis = 0) 36 | batch_mode = False 37 | else: 38 | batch_mode = True 39 | 40 | #get vars 41 | xvel = observations[:, 9].copy() 42 | body_angle = observations[:, 2].copy() 43 | front_leg = observations[:, 6].copy() 44 | front_shin = observations[:, 7].copy() 45 | front_foot = observations[:, 8].copy() 46 | zeros = np.zeros((observations.shape[0],)).copy() 47 | 48 | # ranges 49 | leg_range = 0.2 50 | shin_range = 0 51 | foot_range = 0 52 | penalty_factor = 10 53 | 54 | #calc rew 55 | self.reward_dict['run'] = xvel 56 | 57 | front_leg_rew = zeros.copy() 58 | front_leg_rew[front_leg>leg_range] = -penalty_factor 59 | self.reward_dict['leg'] = front_leg_rew 60 | 61 | front_shin_rew = zeros.copy() 62 | front_shin_rew[front_shin>shin_range] = -penalty_factor 63 | self.reward_dict['shin'] = front_shin_rew 64 | 65 | front_foot_rew = zeros.copy() 66 | front_foot_rew[front_foot>foot_range] = -penalty_factor 67 | self.reward_dict['foot'] = front_foot_rew 68 | 69 | # total reward 70 | self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot'] 71 | 72 | #return 73 | dones = zeros.copy() 74 | if(not batch_mode): 75 | return self.reward_dict['r_total'][0], dones[0] 76 | return self.reward_dict['r_total'], dones 77 | 78 | 79 | def get_score(self, obs): 80 | xposafter = obs[0] 81 | return xposafter 82 | 83 | ############################################## 84 | 85 | def step(self, action): 86 | 87 | #step 88 | self.do_simulation(action, self.frame_skip) 89 | 90 | #obs/reward/done/score 91 | ob = self._get_obs() 92 | rew, done = self.get_reward(ob, action) 93 | score = self.get_score(ob) 94 | 95 | #return 96 | env_info = {'obs_dict': self.obs_dict, 97 | 'rewards': self.reward_dict, 98 | 'score': score} 99 | return ob, rew, done, env_info 100 | 101 | def _get_obs(self): 102 | 103 | self.obs_dict = {} 104 | self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy() 105 | self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy() 106 | self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy() 107 | 108 | return np.concatenate([ 109 | self.obs_dict['joints_pos'], #9 110 | 
self.obs_dict['joints_vel'], #9 111 | self.obs_dict['com_torso'], #3 112 | ]) 113 | 114 | ############################################## 115 | 116 | def reset_model(self, seed=None): 117 | 118 | # set reset pose/vel 119 | self.reset_pose = self.init_qpos + self.np_random.uniform( 120 | low=-.1, high=.1, size=self.model.nq) 121 | self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 122 | 123 | #reset the env to that pose/vel 124 | return self.do_reset(self.reset_pose.copy(), self.reset_vel.copy()) 125 | 126 | 127 | def do_reset(self, reset_pose, reset_vel, reset_goal=None): 128 | 129 | #reset 130 | self.set_state(reset_pose, reset_vel) 131 | 132 | #return 133 | return self._get_obs() 134 | -------------------------------------------------------------------------------- /hw5/cs285/envs/obstacles/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='obstacles-cs285-v0', 5 | entry_point='cs285.envs.obstacles:Obstacles', 6 | max_episode_steps=500, 7 | ) 8 | from cs285.envs.obstacles.obstacles_env import Obstacles 9 | -------------------------------------------------------------------------------- /hw5/cs285/envs/reacher/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='reacher-cs285-v0', 5 | entry_point='cs285.envs.reacher:Reacher7DOFEnv', 6 | max_episode_steps=500, 7 | ) 8 | from cs285.envs.reacher.reacher_env import Reacher7DOFEnv 9 | -------------------------------------------------------------------------------- /hw5/cs285/envs/reacher/reacher_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | from mujoco_py import MjViewer 5 | import os 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | 10 | # placeholder 11 | self.hand_sid = -2 12 | self.target_sid = -1 13 | 14 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 15 | mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2) 16 | utils.EzPickle.__init__(self) 17 | self.observation_dim = 26 18 | self.action_dim = 7 19 | 20 | self.hand_sid = self.model.site_name2id("finger") 21 | self.target_sid = self.model.site_name2id("target") 22 | self.skip = self.frame_skip 23 | 24 | 25 | def _get_obs(self): 26 | return np.concatenate([ 27 | self.data.qpos.flat, #[7] 28 | self.data.qvel.flatten() / 10., #[7] 29 | self.data.site_xpos[self.hand_sid], #[3] 30 | self.model.site_pos[self.target_sid], #[3] 31 | ]) 32 | 33 | def step(self, a): 34 | 35 | self.do_simulation(a, self.frame_skip) 36 | ob = self._get_obs() 37 | reward, done = self.get_reward(ob, a) 38 | 39 | score = self.get_score(ob) 40 | 41 | # finalize step 42 | env_info = {'ob': ob, 43 | 'rewards': self.reward_dict, 44 | 'score': score} 45 | 46 | return ob, reward, done, env_info 47 | 48 | def get_score(self, obs): 49 | hand_pos = obs[-6:-3] 50 | target_pos = obs[-3:] 51 | score = -1*np.abs(hand_pos-target_pos) 52 | return score 53 | 54 | def get_reward(self, observations, actions): 55 | 56 | """get reward/s of given (observations, actions) datapoint or datapoints 57 | 58 | Args: 59 | observations: (batchsize, obs_dim) or (obs_dim,) 60 | actions: (batchsize, ac_dim) or (ac_dim,) 61 | 62 | Return: 63 | r_total: reward of this (o,a) pair, dimension is 
(batchsize,1) or (1,) 64 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 65 | """ 66 | 67 | #initialize and reshape as needed, for batch mode 68 | self.reward_dict = {} 69 | if(len(observations.shape)==1): 70 | observations = np.expand_dims(observations, axis = 0) 71 | actions = np.expand_dims(actions, axis = 0) 72 | batch_mode = False 73 | else: 74 | batch_mode = True 75 | 76 | #get vars 77 | hand_pos = observations[:, -6:-3] 78 | target_pos = observations[:, -3:] 79 | 80 | #calc rew 81 | dist = np.linalg.norm(hand_pos - target_pos, axis=1) 82 | self.reward_dict['r_total'] = -10*dist 83 | 84 | #done is always false for this env 85 | dones = np.zeros((observations.shape[0],)) 86 | 87 | #return 88 | if(not batch_mode): 89 | return self.reward_dict['r_total'][0], dones[0] 90 | return self.reward_dict['r_total'], dones 91 | 92 | def reset(self): 93 | _ = self.reset_model() 94 | 95 | self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1] 96 | 97 | observation, _reward, done, _info = self.step(np.zeros(7)) 98 | ob = self._get_obs() 99 | 100 | return ob 101 | 102 | def reset_model(self, seed=None): 103 | if seed is not None: 104 | self.seed(seed) 105 | 106 | self.reset_pose = self.init_qpos.copy() 107 | self.reset_vel = self.init_qvel.copy() 108 | 109 | self.reset_goal = np.zeros(3) 110 | self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3) 111 | self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2) 112 | self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25) 113 | 114 | return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal) 115 | 116 | def do_reset(self, reset_pose, reset_vel, reset_goal): 117 | 118 | self.set_state(reset_pose, reset_vel) 119 | 120 | #reset target 121 | self.reset_goal = reset_goal.copy() 122 | self.model.site_pos[self.target_sid] = self.reset_goal 123 | self.sim.forward() 124 | 125 | #return 126 | return self._get_obs() -------------------------------------------------------------------------------- /hw5/cs285/exploration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw5/cs285/exploration/__init__.py -------------------------------------------------------------------------------- /hw5/cs285/exploration/base_exploration_model.py: -------------------------------------------------------------------------------- 1 | class BaseExplorationModel(object): 2 | def update(self, ob_no, ac_na, next_ob_no, re_n, terminal_n): 3 | raise NotImplementedError -------------------------------------------------------------------------------- /hw5/cs285/exploration/rnd_model.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure import pytorch_util as ptu 2 | from .base_exploration_model import BaseExplorationModel 3 | import torch.optim as optim 4 | from torch import nn 5 | import torch 6 | 7 | def init_method_1(model): 8 | model.weight.data.uniform_() 9 | model.bias.data.uniform_() 10 | 11 | def init_method_2(model): 12 | model.weight.data.normal_() 13 | model.bias.data.normal_() 14 | 15 | 16 | class RNDModel(nn.Module, BaseExplorationModel): 17 | def __init__(self, hparams, optimizer_spec, **kwargs): 18 | super().__init__(**kwargs) 19 | self.ob_dim = hparams['ob_dim'] 20 | self.output_size = hparams['rnd_output_size'] 21 | self.n_layers = hparams['rnd_n_layers'] 22 | self.size = hparams['rnd_size'] 
23 | self.optimizer_spec = optimizer_spec 24 | 25 | # TODO: Create two neural networks: 26 | # 1) f, the random function we are trying to learn 27 | # 2) f_hat, the function we are using to learn f 28 | # WARNING: Make sure you use different types of weight 29 | # initializations for these two functions 30 | 31 | # HINT 1) Check out the method ptu.build_mlp 32 | # HINT 2) There are two weight init methods defined above 33 | 34 | self.f = None 35 | self.f_hat = None 36 | 37 | self.optimizer = self.optimizer_spec.constructor( 38 | self.f_hat.parameters(), 39 | **self.optimizer_spec.optim_kwargs 40 | ) 41 | self.learning_rate_scheduler = optim.lr_scheduler.LambdaLR( 42 | self.optimizer, 43 | self.optimizer_spec.learning_rate_schedule, 44 | ) 45 | 46 | self.f.to(ptu.device) 47 | self.f_hat.to(ptu.device) 48 | 49 | def forward(self, ob_no): 50 | # TODO: Get the prediction error for ob_no 51 | # HINT: Remember to detach the output of self.f! 52 | error = None 53 | return error 54 | 55 | def forward_np(self, ob_no): 56 | ob_no = ptu.from_numpy(ob_no) 57 | error = self(ob_no) 58 | return ptu.to_numpy(error) 59 | 60 | def update(self, ob_no): 61 | # TODO: Update f_hat using ob_no 62 | # Hint: Take the mean prediction error across the batch 63 | loss = None 64 | return loss.item() 65 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/colab_utils.py: -------------------------------------------------------------------------------- 1 | from gym.wrappers import Monitor 2 | import glob 3 | import io 4 | import base64 5 | from IPython.display import HTML 6 | from IPython import display as ipythondisplay 7 | 8 | ## modified from https://colab.research.google.com/drive/1flu31ulJlgiRL1dnN2ir8wGh9p7Zij2t#scrollTo=TCelFzWY9MBI 9 | 10 | def show_video(): 11 | mp4list = glob.glob('/content/video/*.mp4') 12 | if len(mp4list) > 0: 13 | mp4 = mp4list[0] 14 | video = io.open(mp4, 'r+b').read() 15 | encoded = base64.b64encode(video) 16 | ipythondisplay.display(HTML(data='''<video alt="test" autoplay 17 | loop controls style="height: 400px;"> 18 | <source src="data:video/mp4;base64,{0}" type="video/mp4" /> 19 | </video>'''.format(encoded.decode('ascii')))) 20 | else: 21 | print("Could not find video") 22 | 23 | 24 | def wrap_env(env): 25 | env = Monitor(env, '/content/video', force=True) 26 | return env 27 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!"
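# A possible sketch for the TODOs in exploration/rnd_model.py above (not the official solution).
# RND trains a predictor f_hat to match a fixed, randomly initialized target f, and the prediction
# error serves as the exploration bonus. The sketch follows the hints (ptu.build_mlp plus the two
# init methods defined in that file); converting ob_no with ptu.from_numpy inside update is an
# assumption about how the agent calls it, not something the starter code confirms.
#
#     self.f = ptu.build_mlp(self.ob_dim, self.output_size, self.n_layers, self.size,
#                            init_method=init_method_1)          # fixed random target network
#     self.f_hat = ptu.build_mlp(self.ob_dim, self.output_size, self.n_layers, self.size,
#                                init_method=init_method_2)      # learned predictor network
#
#     def forward(self, ob_no):
#         targets = self.f(ob_no).detach()                       # no gradient through the target
#         predictions = self.f_hat(ob_no)
#         error = torch.norm(predictions - targets, dim=1)       # per-state prediction error
#         return error
#
#     def update(self, ob_no):
#         ob_no = ptu.from_numpy(ob_no)
#         loss = self(ob_no).mean()                              # mean prediction error across the batch
#         self.optimizer.zero_grad()
#         loss.backward()
#         self.optimizer.step()
#         return loss.item()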
27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0]<max_length: 44 | padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1)) 45 | videos[i] = np.concatenate([videos[i], padding], 0) 46 | 47 | # log videos to tensorboard event file 48 | videos = np.stack(videos[:max_videos_to_save], 0) 49 | self.log_video(videos, video_title, step, fps=fps) 50 | 51 | def log_figures(self, figure, name, step, phase): 52 | """figure: matplotlib.pyplot figure handle""" 53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!" 54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/pytorch_util.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import torch 4 | from torch import nn 5 | 6 | Activation = Union[str, nn.Module] 7 | 8 | 9 | _str_to_activation = { 10 | 'relu': nn.ReLU(), 11 | 'tanh': nn.Tanh(), 12 | 'leaky_relu': nn.LeakyReLU(), 13 | 'sigmoid': nn.Sigmoid(), 14 | 'selu': nn.SELU(), 15 | 'softplus': nn.Softplus(), 16 | 'identity': nn.Identity(), 17 | } 18 | 19 | 20 | def build_mlp( 21 | input_size: int, 22 | output_size: int, 23 | n_layers: int, 24 | size: int, 25 | activation: Activation = 'tanh', 26 | output_activation: Activation = 'identity', 27 | init_method=None, 28 | ): 29 | """ 30 | Builds a feedforward neural network 31 | arguments: 32 | input_placeholder: placeholder variable for the state (batch_size, input_size) 33 | scope: variable scope of the network 34 | n_layers: number of hidden layers 35 | size: dimension of each hidden layer 36 | activation: activation of each hidden layer 37 | input_size: size of the input layer 38 | output_size: size of the output layer 39 | output_activation: activation of the output layer 40 | returns: 41 | output_placeholder: the result of a forward pass through the hidden layers + the output layer 42 | """ 43 | if isinstance(activation, str): 44 | activation = _str_to_activation[activation] 45 | if isinstance(output_activation, str): 46 | output_activation = _str_to_activation[output_activation] 47 | layers = [] 48 | in_size = input_size 49 | for _ in range(n_layers): 50 | curr_layer = nn.Linear(in_size, size) 51 | if init_method is not None: 52 | curr_layer.apply(init_method) 53 | layers.append(curr_layer) 54 | layers.append(activation) 55 | in_size = size 56 | 57 | last_layer = nn.Linear(in_size, output_size) 58 | if init_method is not None: 59 | last_layer.apply(init_method) 60 | 61 | layers.append(last_layer) 62 | layers.append(output_activation) 63 | 64 | return nn.Sequential(*layers)
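# The docstring of build_mlp above does not mention the init_method argument: when it is given,
# it is applied to every nn.Linear layer via .apply(), which is what lets rnd_model.py build two
# networks with different weight initializations. A small illustration with hypothetical sizes
# (the init functions come from cs285/exploration/rnd_model.py; none of this is in the original file):
#
#     from cs285.exploration.rnd_model import init_method_1, init_method_2
#     f = build_mlp(input_size=2, output_size=5, n_layers=2, size=400, init_method=init_method_1)
#     f_hat = build_mlp(input_size=2, output_size=5, n_layers=2, size=400, init_method=init_method_2)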
65 | 66 | 67 | device = None 68 | 69 | 70 | def init_gpu(use_gpu=True, gpu_id=0): 71 | global device 72 | if torch.cuda.is_available() and use_gpu: 73 | device = torch.device("cuda:" + str(gpu_id)) 74 | print("Using GPU id {}".format(gpu_id)) 75 | else: 76 | device = torch.device("cpu") 77 | print("GPU not detected. Defaulting to CPU.") 78 | 79 | 80 | def set_device(gpu_id): 81 | torch.cuda.set_device(gpu_id) 82 | 83 | 84 | def from_numpy(*args, **kwargs): 85 | return torch.from_numpy(*args, **kwargs).float().to(device) 86 | 87 | 88 | def to_numpy(tensor): 89 | return tensor.to('cpu').detach().numpy() 90 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import * 2 | 3 | 4 | class ReplayBuffer(object): 5 | 6 | def __init__(self, max_size=1000000): 7 | 8 | self.max_size = max_size 9 | self.paths = [] 10 | self.obs = None 11 | self.acs = None 12 | self.concatenated_rews = None 13 | self.unconcatenated_rews = None 14 | self.next_obs = None 15 | self.terminals = None 16 | 17 | def add_rollouts(self, paths, noised=False): 18 | 19 | # add new rollouts into our list of rollouts 20 | for path in paths: 21 | tpath = dict() 22 | # print (path.keys()) 23 | tpath['observation'] = path['observations'] 24 | tpath['next_observation'] = path['next_observations'] 25 | tpath['reward'] = path['rewards'] 26 | tpath['action'] = path['actions'] 27 | tpath['terminal'] = path['terminals'] 28 | self.paths.append(tpath) 29 | 30 | # convert new rollouts into their component arrays, and append them onto our arrays 31 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(self.paths) 32 | 33 | if noised: 34 | observations = add_noise(observations) 35 | next_observations = add_noise(next_observations) 36 | 37 | if self.obs is None: 38 | self.obs = observations[-self.max_size:] 39 | self.acs = actions[-self.max_size:] 40 | self.next_obs = next_observations[-self.max_size:] 41 | self.terminals = terminals[-self.max_size:] 42 | self.concatenated_rews = concatenated_rews[-self.max_size:] 43 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 44 | else: 45 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 46 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 47 | self.next_obs = np.concatenate( 48 | [self.next_obs, next_observations] 49 | )[-self.max_size:] 50 | self.terminals = np.concatenate( 51 | [self.terminals, terminals] 52 | )[-self.max_size:] 53 | self.concatenated_rews = np.concatenate( 54 | [self.concatenated_rews, concatenated_rews] 55 | )[-self.max_size:] 56 | if isinstance(unconcatenated_rews, list): 57 | self.unconcatenated_rews += unconcatenated_rews # TODO keep only latest max_size around 58 | else: 59 | self.unconcatenated_rews.append(unconcatenated_rews) # TODO keep only latest max_size around 60 | 61 | print (self.terminals.sum()) 62 | ######################################## 63 | ######################################## 64 | 65 | def sample_random_rollouts(self, num_rollouts): 66 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 67 | return self.paths[rand_indices] 68 | 69 | def sample_recent_rollouts(self, num_rollouts=1): 70 | return self.paths[-num_rollouts:] 71 | 72 | def can_sample(self, batch_size): 73 | # print (self.obs.shape[0]) 74 | if self.obs.shape[0] > batch_size: 75 | 
return True 76 | else: 77 | return False 78 | 79 | ######################################## 80 | ######################################## 81 | 82 | def sample_random_data(self, batch_size): 83 | 84 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 85 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 86 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 87 | 88 | def sample(self, batch_size): 89 | return self.sample_random_data(batch_size) 90 | 91 | def sample_recent_data(self, batch_size=1, concat_rew=True): 92 | 93 | if concat_rew: 94 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 95 | else: 96 | num_recent_rollouts_to_return = 0 97 | num_datapoints_so_far = 0 98 | index = -1 99 | while num_datapoints_so_far < batch_size: 100 | recent_rollout = self.paths[index] 101 | index -=1 102 | num_recent_rollouts_to_return +=1 103 | num_datapoints_so_far += get_pathlength(recent_rollout) 104 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 105 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 106 | return observations, actions, unconcatenated_rews, next_observations, terminals 107 | -------------------------------------------------------------------------------- /hw5/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import itertools 3 | from torch import nn 4 | from torch.nn import functional as F 5 | from torch import optim 6 | 7 | import numpy as np 8 | import torch 9 | from torch import distributions 10 | 11 | from cs285.infrastructure import pytorch_util as ptu 12 | from cs285.policies.base_policy import BasePolicy 13 | 14 | 15 | class MLPPolicy(BasePolicy, nn.Module, metaclass=abc.ABCMeta): 16 | 17 | def __init__(self, 18 | ac_dim, 19 | ob_dim, 20 | n_layers, 21 | size, 22 | discrete=False, 23 | learning_rate=1e-4, 24 | training=True, 25 | nn_baseline=False, 26 | **kwargs 27 | ): 28 | super().__init__(**kwargs) 29 | 30 | # init vars 31 | self.ac_dim = ac_dim 32 | self.ob_dim = ob_dim 33 | self.n_layers = n_layers 34 | self.discrete = discrete 35 | self.size = size 36 | self.learning_rate = learning_rate 37 | self.training = training 38 | self.nn_baseline = nn_baseline 39 | 40 | if self.discrete: 41 | self.logits_na = ptu.build_mlp(input_size=self.ob_dim, 42 | output_size=self.ac_dim, 43 | n_layers=self.n_layers, 44 | size=self.size) 45 | self.logits_na.to(ptu.device) 46 | self.mean_net = None 47 | self.logstd = None 48 | self.optimizer = optim.Adam(self.logits_na.parameters(), 49 | self.learning_rate) 50 | else: 51 | self.logits_na = None 52 | self.mean_net = ptu.build_mlp(input_size=self.ob_dim, 53 | output_size=self.ac_dim, 54 | n_layers=self.n_layers, size=self.size) 55 | self.logstd = nn.Parameter( 56 | torch.zeros(self.ac_dim, dtype=torch.float32, device=ptu.device) 57 | ) 58 | self.mean_net.to(ptu.device) 59 | self.logstd.to(ptu.device) 60 | self.optimizer = optim.Adam( 61 | itertools.chain([self.logstd], self.mean_net.parameters()), 62 | self.learning_rate 63 | ) 64 | 65 | if nn_baseline: 66 | self.baseline = ptu.build_mlp( 67 | input_size=self.ob_dim, 68 | output_size=1, 69 | 
n_layers=self.n_layers, 70 | size=self.size, 71 | ) 72 | self.baseline.to(ptu.device) 73 | self.baseline_optimizer = optim.Adam( 74 | self.baseline.parameters(), 75 | self.learning_rate, 76 | ) 77 | else: 78 | self.baseline = None 79 | 80 | ################################## 81 | 82 | def save(self, filepath): 83 | torch.save(self.state_dict(), filepath) 84 | 85 | ################################## 86 | 87 | # query the policy with observation(s) to get selected action(s) 88 | def get_action(self, obs: np.ndarray) -> np.ndarray: 89 | raise NotImplementedError 90 | # TODO: get this from hw1 91 | 92 | #################################### 93 | #################################### 94 | 95 | # update/train this policy 96 | def update(self, observations, actions, **kwargs): 97 | raise NotImplementedError 98 | 99 | # This function defines the forward pass of the network. 100 | # You can return anything you want, but you should be able to differentiate 101 | # through it. For example, you can return a torch.FloatTensor. You can also 102 | # return more flexible objects, such as a 103 | # `torch.distributions.Distribution` object. It's up to you! 104 | def forward(self, observation: torch.FloatTensor): 105 | raise NotImplementedError 106 | # TODO: get this from hw1 107 | 108 | #################################### 109 | #################################### 110 | 111 | 112 | ##################################################### 113 | ##################################################### 114 | 115 | 116 | class MLPPolicyAC(MLPPolicy): 117 | # MJ: cut acs_labels_na and qvals from the signature if they are not used 118 | def update( 119 | self, observations, actions, 120 | adv_n=None, acs_labels_na=None, qvals=None 121 | ): 122 | raise NotImplementedError 123 | # Not needed for this homework 124 | 125 | #################################### 126 | #################################### 127 | -------------------------------------------------------------------------------- /hw5/cs285/policies/argmax_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pdb 3 | 4 | 5 | class ArgMaxPolicy(object): 6 | 7 | def __init__(self, critic): 8 | self.critic = critic 9 | 10 | def set_critic(self, critic): 11 | self.critic = critic 12 | 13 | def get_action(self, obs): 14 | # MJ: changed the dimension check to a 3 15 | if len(obs.shape) > 3: 16 | observation = obs 17 | else: 18 | observation = obs[None] 19 | 20 | raise NotImplementedError 21 | # TODO: get this from hw3 22 | 23 | #################################### 24 | #################################### -------------------------------------------------------------------------------- /hw5/cs285/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import numpy as np 3 | 4 | 5 | class BasePolicy(object, metaclass=abc.ABCMeta): 6 | def get_action(self, obs: np.ndarray) -> np.ndarray: 7 | raise NotImplementedError 8 | 9 | def update(self, obs: np.ndarray, acs: np.ndarray, **kwargs) -> dict: 10 | """Return a dictionary of logging information.""" 11 | raise NotImplementedError 12 | 13 | def save(self, filepath: str): 14 | raise NotImplementedError 15 | -------------------------------------------------------------------------------- /hw5/cs285/scripts/read_results.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import tensorflow as tf 3 | 4 | def get_section_results(file): 5 | """ 
6 | requires tensorflow==1.12.0 7 | """ 8 | X = [] 9 | Y = [] 10 | for e in tf.train.summary_iterator(file): 11 | for v in e.summary.value: 12 | if v.tag == 'Train_EnvstepsSoFar': 13 | X.append(v.simple_value) 14 | elif v.tag == 'Eval_AverageReturn': 15 | Y.append(v.simple_value) 16 | return X, Y 17 | 18 | if __name__ == '__main__': 19 | import glob 20 | 21 | logdir = 'data/q1_lb_rtg_na_CartPole-v0_13-09-2020_23-32-10/events*' 22 | eventfile = glob.glob(logdir)[0] 23 | 24 | X, Y = get_section_results(eventfile) 25 | for i, (x, y) in enumerate(zip(X, Y)): 26 | print('Iteration {:d} | Train steps: {:d} | Return: {}'.format(i, int(x), y)) -------------------------------------------------------------------------------- /hw5/cs285/scripts/run_hw5_expl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | 4 | from cs285.infrastructure.rl_trainer import RL_Trainer 5 | from cs285.agents.explore_or_exploit_agent import ExplorationOrExploitationAgent 6 | from cs285.infrastructure.dqn_utils import get_env_kwargs, PiecewiseSchedule, ConstantSchedule 7 | 8 | 9 | class Q_Trainer(object): 10 | 11 | def __init__(self, params): 12 | self.params = params 13 | 14 | train_args = { 15 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 16 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 17 | 'train_batch_size': params['batch_size'], 18 | 'double_q': params['double_q'], 19 | } 20 | 21 | env_args = get_env_kwargs(params['env_name']) 22 | 23 | self.agent_params = {**train_args, **env_args, **params} 24 | 25 | self.params['agent_class'] = ExplorationOrExploitationAgent 26 | self.params['agent_params'] = self.agent_params 27 | self.params['train_batch_size'] = params['batch_size'] 28 | self.params['env_wrappers'] = self.agent_params['env_wrappers'] 29 | 30 | self.rl_trainer = RL_Trainer(self.params) 31 | 32 | def run_training_loop(self): 33 | self.rl_trainer.run_training_loop( 34 | self.agent_params['num_timesteps'], 35 | collect_policy = self.rl_trainer.agent.actor, 36 | eval_policy = self.rl_trainer.agent.actor, 37 | ) 38 | 39 | def main(): 40 | 41 | import argparse 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | '--env_name', 45 | default='PointmassHard-v0', 46 | choices=('PointmassEasy-v0', 'PointmassMedium-v0', 'PointmassHard-v0', 'PointmassVeryHard-v0') 47 | ) 48 | 49 | parser.add_argument('--exp_name', type=str, default='todo') 50 | 51 | parser.add_argument('--eval_batch_size', type=int, default=1000) 52 | parser.add_argument('--batch_size', type=int, default=256) 53 | 54 | parser.add_argument('--use_rnd', action='store_true') 55 | parser.add_argument('--num_exploration_steps', type=int, default=10000) 56 | parser.add_argument('--unsupervised_exploration', action='store_true') 57 | 58 | parser.add_argument('--offline_exploitation', action='store_true') 59 | parser.add_argument('--cql_alpha', type=float, default=0.0) 60 | 61 | parser.add_argument('--exploit_rew_shift', type=float, default=0.0) 62 | parser.add_argument('--exploit_rew_scale', type=float, default=1.0) 63 | 64 | parser.add_argument('--rnd_output_size', type=int, default=5) 65 | parser.add_argument('--rnd_n_layers', type=int, default=2) 66 | parser.add_argument('--rnd_size', type=int, default=400) 67 | 68 | parser.add_argument('--seed', type=int, default=2) 69 | parser.add_argument('--no_gpu', '-ngpu', action='store_true') 70 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 71 | 
parser.add_argument('--scalar_log_freq', type=int, default=int(1e3)) 72 | parser.add_argument('--save_params', action='store_true') 73 | 74 | args = parser.parse_args() 75 | 76 | # convert to dictionary 77 | params = vars(args) 78 | params['double_q'] = True 79 | params['num_agent_train_steps_per_iter'] = 1 80 | params['num_critic_updates_per_agent_update'] = 1 81 | params['exploit_weight_schedule'] = ConstantSchedule(1.0) 82 | params['video_log_freq'] = -1 # This param is not used for DQN 83 | params['num_timesteps'] = 50000 84 | params['learning_starts'] = 2000 85 | params['eps'] = 0.2 86 | ################################## 87 | ### CREATE DIRECTORY FOR LOGGING 88 | ################################## 89 | 90 | if params['env_name']=='PointmassEasy-v0': 91 | params['ep_len']=50 92 | if params['env_name']=='PointmassMedium-v0': 93 | params['ep_len']=150 94 | if params['env_name']=='PointmassHard-v0': 95 | params['ep_len']=100 96 | if params['env_name']=='PointmassVeryHard-v0': 97 | params['ep_len']=200 98 | 99 | if params['use_rnd']: 100 | params['explore_weight_schedule'] = PiecewiseSchedule([(0,1), (params['num_exploration_steps'], 0)], outside_value=0.0) 101 | else: 102 | params['explore_weight_schedule'] = ConstantSchedule(0.0) 103 | 104 | if params['unsupervised_exploration']: 105 | params['explore_weight_schedule'] = ConstantSchedule(1.0) 106 | params['exploit_weight_schedule'] = ConstantSchedule(0.0) 107 | 108 | if not params['use_rnd']: 109 | params['learning_starts'] = params['num_exploration_steps'] 110 | 111 | 112 | logdir_prefix = 'hw5_expl_' # keep for autograder 113 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../data') 114 | 115 | if not (os.path.exists(data_path)): 116 | os.makedirs(data_path) 117 | 118 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 119 | logdir = os.path.join(data_path, logdir) 120 | params['logdir'] = logdir 121 | if not(os.path.exists(logdir)): 122 | os.makedirs(logdir) 123 | 124 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 125 | 126 | trainer = Q_Trainer(params) 127 | trainer.run_training_loop() 128 | 129 | 130 | if __name__ == "__main__": 131 | main() 132 | -------------------------------------------------------------------------------- /hw5/hw5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/berkeleydeeprlcourse/homework_fall2020/4808ac3054faf3dec0cd345219f6a9dea19d4202/hw5/hw5.pdf -------------------------------------------------------------------------------- /hw5/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.17.2 2 | mujoco-py==2.0.2.2 3 | tensorboard==2.3.0 4 | tensorboardX==1.8 5 | matplotlib==2.2.2 6 | ipython==6.4.0 7 | moviepy==1.0.0 8 | pyvirtualdisplay==1.3.2 9 | torch==1.5.1 10 | opencv-python==4.4.0.42 11 | networkx==2.5 12 | ipdb==0.13.3 13 | box2d-py 14 | -------------------------------------------------------------------------------- /hw5/requirements_colab.txt: -------------------------------------------------------------------------------- 1 | gym==0.17.2 2 | tensorboard==2.3.0 3 | tensorboardX==1.8 4 | matplotlib==2.2.2 5 | ipython==6.4.0 6 | moviepy==1.0.0 7 | pyvirtualdisplay==1.3.2 8 | torch==1.5.1 9 | opencv-python==4.4.0.42 10 | networkx==2.5 11 | ipdb==0.13.3 12 | box2d-py 13 | -------------------------------------------------------------------------------- /hw5/setup.py: 
-------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) --------------------------------------------------------------------------------
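With the setup.py above, a typical workflow (assuming the commands are run from the hw5/ directory; the exp_name value is only an illustrative placeholder, while the script path and flags come from run_hw5_expl.py earlier in this listing) is:

    pip install -r requirements.txt
    pip install -e .
    python cs285/scripts/run_hw5_expl.py --env_name PointmassEasy-v0 --use_rnd --unsupervised_exploration --exp_name test_rnd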