├── .gitattributes ├── README.md ├── hw1 ├── README.txt ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── base_agent.cpython-37.pyc │ │ │ └── bc_agent.cpython-37.pyc │ │ └── bc_agent.py │ ├── expert_data │ │ ├── expert_data_Ant-v2.pkl │ │ ├── expert_data_HalfCheetah-v2.pkl │ │ ├── expert_data_Hopper-v2.pkl │ │ ├── expert_data_Humanoid-v2.pkl │ │ └── expert_data_Walker2d-v2.pkl │ ├── infrastructure │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── logger.cpython-37.pyc │ │ │ ├── replay_buffer.cpython-37.pyc │ │ │ ├── rl_trainer.cpython-37.pyc │ │ │ ├── tf_utils.cpython-37.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── logger.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── MLP_policy.cpython-37.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── base_policy.cpython-37.pyc │ │ │ └── loaded_gaussian_policy.cpython-37.pyc │ │ ├── experts │ │ │ ├── Ant.pkl │ │ │ ├── HalfCheetah.pkl │ │ │ ├── Hopper.pkl │ │ │ ├── Humanoid.pkl │ │ │ └── Walker2d.pkl │ │ └── loaded_gaussian_policy.py │ └── scripts │ │ └── run_hw1_behavior_cloning.py ├── cs285_hw1.pdf ├── downloads │ └── mjpro150 │ │ ├── bin │ │ ├── basic │ │ ├── compile │ │ ├── derivative │ │ ├── libglew.so │ │ ├── libglewegl.so │ │ ├── libglewosmesa.so │ │ ├── libglfw.so.3 │ │ ├── libmujoco150.so │ │ ├── libmujoco150nogl.so │ │ ├── record │ │ ├── simulate │ │ └── test │ │ ├── doc │ │ ├── README.txt │ │ └── REFERENCE.txt │ │ ├── include │ │ ├── glfw3.h │ │ ├── mjdata.h │ │ ├── mjmodel.h │ │ ├── mjrender.h │ │ ├── mjvisualize.h │ │ ├── mjxmacro.h │ │ └── mujoco.h │ │ ├── model │ │ ├── humanoid.xml │ │ └── humanoid100.xml │ │ └── sample │ │ ├── basic.cpp │ │ ├── compile.cpp │ │ ├── derivative.cpp │ │ ├── makefile │ │ ├── record.cpp │ │ ├── simulate.cpp │ │ └── test.cpp ├── requirements.txt └── setup.py ├── hw2 ├── README.txt ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── pg_agent.cpython-37.pyc │ │ └── pg_agent.py │ ├── infrastructure │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── logger.cpython-37.pyc │ │ │ ├── replay_buffer.cpython-37.pyc │ │ │ ├── rl_trainer.cpython-37.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── logger.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ │ ├── MLP_policy.cpython-37.pyc │ │ │ └── __init__.cpython-37.pyc │ └── scripts │ │ └── run_hw2_policy_gradient.py ├── cs285_hw2.pdf ├── requirements.txt └── setup.py ├── hw3 ├── README.txt ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── ac_agent.cpython-37.pyc │ │ │ └── dqn_agent.cpython-37.pyc │ │ ├── ac_agent.py │ │ └── dqn_agent.py │ ├── critics │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── bootstrapped_continuous_critic.cpython-37.pyc │ │ │ └── dqn_critic.cpython-37.pyc │ │ ├── bootstrapped_continuous_critic.py │ │ └── dqn_critic.py │ ├── infrastructure │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── atari_wrappers.cpython-37.pyc │ │ │ ├── dqn_utils.cpython-37.pyc │ │ │ ├── logger.cpython-37.pyc │ │ │ ├── models.cpython-37.pyc │ │ │ ├── replay_buffer.cpython-37.pyc │ │ │ ├── rl_trainer.cpython-37.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── atari_wrappers.py │ │ ├── 
dqn_utils.py │ │ ├── logger.py │ │ ├── models.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── MLP_policy.cpython-37.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── argmax_policy.cpython-37.pyc │ │ └── argmax_policy.py │ └── scripts │ │ ├── run_hw3_actor_critic.py │ │ └── run_hw3_dqn.py ├── cs285_hw3.pdf ├── lunar_lander.py ├── requirements.txt └── setup.py ├── hw4 ├── README.txt ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── mb_agent.cpython-37.pyc │ │ └── mb_agent.py │ ├── envs │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ └── __init__.cpython-37.pyc │ │ ├── ant │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── ant.cpython-35.pyc │ │ │ │ └── ant.cpython-37.pyc │ │ │ └── ant.py │ │ ├── cheetah │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── cheetah.cpython-35.pyc │ │ │ │ └── cheetah.cpython-37.pyc │ │ │ └── cheetah.py │ │ ├── obstacles │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── obstacles_env.cpython-35.pyc │ │ │ │ └── obstacles_env.cpython-37.pyc │ │ │ └── obstacles_env.py │ │ └── reacher │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── reacher_env.cpython-35.pyc │ │ │ └── reacher_env.cpython-37.pyc │ │ │ ├── assets │ │ │ └── sawyer.xml │ │ │ └── reacher_env.py │ ├── infrastructure │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── logger.cpython-37.pyc │ │ │ ├── replay_buffer.cpython-37.pyc │ │ │ ├── rl_trainer.cpython-37.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── logger.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── models │ │ ├── __pycache__ │ │ │ └── ff_model.cpython-37.pyc │ │ └── ff_model.py │ ├── policies │ │ ├── MPC_policy.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ │ ├── MPC_policy.cpython-37.pyc │ │ │ └── __init__.cpython-37.pyc │ └── scripts │ │ └── run_hw4_mb.py ├── cs285_hw4.pdf └── setup.py └── hw5 ├── README.txt ├── cs285 ├── agents │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── ac_agent.cpython-37.pyc │ └── ac_agent.py ├── critics │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── bootstrapped_continuous_critic.cpython-37.pyc │ └── bootstrapped_continuous_critic.py ├── envs │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── pointmass.cpython-37.pyc │ │ └── sparse_half_cheetah.cpython-37.pyc │ ├── pointmass.py │ └── sparse_half_cheetah.py ├── exploration │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── density_model.cpython-37.pyc │ │ └── exploration.cpython-37.pyc │ ├── density_model.py │ └── exploration.py ├── infrastructure │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── logger.cpython-37.pyc │ │ ├── replay.cpython-37.pyc │ │ ├── replay_buffer.cpython-37.pyc │ │ ├── rl_trainer.cpython-37.pyc │ │ └── utils.cpython-37.pyc │ ├── logger.py │ ├── replay_buffer.py │ ├── rl_trainer.py │ └── utils.py ├── policies │ ├── MLP_policy.py │ ├── __init__.py │ └── __pycache__ │ │ ├── MLP_policy.cpython-37.pyc │ │ └── __init__.cpython-37.pyc └── scripts │ └── train_ac_exploration_f18.py ├── cs285_hw5.pdf ├── 
requirements.txt └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | hw1/downloads/* linguist-detectable=false 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UC Berkeley Deep RL in Pytorch 2 | 3 | Pytorch starter code for [UC Berkeley's CS285 Deep RL course](http://rail.eecs.berkeley.edu/deeprlcourse/). The code is meant to be used as a direct alternative to the [official HW repository](https://github.com/berkeleydeeprlcourse/homework_fall2019) for those who would rather complete the course assignments in pytorch. [Solutions to this starter code are available](https://github.com/mdeib/berkeley-deep-RL-pytorch-solutions). 4 | 5 | # Changes 6 | 7 | All tensorflow in the starter code was converted to pytorch and/or numpy, and all solutions are to be written in pytorch. The overall structure of the HW starter code was kept mostly the same, although with the move to pytorch it made sense to delete some files and move their contents elsewhere. The README.txt in each HW folder has been modified where necessary but the pdf has not - refer to the README.txt for any changes made in the pytorch version. Although tensorflow is not needed within the main code, the logging is still done with tensorboard, so tensorflow is still needed to easily use and view tensorboard in your browser. 8 | 9 | Please note that while this starter code has been shown to produce reasonable results when filled in correctly, there may still be small bugs/errors. 10 | -------------------------------------------------------------------------------- /hw1/README.txt: -------------------------------------------------------------------------------- 1 | 2 | 1) install package by running: 3 | 4 | $ python setup.py develop 5 | 6 | ############################################## 7 | ############################################## 8 | 9 | 2) install mujoco: 10 | $ cd ~ 11 | $ mkdir .mujoco 12 | $ cd 13 | $ cp mjkey.txt ~/.mujoco/ 14 | $ cd /downloads 15 | $ cp -r mjpro150 ~/.mujoco/ 16 | 17 | add the following to the bottom of your bashrc: 18 | export LD_LIBRARY_PATH=~/.mujoco/mjpro150/bin/ 19 | 20 | NOTE IF YOU'RE USING A MAC: 21 | The provided mjpro150 folder is for Linux. 22 | Please download the OSX version yourself, from https://www.roboti.us/index.html 23 | 24 | ############################################## 25 | ############################################## 26 | 27 | 3) install other dependencies 28 | 29 | ------------------- 30 | 31 | a) [PREFERRED] Option A: 32 | 33 | i) install anaconda, if you don't already have it: 34 | Download Anaconda2 (suggested v5.2 for linux): https://www.continuum.io/downloads 35 | $ cd Downloads 36 | $ bash Anaconda2-5.2.0-Linux-x86_64.sh #file name might be slightly different, but follows this format 37 | 38 | Note that this install will modify the PATH variable in your bashrc. 39 | You need to open a new terminal for that path change to take place (to be able to find 'conda' in the next step). 
40 | 41 | ii) create a conda env that will contain python 3: 42 | $ conda create -n cs285_env python=3.5 43 | 44 | iii) activate the environment (do this every time you open a new terminal and want to run code): 45 | $ source activate cs285_env 46 | 47 | iv) install the requirements into this conda env 48 | $ pip install --user --requirement requirements.txt 49 | 50 | v) get the appropriate version of pytorch (1.5.0+cu101 was used here, but your version will vary based on your device) and some version of tensorflow to run tensorboard 51 | 52 | vi) allow your code to see 'cs285' 53 | $ cd 54 | $ pip install -e . 55 | 56 | Note: This conda environment must be activated every time you open a new terminal (in order to run code), but the benefit is that the required dependencies for this codebase will not affect existing/other versions of things on your computer. This stand-alone environment will have everything that is necessary. 57 | 58 | ------------------- 59 | 60 | b) Option B: 61 | 62 | i) install dependencies locally, by running: 63 | $ pip install -r requirements.txt 64 | 65 | ii) get the appropriate version of pytorch (1.5.0+cu101 was used here, but your version will vary based on your device) and some version of tensorflow to run tensorboard 66 | 67 | iii) set the path to the cs285 folder in run_hw1_behavior_cloning.py 68 | 69 | ############################################## 70 | ############################################## 71 | 72 | 4) code: 73 | 74 | Blanks to be filled in are marked with "TODO" 75 | The following files have blanks in them: 76 | - scripts/run_hw1_behavior_cloning.py 77 | - infrastructure/rl_trainer.py 78 | - agents/bc_agent.py 79 | - policies/MLP_policy.py 80 | - infrastructure/replay_buffer.py 81 | - infrastructure/utils.py 82 | 83 | NOTE - tf_utils.py was deleted in the pytorch version 84 | 85 | See the code + the hw pdf for more details. 
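As a concrete example of the kind of blank you will fill in, here is a minimal sketch of the random-batch sampler hinted at in infrastructure/replay_buffer.py (one possible approach, not necessarily the official solution; the name rand_indices is just illustrative). It follows the HINT comments in that file: draw indices with np.random.permutation and apply the same indices to each of the five component arrays, mirroring sample_recent_data:

    rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size]
    return (self.obs[rand_indices], self.acs[rand_indices], self.rews[rand_indices],
            self.next_obs[rand_indices], self.terminals[rand_indices])

The other blanks follow the same pattern: read the surrounding HINT comments and mirror the neighboring functions that are already implemented.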
86 | 87 | ############################################## 88 | ############################################## 89 | 90 | 5) run code: 91 | 92 | Run the following command(s) for Section 1 (Behavior Cloning): 93 | (All identical, one for each env) 94 | 95 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Ant.pkl --env_name Ant-v2 --exp_name test_bc_ant --n_iter 1 --expert_data cs285/expert_data/expert_data_Ant-v2.pkl 96 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/HalfCheetah.pkl --env_name HalfCheetah-v2 --exp_name test_bc_halfcheetah --n_iter 1 --expert_data cs285/expert_data/expert_data_HalfCheetah-v2.pkl 97 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Hopper.pkl --env_name Hopper-v2 --exp_name test_bc_hopper --n_iter 1 --expert_data cs285/expert_data/expert_data_Hopper-v2.pkl 98 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Humanoid.pkl --env_name Humanoid-v2 --exp_name test_bc_humanoid --n_iter 1 --expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl 99 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Walker2d.pkl --env_name Walker2d-v2 --exp_name test_bc_walker2d --n_iter 1 --expert_data cs285/expert_data/expert_data_Walker2d-v2.pkl 100 | 101 | Run the following command for Section 2 (DAGGER): 102 | (NOTE: the --do_dagger flag, and the higher value for n_iter) 103 | 104 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Walker2d.pkl --env_name Walker2d-v2 --exp_name test_dagger_walker --n_iter 10 --do_dagger --expert_data cs285/expert_data/expert_data_Walker2d-v2.pkl 105 | 106 | ############################################## 107 | 108 | 6) visualize saved tensorboard event file: 109 | 110 | $ cd cs285/data/ 111 | $ tensorboard --logdir . 
112 | 113 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) 114 | -------------------------------------------------------------------------------- /hw1/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw1/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/agents/__pycache__/base_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/agents/__pycache__/base_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/agents/__pycache__/bc_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/agents/__pycache__/bc_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/agents/bc_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | from cs285.policies.MLP_policy import * 5 | from cs285.infrastructure.replay_buffer import ReplayBuffer 6 | from cs285.infrastructure.utils import * 7 | 8 | class BCAgent: 9 | def __init__(self, env, agent_params): 10 | # init vars 11 | self.env = env 12 | self.agent_params = agent_params 13 | 14 | # actor/policy 15 | self.actor = MLPPolicySL(self.agent_params['ac_dim'], 16 | self.agent_params['ob_dim'], 17 | self.agent_params['n_layers'], 18 | self.agent_params['size'], 19 | self.agent_params['device'], 20 | discrete = self.agent_params['discrete'], 21 | learning_rate = self.agent_params['learning_rate'], 22 | ) ## TODO: look in here and implement this 23 | 24 | # replay buffer 25 | self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size']) 26 | 27 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 28 | # training a BC agent refers to updating its actor using 29 | # the given observations and corresponding action labels 30 | self.actor.update(ob_no, ac_na) ## TODO: look in here and implement this 31 | 32 | def add_to_replay_buffer(self, paths): 33 | self.replay_buffer.add_rollouts(paths) 34 | 35 | def sample(self, batch_size): 36 | return self.replay_buffer.sample_random_data(batch_size) ## TODO: look in here and implement this 37 | -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Ant-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_Ant-v2.pkl -------------------------------------------------------------------------------- 
/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Hopper-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_Hopper-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/tf_utils.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/tf_utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | import numpy as np 5 | 6 | class Logger: 7 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 8 | self._log_dir = log_dir 9 | print('########################') 10 | print('logging outputs to ', log_dir) 11 | print('########################') 12 | self._n_logged_samples = n_logged_samples 13 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 14 | 15 | def log_scalar(self, scalar, name, step_): 16 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 17 | 18 | def log_scalars(self, scalar_dict, group_name, step, phase): 19 | """Will log all scalars in the same plot.""" 20 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 21 | 22 | def log_image(self, image, name, step): 23 | assert(len(image.shape) == 3) # [C, H, W] 24 | self._summ_writer.add_image('{}'.format(name), image, step) 25 | 26 | def log_video(self, video_frames, name, step, fps=10): 27 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 28 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 29 | 30 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 31 | 32 | # reshape the rollouts 33 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 34 | 35 | # max rollout length 36 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 37 | max_length = videos[0].shape[0] 38 | for i in range(max_videos_to_save): 39 | if videos[i].shape[0]>max_length: 40 | max_length = videos[i].shape[0] 41 | 42 | # pad rollouts to all be same length 43 | for i in range(max_videos_to_save): 44 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
56 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 57 | 58 | def log_figure(self, figure, name, step, phase): 59 | """figure: matplotlib.pyplot figure handle""" 60 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 61 | 62 | def log_graph(self, array, name, step, phase): 63 | """figure: matplotlib.pyplot figure handle""" 64 | im = plot_graph(array) 65 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 66 | 67 | def dump_scalars(self, log_path=None): 68 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 69 | self._summ_writer.export_scalars_to_json(log_path) 70 | 71 | def flush(self): 72 | self._summ_writer.flush() 73 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import gym 4 | import os 5 | 6 | from cs285.infrastructure.utils import * 7 | 8 | class ReplayBuffer(object): 9 | 10 | def __init__(self, max_size=1000000): 11 | 12 | self.max_size = max_size 13 | 14 | # store each rollout 15 | self.paths = [] 16 | 17 | # store (concatenated) component arrays from each rollout 18 | self.obs = None 19 | self.acs = None 20 | self.rews = None 21 | self.next_obs = None 22 | self.terminals = None 23 | 24 | def __len__(self): 25 | if self.obs is not None: 26 | return self.obs.shape[0] 27 | else: 28 | return 0 29 | 30 | def add_rollouts(self, paths, concat_rew=True): 31 | 32 | # add new rollouts into our list of rollouts 33 | for path in paths: 34 | self.paths.append(path) 35 | 36 | # convert new rollouts into their component arrays, and append them onto our arrays 37 | observations, actions, rewards, next_observations, terminals = convert_listofrollouts(paths, concat_rew) 38 | 39 | if self.obs is None: 40 | self.obs = observations[-self.max_size:] 41 | self.acs = actions[-self.max_size:] 42 | self.rews = rewards[-self.max_size:] 43 | self.next_obs = next_observations[-self.max_size:] 44 | self.terminals = terminals[-self.max_size:] 45 | else: 46 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 47 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 48 | if concat_rew: 49 | self.rews = np.concatenate([self.rews, rewards])[-self.max_size:] 50 | else: 51 | if isinstance(rewards, list): 52 | self.rews += rewards 53 | else: 54 | self.rews.append(rewards) 55 | self.rews = self.rews[-self.max_size:] 56 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 57 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 58 | 59 | ######################################## 60 | ######################################## 61 | 62 | def sample_random_data(self, batch_size): 63 | assert self.obs.shape[0] == self.acs.shape[0] == self.rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 64 | 65 | ## TODO return batch_size number of random entries from each of the 5 component arrays above 66 | ## HINT 1: use np.random.permutation to sample random indices 67 | ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array) 68 | ## HINT 3: look at the sample_recent_data function below 69 | return TODO, TODO, TODO, TODO, TODO 70 | 71 | def sample_recent_data(self, batch_size=1): 72 | return self.obs[-batch_size:], self.acs[-batch_size:], self.rews[-batch_size:], 
self.next_obs[-batch_size:], self.terminals[-batch_size:] 73 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import scipy 4 | 5 | ############################################ 6 | ############################################ 7 | 8 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=("rgb_array")): 9 | 10 | #next two lines is a fix for the error: "GLEW initalization error: Missing GL version" 11 | #ignore if you do not recieve this error 12 | #if render: 13 | # env.render(mode = "human") 14 | 15 | # initialize env for the beginning of a new rollout 16 | ob = TODO # HINT: should be the output of resetting the env 17 | 18 | # init vars 19 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 20 | steps = 0 21 | while True: 22 | 23 | # render image of the simulated env 24 | if render: 25 | if 'rgb_array' in render_mode: 26 | if hasattr(env, 'sim'): 27 | image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) 28 | else: 29 | image_obs.append(env.render(mode=render_mode)) 30 | if 'human' in render_mode: 31 | env.render(mode=render_mode) 32 | time.sleep(env.model.opt.timestep) 33 | 34 | # use the most recent ob to decide what to do 35 | obs.append(ob) 36 | ac = TODO # HINT: query the policy's get_action function 37 | ac = ac[0] 38 | acs.append(ac) 39 | 40 | # take that action and record results 41 | ob, rew, done, _ = env.step(ac) 42 | 43 | # record result of taking that action 44 | steps += 1 45 | next_obs.append(ob) 46 | rewards.append(rew) 47 | 48 | # TODO end the rollout if the rollout ended 49 | # HINT: rollout can end due to done, or due to max_path_length 50 | rollout_done = TODO # HINT: this is either 0 or 1 51 | terminals.append(rollout_done) 52 | 53 | if rollout_done: 54 | break 55 | 56 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 57 | 58 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 59 | """ 60 | Collect rollouts until we have collected min_timesteps_per_batch steps. 61 | TODO implement this function 62 | Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths 63 | Hint2: use get_pathlength to count the timesteps collected in each path 64 | """ 65 | timesteps_this_batch = 0 66 | paths = [] 67 | while timesteps_this_batch < min_timesteps_per_batch: 68 | 69 | TODO 70 | 71 | return paths, timesteps_this_batch 72 | 73 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 74 | """ 75 | Collect ntraj rollouts. 76 | TODO implement this function 77 | Hint1: use sample_trajectory to get each path (i.e. 
rollout) that goes into paths 78 | """ 79 | paths = [] 80 | 81 | TODO 82 | 83 | return paths 84 | 85 | ############################################ 86 | ############################################ 87 | 88 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 89 | """ 90 | Take info (separate arrays) from a single rollout 91 | and return it in a single dictionary 92 | """ 93 | if image_obs != []: 94 | image_obs = np.stack(image_obs, axis=0) 95 | return {"observation" : np.array(obs, dtype=np.float32), 96 | "image_obs" : np.array(image_obs, dtype=np.uint8), 97 | "reward" : np.array(rewards, dtype=np.float32), 98 | "action" : np.array(acs, dtype=np.float32), 99 | "next_observation": np.array(next_obs, dtype=np.float32), 100 | "terminal": np.array(terminals, dtype=np.float32)} 101 | 102 | 103 | def convert_listofrollouts(paths, concat_rew=True): 104 | """ 105 | Take a list of rollout dictionaries 106 | and return separate arrays, 107 | where each array is a concatenation of that array from across the rollouts 108 | """ 109 | observations = np.concatenate([path["observation"] for path in paths]) 110 | actions = np.concatenate([path["action"] for path in paths]) 111 | if concat_rew: 112 | rewards = np.concatenate([path["reward"] for path in paths]) 113 | else: 114 | rewards = [path["reward"] for path in paths] 115 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 116 | terminals = np.concatenate([path["terminal"] for path in paths]) 117 | return observations, actions, rewards, next_observations, terminals 118 | 119 | ############################################ 120 | ############################################ 121 | 122 | def get_pathlength(path): 123 | return len(path["reward"]) 124 | -------------------------------------------------------------------------------- /hw1/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | class MLPPolicy(nn.Module): 6 | 7 | def __init__(self, 8 | ac_dim, 9 | ob_dim, 10 | n_layers, 11 | size, 12 | device, 13 | lr = 1e-4, 14 | training=True, 15 | discrete=False, # unused for now 16 | nn_baseline=False, # unused for now 17 | **kwargs): 18 | super().__init__() 19 | 20 | # init vars 21 | self.training = training 22 | self.device = device 23 | 24 | # network architecture 25 | #TODO -build the network architecture 26 | #HINT -build an nn.Modulelist() using the passed in parameters 27 | 28 | #loss and optimizer 29 | if self.training: 30 | # TODO define the loss that will be used to train this policy 31 | self.loss_func = TODO 32 | self.optimizer = torch.optim.Adam(self.parameters(), lr) 33 | 34 | self.to(device) 35 | 36 | ################################## 37 | 38 | def forward(self, x): 39 | for layer in self.mlp: 40 | x = layer(x) 41 | return x 42 | 43 | ################################## 44 | 45 | def save(self, filepath): 46 | torch.save(self.state_dict(), filepath) 47 | 48 | def restore(self, filepath): 49 | self.load_state_dict(torch.load(filepath)) 50 | 51 | ################################## 52 | 53 | # query this policy with observation(s) to get selected action(s) 54 | def get_action(self, obs): 55 | if len(obs.shape)>1: 56 | observation = obs 57 | else: 58 | observation = obs[None] 59 | 60 | # TODO return the action that the policy prescribes 61 | return TODO 62 | 63 | # update/train this policy 64 | def update(self, observations, actions): 65 | raise NotImplementedError 66 | 67 | 
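# --------------------------------------------------------------------
# Illustrative sketch (not part of the original starter code): one
# possible way to fill in the architecture TODO in __init__ above,
# assuming a plain fully connected network with Tanh activations. The
# activation choice and output head are design decisions left to you,
# so the official solution may differ:
#
#     self.mlp = nn.ModuleList()
#     in_dim = ob_dim
#     for _ in range(n_layers):
#         self.mlp.append(nn.Linear(in_dim, size))
#         self.mlp.append(nn.Tanh())
#         in_dim = size
#     self.mlp.append(nn.Linear(in_dim, ac_dim))
#
# With this layout the forward() loop above works unchanged, and a
# regression loss such as nn.MSELoss() is a natural candidate for
# self.loss_func in the supervised (behavior cloning) case.
# --------------------------------------------------------------------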
##################################################### 68 | ##################################################### 69 | 70 | class MLPPolicySL(MLPPolicy): 71 | 72 | """ 73 | This class is a special case of MLPPolicy, 74 | which is trained using supervised learning. 75 | The relevant functions to define are included below. 76 | """ 77 | 78 | def update(self, observations, actions): 79 | assert self.training, 'Policy must be created with training = true in order to perform training updates...' 80 | 81 | # TODO define network update 82 | #HINT - you need to calculate the prediction loss and then use optimizer.step() 83 | -------------------------------------------------------------------------------- /hw1/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw1/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/policies/__pycache__/base_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/__pycache__/base_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/policies/__pycache__/loaded_gaussian_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/__pycache__/loaded_gaussian_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Ant.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/Ant.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/HalfCheetah.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/HalfCheetah.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Hopper.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/Hopper.pkl 
-------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Humanoid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/Humanoid.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Walker2d.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/Walker2d.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/loaded_gaussian_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import pickle 5 | 6 | class Loaded_Gaussian_Policy(nn.Module): 7 | def __init__(self, filename, **kwargs): 8 | super().__init__() 9 | with open(filename, 'rb') as f: 10 | data = pickle.loads(f.read()) 11 | 12 | self.nonlin_type = data['nonlin_type'] 13 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 14 | 15 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 16 | self.policy_params = data[policy_type] 17 | 18 | assert set(self.policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 19 | 20 | self.obsnorm_mean = self.policy_params['obsnorm']['Standardizer']['mean_1_D'] 21 | self.obsnorm_meansq = self.policy_params['obsnorm']['Standardizer']['meansq_1_D'] 22 | layer_params = self.policy_params['hidden']['FeedforwardNet'] 23 | 24 | self.mlp = nn.ModuleList() 25 | for layer_name in sorted(layer_params.keys()): 26 | W = layer_params[layer_name]['AffineLayer']['W'].astype(np.float32) 27 | b = layer_params[layer_name]['AffineLayer']['b'].astype(np.float32) 28 | r, h = W.shape 29 | 30 | layer = nn.Linear(r,h) 31 | layer.weight.data.copy_(torch.from_numpy(W.transpose())) 32 | layer.bias.data.copy_(torch.from_numpy(b.squeeze(0))) 33 | self.mlp.append(layer) 34 | 35 | if self.nonlin_type == 'lrelu': 36 | self.mlp.append(nn.LeakyReLU()) 37 | elif self.nonlin_type == 'tanh': 38 | self.mlp.append(nn.Tanh()) 39 | else: 40 | raise NotImplementedError(self.nonlin_type) 41 | 42 | #output layer 43 | W = self.policy_params['out']['AffineLayer']['W'].astype(np.float32) 44 | b = self.policy_params['out']['AffineLayer']['b'].astype(np.float32) 45 | r, h = W.shape 46 | layer = nn.Linear(r, h) 47 | layer.weight.data.copy_(torch.from_numpy(W.transpose())) 48 | layer.bias.data.copy_(torch.from_numpy(b.squeeze(0))) 49 | self.mlp.append(layer) 50 | 51 | ################################## 52 | 53 | def obs_norm(self, obs_bo, obsnorm_mean, obsnorm_meansq): 54 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 55 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) 56 | return torch.FloatTensor(normedobs_bo).squeeze(0) 57 | 58 | ################################## 59 | 60 | def forward(self, obs): 61 | x = self.obs_norm(obs, self.obsnorm_mean, self.obsnorm_meansq) 62 | for layer in self.mlp: 63 | x = layer(x) 64 | return x 65 | 66 | ################################## 67 | 68 | def update(self, obs_no, acs_na, adv_n=None, acs_labels_na=None): 69 | print("\n\nThis policy class simply loads in a particular type of policy and 
queries it.") 70 | print("Not training procedure has been written, so do not try to train it.\n\n") 71 | raise NotImplementedError 72 | 73 | def get_action(self, obs): 74 | if len(obs.shape) > 1: 75 | observation = obs 76 | else: 77 | observation = obs[None, :] 78 | return self(obs) 79 | -------------------------------------------------------------------------------- /hw1/cs285/scripts/run_hw1_behavior_cloning.py: -------------------------------------------------------------------------------- 1 | #Uncomment next two lines and replace the path if not using anaconda 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import torch 6 | import os 7 | import time 8 | import numpy as np 9 | 10 | from cs285.infrastructure.rl_trainer import RL_Trainer 11 | from cs285.agents.bc_agent import BCAgent 12 | from cs285.policies.loaded_gaussian_policy import Loaded_Gaussian_Policy 13 | 14 | class BC_Trainer(object): 15 | def __init__(self, params): 16 | 17 | ####################### 18 | ## AGENT PARAMS 19 | ####################### 20 | 21 | agent_params = { 22 | 'n_layers': params['n_layers'], 23 | 'size': params['size'], 24 | 'learning_rate': params['learning_rate'], 25 | 'max_replay_buffer_size': params['max_replay_buffer_size'], 26 | } 27 | 28 | self.params = params 29 | self.params['agent_class'] = BCAgent ## TODO: look in here and implement this 30 | self.params['agent_params'] = agent_params 31 | 32 | ################ 33 | ## RL TRAINER 34 | ################ 35 | 36 | self.rl_trainer = RL_Trainer(self.params) ## TODO: look in here and implement this 37 | 38 | ####################### 39 | ## LOAD EXPERT POLICY 40 | ####################### 41 | 42 | print('Loading expert policy from...', self.params['expert_policy_file']) 43 | self.loaded_expert_policy = Loaded_Gaussian_Policy(self.params['expert_policy_file']) 44 | print('Done restoring expert policy...') 45 | 46 | def run_training_loop(self): 47 | 48 | self.rl_trainer.run_training_loop( 49 | n_iter=self.params['n_iter'], 50 | initial_expertdata=self.params['expert_data'], 51 | collect_policy=self.rl_trainer.agent.actor, 52 | eval_policy=self.rl_trainer.agent.actor, 53 | relabel_with_expert=self.params['do_dagger'], 54 | expert_policy=self.loaded_expert_policy, 55 | ) 56 | 57 | 58 | def main(): 59 | import argparse 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--expert_policy_file', '-epf', type=str, required=True) # relative to where you're running this script from 62 | parser.add_argument('--expert_data', '-ed', type=str, required=True) #relative to where you're running this script from 63 | parser.add_argument('--env_name', '-env', type=str, help='choices: Ant-v2, Humanoid-v2, Walker-v2, HalfCheetah-v2, Hopper-v2', required=True) 64 | parser.add_argument('--exp_name', '-exp', type=str, default='pick an experiment name', required=True) 65 | parser.add_argument('--do_dagger', action='store_true') 66 | parser.add_argument('--ep_len', type=int) 67 | 68 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=10000) # number of gradient steps for training policy (per iter in n_iter) 69 | parser.add_argument('--n_iter', '-n', type=int, default=1) 70 | 71 | parser.add_argument('--batch_size', type=int, default=1000) # training data collected (in the env) during each iteration 72 | parser.add_argument('--eval_batch_size', type=int, 73 | default=10000) # eval data collected (in the env) for logging metrics 74 | parser.add_argument('--train_batch_size', type=int, 75 | default=100) # number of sampled data points to be 
used per gradient/train step 76 | 77 | parser.add_argument('--n_layers', type=int, default=2) # depth, of policy to be learned 78 | parser.add_argument('--size', type=int, default=64) # width of each layer, of policy to be learned 79 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) # LR for supervised learning 80 | 81 | parser.add_argument('--video_log_freq', type=int, default=5) 82 | parser.add_argument('--scalar_log_freq', type=int, default=1) 83 | parser.add_argument('--use_gpu', action='store_true', default=True) 84 | parser.add_argument('--which_gpu', type=int, default=0) 85 | parser.add_argument('--max_replay_buffer_size', type=int, default=1000000) 86 | parser.add_argument('--seed', type=int, default=1) 87 | args = parser.parse_args() 88 | 89 | # convert args to dictionary 90 | params = vars(args) 91 | 92 | if torch.cuda.is_available() and params["use_gpu"]: 93 | which_gpu = "cuda:" + str(params["which_gpu"]) 94 | params["device"] = torch.device(which_gpu) 95 | print("Pytorch is running on GPU", params["which_gpu"]) 96 | else: 97 | params["device"] = torch.device("cpu") 98 | print("Pytorch is running on the CPU") 99 | 100 | ################################## 101 | ### CREATE DIRECTORY FOR LOGGING 102 | ################################## 103 | 104 | logdir_prefix = 'bc_' 105 | if args.do_dagger: 106 | logdir_prefix = 'dagger_' 107 | assert args.n_iter>1, ('DAGGER needs more than 1 iteration (n_iter>1) of training, to iteratively query the expert and train (after 1st warmstarting from behavior cloning).') 108 | else: 109 | assert args.n_iter==1, ('Vanilla behavior cloning collects expert data just once (n_iter=1)') 110 | 111 | ## directory for logging 112 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 113 | if not (os.path.exists(data_path)): 114 | os.makedirs(data_path) 115 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 116 | logdir = os.path.join(data_path, logdir) 117 | params['logdir'] = logdir 118 | if not(os.path.exists(logdir)): 119 | os.makedirs(logdir) 120 | 121 | ################### 122 | ### RUN TRAINING 123 | ################### 124 | 125 | trainer = BC_Trainer(params) 126 | trainer.run_training_loop() 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /hw1/cs285_hw1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285_hw1.pdf -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/basic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/basic -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/compile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/compile -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/derivative: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/derivative -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libglew.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libglew.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libglewegl.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libglewegl.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libglewosmesa.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libglewosmesa.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libglfw.so.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libglfw.so.3 -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libmujoco150.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libmujoco150.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libmujoco150nogl.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libmujoco150nogl.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/record: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/record -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/simulate: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/simulate -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/test -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/doc/README.txt: 
-------------------------------------------------------------------------------- 1 | Welcome to MuJoCo Pro version 1.50. 2 | 3 | The full documentation is available at http://www.mujoco.org/book 4 | The most relevant chapters are Overview, MJCF Models, and MuJoCo Pro. 5 | 6 | Here we provide brief notes to get you started: 7 | 8 | 9 | The activation key (which you should have received with your license) is a 10 | plain-text file whose path must be passed to the mj_activate() function. 11 | The code samples assume that it is called mjkey.txt in the bin directory. 12 | 13 | Once you have mjkey.txt in the bin directory, run: 14 | simulate ../model/humanoid.xml (or ./simulate on Linux and OSX) 15 | to see MuJoCo Pro in action. 16 | 17 | On Linux, you can use LD_LIBRARY_PATH to point the dynamic linker to the 18 | .so files, or copy them to a directory that is already in the linker path. 19 | On OSX, the MuJoCo Pro dynamic library is compiled with @executable_path/ 20 | to avoid the need for installation in a predefined directory. 21 | 22 | In general, the directory structure we have provided is merely a suggestion; 23 | feel free to re-organize it if needed. MuJoCo Pro does not have an installer 24 | and does not write any files outside the executable directory. 25 | 26 | The makefile in the sample directory generates binaries in the bin directory. 27 | These binaries are pre-compiled and included in the software distribution. 28 | 29 | While the software distribution contains only one model (humanoid.xml), 30 | additional models are available at http://www.mujoco.org/forum under Resources. 31 | -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/include/mjrender.h: -------------------------------------------------------------------------------- 1 | //---------------------------------// 2 | // This file is part of MuJoCo // 3 | // Written by Emo Todorov // 4 | // Copyright (C) 2017 Roboti LLC // 5 | //---------------------------------// 6 | 7 | 8 | #pragma once 9 | 10 | 11 | typedef enum _mjtGridPos // grid position for overlay 12 | { 13 | mjGRID_TOPLEFT = 0, // top left 14 | mjGRID_TOPRIGHT, // top right 15 | mjGRID_BOTTOMLEFT, // bottom left 16 | mjGRID_BOTTOMRIGHT // bottom right 17 | } mjtGridPos; 18 | 19 | 20 | typedef enum _mjtFramebuffer // OpenGL framebuffer option 21 | { 22 | mjFB_WINDOW = 0, // default/window buffer 23 | mjFB_OFFSCREEN // offscreen buffer 24 | } mjtFramebuffer; 25 | 26 | 27 | typedef enum _mjtFontScale // font scale, used at context creation 28 | { 29 | mjFONTSCALE_100 = 100, // normal scale, suitable in the absence of DPI scaling 30 | mjFONTSCALE_150 = 150, // 150% scale 31 | mjFONTSCALE_200 = 200 // 200% scale 32 | } mjtFontScale; 33 | 34 | 35 | typedef enum _mjtFont // font type, used at each text operation 36 | { 37 | mjFONT_NORMAL = 0, // normal font 38 | mjFONT_SHADOW, // normal font with shadow (for higher contrast) 39 | mjFONT_BIG // big font (for user alerts) 40 | } mjtFont; 41 | 42 | 43 | struct _mjrRect // OpenGL rectangle 44 | { 45 | int left; // left (usually 0) 46 | int bottom; // bottom (usually 0) 47 | int width; // width (usually buffer width) 48 | int height; // height (usually buffer height) 49 | }; 50 | typedef struct _mjrRect mjrRect; 51 | 52 | 53 | struct _mjrContext // custom OpenGL context 54 | { 55 | // parameters copied from mjVisual 56 | float lineWidth; // line width for wireframe rendering 57 | float shadowClip; // clipping radius for directional lights 58 | float shadowScale; // 
fraction of light cutoff for spot lights 59 | int shadowSize; // size of shadow map texture 60 | int offWidth; // width of offscreen buffer 61 | int offHeight; // height of offscreen buffer 62 | int offSamples; // number of offscreen buffer multisamples 63 | 64 | // offscreen rendering objects 65 | unsigned int offFBO; // offscreen framebuffer object 66 | unsigned int offFBO_r; // offscreen framebuffer for resolving multisamples 67 | unsigned int offColor; // offscreen color buffer 68 | unsigned int offColor_r; // offscreen color buffer for resolving multisamples 69 | unsigned int offDepthStencil; // offscreen depth and stencil buffer 70 | unsigned int offDepthStencil_r; // offscreen depth and stencil buffer for resolving multisamples 71 | 72 | // shadow rendering objects 73 | unsigned int shadowFBO; // shadow map framebuffer object 74 | unsigned int shadowTex; // shadow map texture 75 | 76 | // texture objects and info 77 | int ntexture; // number of allocated textures 78 | int textureType[100]; // type of texture (mjtTexture) 79 | unsigned int texture[100]; // texture names 80 | 81 | // displaylist starting positions 82 | unsigned int basePlane; // all planes from model 83 | unsigned int baseMesh; // all meshes from model 84 | unsigned int baseHField; // all hfields from model 85 | unsigned int baseBuiltin; // all buildin geoms, with quality from model 86 | unsigned int baseFontNormal; // normal font 87 | unsigned int baseFontShadow; // shadow font 88 | unsigned int baseFontBig; // big font 89 | 90 | // displaylist ranges 91 | int rangePlane; // all planes from model 92 | int rangeMesh; // all meshes from model 93 | int rangeHField; // all hfields from model 94 | int rangeBuiltin; // all builtin geoms, with quality from model 95 | int rangeFont; // all characters in font 96 | 97 | // character info 98 | int charWidth[127]; // character widths: normal and shadow 99 | int charWidthBig[127]; // chacarter widths: big 100 | int charHeight; // character heights: normal and shadow 101 | int charHeightBig; // character heights: big 102 | 103 | // capabilities 104 | int glewInitialized; // is glew initialized 105 | int windowAvailable; // is default/window framebuffer available 106 | int windowSamples; // number of samples for default/window framebuffer 107 | int windowStereo; // is stereo available for default/window framebuffer 108 | int windowDoublebuffer; // is default/window framebuffer double buffered 109 | 110 | // only field that changes after mjr_makeContext 111 | int currentBuffer; // currently active framebuffer: mjFB_WINDOW or mjFB_OFFSCREEN 112 | }; 113 | typedef struct _mjrContext mjrContext; 114 | 115 | -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/sample/compile.cpp: -------------------------------------------------------------------------------- 1 | //---------------------------------// 2 | // This file is part of MuJoCo // 3 | // Written by Emo Todorov // 4 | // Copyright (C) 2017 Roboti LLC // 5 | //---------------------------------// 6 | 7 | 8 | #include "mujoco.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | // help 16 | const char helpstring[] = 17 | "\n Usage: compile infile outfile\n" 18 | " infile can be in mjcf, urdf, mjb format\n" 19 | " outfile can be in mjcf, mjb, txt format\n\n" 20 | " Example: compile model.xml model.mjb\n"; 21 | 22 | 23 | // deallocate and print message 24 | int finish(const char* msg = 0, mjModel* m = 0) 25 | { 26 | // deallocated everything 27 | if( m ) 28 | 
mj_deleteModel(m); 29 | mj_deactivate(); 30 | 31 | // print message 32 | if( msg ) 33 | printf("%s\n", msg); 34 | 35 | return 0; 36 | } 37 | 38 | 39 | // possible file types 40 | enum 41 | { 42 | typeUNKNOWN = 0, 43 | typeXML, 44 | typeMJB, 45 | typeTXT 46 | }; 47 | 48 | 49 | // determine file type 50 | int filetype(const char* filename) 51 | { 52 | // convert to lower case for string comparison 53 | char lower[1000]; 54 | size_t i=0; 55 | while( i=0 && lower[dot]!='.' ) 65 | dot--; 66 | 67 | // no dot found 68 | if( dot<0 ) 69 | return typeUNKNOWN; 70 | 71 | // check extension 72 | if( !strcmp(lower+dot, ".xml") || !strcmp(lower+dot, ".urdf") ) 73 | return typeXML; 74 | else if( !strcmp(lower+dot, ".mjb") ) 75 | return typeMJB; 76 | else if( !strcmp(lower+dot, ".txt") ) 77 | return typeTXT; 78 | else 79 | return typeUNKNOWN; 80 | } 81 | 82 | 83 | 84 | // main function 85 | int main(int argc, const char** argv) 86 | { 87 | // model and error 88 | mjModel* m = 0; 89 | char error[1000]; 90 | 91 | // print help if arguments are missing 92 | if( argc!=3 ) 93 | return finish(helpstring); 94 | 95 | // activate MuJoCo Pro license (this must be *your* activation key) 96 | mj_activate("mjkey.txt"); 97 | 98 | // determine file types 99 | int type1 = filetype(argv[1]); 100 | int type2 = filetype(argv[2]); 101 | 102 | // check types 103 | if( type1==typeUNKNOWN || type1==typeTXT || 104 | type2==typeUNKNOWN || (type1==typeMJB && type2==typeXML) ) 105 | return finish("Illegal combination of file formats"); 106 | 107 | // make sure output file does not exist 108 | FILE* fp = fopen(argv[2], "r"); 109 | if( fp ) 110 | { 111 | fclose(fp); 112 | return finish("Output file already exists"); 113 | } 114 | 115 | // load model 116 | if( type1==typeXML ) 117 | m = mj_loadXML(argv[1], 0, error, 1000); 118 | else 119 | m = mj_loadModel(argv[1], 0); 120 | 121 | // check error 122 | if( !m ) 123 | { 124 | if( type1==typeXML ) 125 | return finish(error, 0); 126 | else 127 | return finish("Could not load model", 0); 128 | } 129 | 130 | // save model 131 | if( type2==typeXML ) 132 | { 133 | if( mj_saveLastXML(argv[2], m, error, 1000) ) 134 | return finish(error, m); 135 | } 136 | else if( type2==typeMJB ) 137 | mj_saveModel(m, argv[2], 0, 0); 138 | else 139 | mj_printModel(m, argv[2]); 140 | 141 | // finalize 142 | return finish("Done", m); 143 | } 144 | -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/sample/makefile: -------------------------------------------------------------------------------- 1 | COMMON=-O2 -I../include -L../bin -std=c++11 -mavx 2 | 3 | default: 4 | g++ $(COMMON) test.cpp -lmujoco150nogl -o ../bin/test 5 | g++ $(COMMON) compile.cpp -lmujoco150nogl -o ../bin/compile 6 | g++ $(COMMON) derivative.cpp -lmujoco150nogl -fopenmp -o ../bin/derivative 7 | g++ $(COMMON) simulate.cpp -lmujoco150 -lGL -lglew ../bin/libglfw.so.3 -o ../bin/simulate 8 | g++ $(COMMON) record.cpp -lmujoco150 -lGL -lglew ../bin/libglfw.so.3 -o ../bin/record 9 | g++ $(COMMON) basic.cpp -lmujoco150 -lGL -lglew ../bin/libglfw.so.3 -o ../bin/basic 10 | 11 | egl: 12 | g++ $(COMMON) -DMJ_EGL record.cpp -lmujoco150 -lOpenGL -lEGL -lglewegl -o ../bin/recordegl 13 | 14 | osmesa: 15 | g++ $(COMMON) -DMJ_OSMESA record.cpp -lmujoco150 -lOSMesa -lglewosmesa -o ../bin/recordosmesa 16 | 17 | all: default egl osmesa 18 | -------------------------------------------------------------------------------- /hw1/requirements.txt: 
-------------------------------------------------------------------------------- 1 | gym==0.10.11 2 | mujoco-py==1.50.1.35 3 | matplotlib==2.2.2 4 | ipython==6.4.0 5 | moviepy==1.0.0 -------------------------------------------------------------------------------- /hw1/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw2/README.txt: -------------------------------------------------------------------------------- 1 | 2 | 1) See hw1 if you'd like to see installation instructions. You do NOT have to redo them. 3 | 4 | 5 | ############################################## 6 | ############################################## 7 | 8 | 9 | 2) Code: 10 | 11 | ------------------------------------------- 12 | 13 | Files to look at, even though there are no explicit 'TODO' markings: 14 | - scripts/run_hw2_policy_gradient.py 15 | 16 | ------------------------------------------- 17 | 18 | Relevant Code from the first HW has already been filled in in the following files: 19 | - infrastructure/rl_trainer.py 20 | - infrastructure/utils.py 21 | - policies/MLP_policy.py 22 | 23 | ------------------------------------------- 24 | 25 | Blanks to be filled in now (for this assignment) are marked with 'TODO' 26 | 27 | The following files have these: 28 | - agents/pg_agent.py 29 | - policies/MLP_policy.py 30 | 31 | 32 | ############################################## 33 | ############################################## 34 | 35 | 36 | 3) Run code with the following command: 37 | 38 | $ python cs285/scripts/run_hw2_policy_gradient.py --env_name CartPole-v1 --exp_name test_pg_cartpole 39 | $ python cs285/scripts/run_hw2_policy_gradient.py --env_name InvertedPendulum-v2 --exp_name test_pg_pendulum 40 | 41 | Flags of relevance, when running the commands above (see pdf for more info): 42 | -n number of policy training iterations 43 | -rtg use reward_to_go for the value 44 | -dsa do not standardize the advantage values 45 | 46 | ############################################## 47 | 48 | 49 | 4) Visualize saved tensorboard event file: 50 | 51 | $ cd cs285/data/ 52 | $ tensorboard --logdir . 
53 | 54 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) 55 | 56 | -------------------------------------------------------------------------------- /hw2/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw2/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/agents/__pycache__/pg_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/agents/__pycache__/pg_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/logger.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | import numpy as np 5 | 6 | class Logger: 7 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 8 | self._log_dir = log_dir 9 | print('########################') 10 | print('logging outputs to ', log_dir) 11 | print('########################') 12 | self._n_logged_samples = n_logged_samples 13 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 14 | 15 | def log_scalar(self, scalar, name, step_): 16 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 17 | 18 | def log_scalars(self, scalar_dict, group_name, step, phase): 19 | """Will log all scalars in the same plot.""" 20 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 21 | 22 | def log_image(self, image, name, step): 23 | assert(len(image.shape) == 3) # [C, H, W] 24 | self._summ_writer.add_image('{}'.format(name), image, step) 25 | 26 | def log_video(self, video_frames, name, step, fps=10): 27 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 28 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 29 | 30 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 31 | 32 | # reshape the rollouts 33 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 34 | 35 | # max rollout length 36 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 37 | max_length = videos[0].shape[0] 38 | for i in range(max_videos_to_save): 39 | if videos[i].shape[0]>max_length: 40 | max_length = videos[i].shape[0] 41 | 42 | # pad rollouts to all be same length 43 | for i in range(max_videos_to_save): 44 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
55 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 56 | 57 | def log_figure(self, figure, name, step, phase): 58 | """figure: matplotlib.pyplot figure handle""" 59 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 60 | 61 | def log_graph(self, array, name, step, phase): 62 | """figure: matplotlib.pyplot figure handle""" 63 | im = plot_graph(array) 64 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 65 | 66 | def dump_scalars(self, log_path=None): 67 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 68 | self._summ_writer.export_scalars_to_json(log_path) 69 | 70 | def flush(self): 71 | self._summ_writer.flush() 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure.utils import * 4 | 5 | class ReplayBuffer(object): 6 | 7 | def __init__(self, max_size=1000000): 8 | 9 | self.max_size = max_size 10 | self.paths = [] 11 | self.obs = None 12 | self.acs = None 13 | self.concatenated_rews = None 14 | self.unconcatenated_rews = None 15 | self.next_obs = None 16 | self.terminals = None 17 | 18 | def add_rollouts(self, paths): 19 | 20 | # add new rollouts into our list of rollouts 21 | for path in paths: 22 | self.paths.append(path) 23 | 24 | # convert new rollouts into their component arrays, and append them onto our arrays 25 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 26 | 27 | if self.obs is None: 28 | self.obs = observations[-self.max_size:] 29 | self.acs = actions[-self.max_size:] 30 | self.next_obs = next_observations[-self.max_size:] 31 | self.terminals = terminals[-self.max_size:] 32 | self.concatenated_rews = concatenated_rews[-self.max_size:] 33 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 34 | else: 35 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 36 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 37 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 38 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 39 | self.concatenated_rews = np.concatenate([self.concatenated_rews, concatenated_rews])[-self.max_size:] 40 | if isinstance(unconcatenated_rews, list): 41 | self.unconcatenated_rews += unconcatenated_rews 42 | else: 43 | self.unconcatenated_rews.append(unconcatenated_rews) 44 | 45 | ######################################## 46 | ######################################## 47 | 48 | def sample_random_rollouts(self, num_rollouts): 49 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 50 | return self.paths[rand_indices] 51 | 52 | def sample_recent_rollouts(self, num_rollouts=1): 53 | return self.paths[-num_rollouts:] 54 | 55 | ######################################## 56 | ######################################## 57 | 58 | def sample_random_data(self, batch_size): 59 | 60 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 61 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 62 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], 
self.terminals[rand_indices] 63 | 64 | def sample_recent_data(self, batch_size=1, concat_rew=True): 65 | 66 | if concat_rew: 67 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 68 | else: 69 | num_recent_rollouts_to_return = 0 70 | num_datapoints_so_far = 0 71 | index = -1 72 | while num_datapoints_so_far < batch_size: 73 | recent_rollout = self.paths[index] 74 | index -=1 75 | num_recent_rollouts_to_return +=1 76 | num_datapoints_so_far += get_pathlength(recent_rollout) 77 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 78 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 79 | return observations, actions, unconcatenated_rews, next_observations, terminals -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | ############################################ 5 | ############################################ 6 | 7 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 8 | 9 | if render: 10 | env.render(mode = "human") 11 | 12 | # initialize env for the beginning of a new rollout 13 | ob = env.reset() # TODO: GETTHIS from HW1 14 | 15 | # init vars 16 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 17 | steps = 0 18 | while True: 19 | 20 | # render image of the simulated env 21 | if render: 22 | if 'rgb_array' in render_mode: 23 | if hasattr(env, 'sim'): 24 | if 'track' in env.env.model.camera_names: 25 | image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) 26 | else: 27 | image_obs.append(env.sim.render(height=500, width=500)[::-1]) 28 | else: 29 | image_obs.append(env.render(mode=render_mode)) 30 | if 'human' in render_mode: 31 | env.render(mode=render_mode) 32 | time.sleep(env.model.opt.timestep) 33 | 34 | # use the most recent ob to decide what to do 35 | obs.append(ob) 36 | ac = policy.get_action(ob) 37 | acs.append(ac) 38 | 39 | # take that action and record results 40 | ob, rew, done, _ = env.step(ac) 41 | 42 | # record result of taking that action 43 | steps += 1 44 | next_obs.append(ob) 45 | rewards.append(rew) 46 | 47 | # End the rollout if the rollout ended 48 | # Note that the rollout can end due to done, or due to max_path_length 49 | rollout_done = done or steps >= max_path_length 50 | terminals.append(rollout_done) 51 | 52 | if rollout_done: 53 | break 54 | 55 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 56 | 57 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 58 | 59 | timesteps_left = min_timesteps_per_batch 60 | timesteps_this_batch = 0 61 | paths = [] 62 | 63 | while timesteps_this_batch < min_timesteps_per_batch: 64 | paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) 65 | timesteps_this_batch += get_pathlength(paths[-1]) 66 | 67 | return paths, timesteps_this_batch 68 | 69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 70 | paths = [] 71 | for n in range(ntraj): 72 | paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) 73 | 74 | return paths 75 | 76 | 
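# Editor's note: a minimal, commented-out usage sketch of the sampling helpers
# above. 'CartPole-v1' and RandomPolicy are illustrative assumptions, not part
# of the assignment; any object exposing get_action(ob) can serve as the policy.
#
#     import gym
#     env = gym.make('CartPole-v1')
#
#     class RandomPolicy:
#         def get_action(self, ob):
#             return env.action_space.sample()
#
#     paths, n_steps = sample_trajectories(
#         env, RandomPolicy(), min_timesteps_per_batch=1000, max_path_length=200)
#     print(len(paths), 'rollouts,', n_steps, 'env steps in this batch')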
############################################ 77 | ############################################ 78 | 79 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 80 | """ 81 | Take info (separate arrays) from a single rollout 82 | and return it in a single dictionary 83 | """ 84 | if image_obs != []: 85 | image_obs = np.stack(image_obs, axis=0) 86 | return {"observation" : np.array(obs, dtype=np.float32), 87 | "image_obs" : np.array(image_obs, dtype=np.uint8), 88 | "reward" : np.array(rewards, dtype=np.float32), 89 | "action" : np.array(acs, dtype=np.float32), 90 | "next_observation": np.array(next_obs, dtype=np.float32), 91 | "terminal": np.array(terminals, dtype=np.float32)} 92 | 93 | 94 | def convert_listofrollouts(paths): 95 | """ 96 | Take a list of rollout dictionaries 97 | and return separate arrays, 98 | where each array is a concatenation of that array from across the rollouts 99 | """ 100 | observations = np.concatenate([path["observation"] for path in paths]) 101 | actions = np.concatenate([path["action"] for path in paths]) 102 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 103 | terminals = np.concatenate([path["terminal"] for path in paths]) 104 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 105 | unconcatenated_rewards = [path["reward"] for path in paths] 106 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 107 | 108 | ############################################ 109 | ############################################ 110 | 111 | def get_pathlength(path): 112 | return len(path["reward"]) 113 | -------------------------------------------------------------------------------- /hw2/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | class MLP(nn.Module): 6 | def __init__(self, 7 | ac_dim, 8 | ob_dim, 9 | n_layers, 10 | size, 11 | device, 12 | discrete, 13 | activation = nn.Tanh()): 14 | super().__init__() 15 | 16 | self.discrete = discrete 17 | 18 | #TODO -build the network architecture -can be taken from HW1 19 | #HINT -build an nn.Modulelist() using the passed in parameters 20 | 21 | #if continuous define logstd variable 22 | if not self.discrete: 23 | self.logstd = nn.Parameter(torch.zeros(ac_dim)) 24 | 25 | self.to(device) 26 | 27 | def forward(self, x): 28 | for layer in self.mlp: 29 | x = layer(x) 30 | if self.discrete: 31 | return x 32 | else: 33 | return (x, self.logstd.exp()) 34 | 35 | class MLPPolicy: 36 | def __init__(self, 37 | ac_dim, 38 | ob_dim, 39 | n_layers, 40 | size, 41 | device, 42 | learning_rate, 43 | training=True, 44 | discrete=False, 45 | nn_baseline=False, 46 | **kwargs): 47 | super().__init__() 48 | 49 | # init vars 50 | self.device = device 51 | self.discrete = discrete 52 | self.training = training 53 | self.nn_baseline = nn_baseline 54 | 55 | # network architecture 56 | self.policy_mlp = MLP(ac_dim, ob_dim, n_layers, size, device, discrete) 57 | params = list(self.policy_mlp.parameters()) 58 | if self.nn_baseline: 59 | self.baseline_mlp = MLP(1, ob_dim, n_layers, size, device, True) 60 | params += list(self.baseline_mlp.parameters()) 61 | 62 | #optimizer 63 | if self.training: 64 | self.optimizer = torch.optim.Adam(params, lr = learning_rate) 65 | 66 | ################################## 67 | 68 | # update/train this policy 69 | def update(self, observations, actions): 70 | raise NotImplementedError 
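    # Editor's note on the TODO in MLP.__init__ above: the hinted nn.ModuleList
    # construction mirrors the forward() loop already given (which iterates over
    # self.mlp) and matches the MLP provided later in hw3's infrastructure/models.py:
    #
    #     self.mlp = nn.ModuleList()
    #     self.mlp.append(nn.Linear(ob_dim, size))      # first hidden layer
    #     self.mlp.append(activation)
    #     for _ in range(n_layers - 1):                 # additional hidden layers
    #         self.mlp.append(nn.Linear(size, size))
    #         self.mlp.append(activation)
    #     self.mlp.append(nn.Linear(size, ac_dim))      # output layer, no activation
    #
    # This is one way to satisfy the hint, not the only valid architecture.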
71 | 72 | # query the neural net that's our 'policy' function, as defined by the policy_mlp above 73 | # query the policy with observation(s) to get selected action(s) 74 | def get_action(self, obs): 75 | raise NotImplementedError 76 | #implement similar to HW1 77 | 78 | def get_log_prob(self, network_outputs, actions_taken): 79 | actions_taken = torch.Tensor(actions_taken).to(self.device) 80 | if self.discrete: 81 | #log probability under a categorical distribution 82 | network_outputs = nn.functional.log_softmax(network_outputs).exp() 83 | return torch.distributions.Categorical(network_outputs).log_prob(actions_taken) 84 | else: 85 | #log probability under a multivariate gaussian 86 | return torch.distributions.Normal(network_outputs[0], network_outputs[1]).log_prob(actions_taken).sum(-1) 87 | 88 | ##################################################### 89 | ##################################################### 90 | 91 | class MLPPolicyPG(MLPPolicy): 92 | 93 | def update(self, observations, acs_na, adv_n = None, acs_labels_na = None, qvals = None): 94 | policy_output = self.policy_mlp(torch.Tensor(observations).to(self.device)) 95 | logprob_pi = self.get_log_prob(policy_output, acs_na) 96 | 97 | #TODO Don't forget to zero out the gradient 98 | 99 | # TODO: define the loss that should be optimized when training a policy with policy gradient 100 | # HINT1: Recall that the expression that we want to MAXIMIZE 101 | # is the expectation over collected trajectories of: 102 | # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]] 103 | # HINT2: look at logprob_pi above 104 | # HINT3: don't forget that we need to MINIMIZE this self.loss 105 | # but the equation above is something that should be maximized 106 | #HINT4: Don't forget to propagate the loss backward 107 | 108 | if self.nn_baseline: 109 | baseline_prediction = self.baseline_mlp(torch.Tensor(observations).to(self.device)).view(-1) 110 | baseline_target = torch.Tensor((qvals - qvals.mean()) / (qvals.std() + 1e-8)).to(self.device) 111 | 112 | # TODO: define the loss that should be optimized for training the baseline 113 | # HINT1: use nn.functional.mse_loss, similar to SL loss from hw1 114 | # HINT2: we want predictions (baseline_prediction) to be as close as possible to the labels (baseline_target) 115 | # HINT3: Don't forget to propagate the loss backward 116 | 117 | #step the optimizer 118 | return loss 119 | 120 | ##################################################### 121 | ##################################################### 122 | -------------------------------------------------------------------------------- /hw2/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw2/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/policies/__pycache__/__init__.cpython-37.pyc 
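Editor's note: the TODO hints in MLPPolicyPG.update (hw2/cs285/policies/MLP_policy.py above) describe maximizing sum_t [log pi(a_t|s_t) * (Q_t - b_t)], i.e. minimizing its negative. A minimal sketch of that update step, reusing the surrounding starter code's names (logprob_pi, adv_n, baseline_prediction, baseline_target) and assuming adv_n is a numpy array aligned with logprob_pi:

    self.optimizer.zero_grad()
    advantages = torch.Tensor(adv_n).to(self.device)
    loss = -(logprob_pi * advantages).sum()    # negate: the optimizer minimizes
    loss.backward()

    if self.nn_baseline:
        baseline_loss = nn.functional.mse_loss(baseline_prediction, baseline_target)
        baseline_loss.backward()

    self.optimizer.step()
    return loss

This is one possible completion, not the official solution; summing versus averaging over timesteps only rescales the gradient and is absorbed by the learning rate.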
-------------------------------------------------------------------------------- /hw2/cs285/scripts/run_hw2_policy_gradient.py: -------------------------------------------------------------------------------- 1 | #Uncomment next two lines and replace the path if not using anaconda 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import torch 6 | import os 7 | import time 8 | 9 | from cs285.infrastructure.rl_trainer import RL_Trainer 10 | from cs285.agents.pg_agent import PGAgent 11 | 12 | class PG_Trainer(object): 13 | 14 | def __init__(self, params): 15 | 16 | ##################### 17 | ## SET AGENT PARAMS 18 | ##################### 19 | 20 | computation_graph_args = { 21 | 'n_layers': params['n_layers'], 22 | 'size': params['size'], 23 | 'learning_rate': params['learning_rate'], 24 | 'device': params['device'], 25 | } 26 | 27 | estimate_advantage_args = { 28 | 'gamma': params['discount'], 29 | 'standardize_advantages': not(params['dont_standardize_advantages']), 30 | 'reward_to_go': params['reward_to_go'], 31 | 'nn_baseline': params['nn_baseline'], 32 | } 33 | 34 | train_args = { 35 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 36 | } 37 | 38 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} 39 | 40 | self.params = params 41 | self.params['agent_class'] = PGAgent 42 | self.params['agent_params'] = agent_params 43 | self.params['batch_size_initial'] = self.params['batch_size'] 44 | 45 | ################ 46 | ## RL TRAINER 47 | ################ 48 | 49 | self.rl_trainer = RL_Trainer(self.params) 50 | 51 | def run_training_loop(self): 52 | 53 | self.rl_trainer.run_training_loop( 54 | self.params['n_iter'], 55 | collect_policy = self.rl_trainer.agent.actor, 56 | eval_policy = self.rl_trainer.agent.actor, 57 | ) 58 | 59 | 60 | def main(): 61 | 62 | import argparse 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--env_name', type=str) 65 | parser.add_argument('--exp_name', type=str, default='todo') 66 | parser.add_argument('--n_iter', '-n', type=int, default=200) 67 | 68 | parser.add_argument('--reward_to_go', '-rtg', action='store_true') 69 | parser.add_argument('--nn_baseline', action='store_true') 70 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true') 71 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration 72 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 73 | 74 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 75 | parser.add_argument('--discount', type=float, default=1) 76 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) 77 | parser.add_argument('--n_layers', '-l', type=int, default=2) 78 | parser.add_argument('--size', '-s', type=int, default=64) 79 | 80 | parser.add_argument('--ep_len', type=int) #students shouldn't change this away from env's default 81 | parser.add_argument('--seed', type=int, default=1) 82 | parser.add_argument('--use_gpu', '-gpu', default = True) 83 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 84 | parser.add_argument('--video_log_freq', type=int, default=-1) # video log disabled 85 | parser.add_argument('--scalar_log_freq', type=int, default=1) 86 | 87 | parser.add_argument('--save_params', action='store_true') 88 | 89 | args = parser.parse_args() 90 | 91 | # convert to dictionary 92 | params = vars(args) 93 | 94 | if torch.cuda.is_available() and params["use_gpu"]: 95 
| which_gpu = "cuda:" + str(params["which_gpu"]) 96 | params["device"] = torch.device(which_gpu) 97 | print("Pytorch is running on GPU", params["which_gpu"]) 98 | else: 99 | params["device"] = torch.device("cpu") 100 | print("Pytorch is running on the CPU") 101 | 102 | # for this assignment, we train on everything we recently collected 103 | # so making train_batch_size=batch_size 104 | params['train_batch_size']=params['batch_size'] 105 | 106 | ################################## 107 | ### CREATE DIRECTORY FOR LOGGING 108 | ################################## 109 | 110 | logdir_prefix = 'pg_' 111 | 112 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 113 | 114 | if not (os.path.exists(data_path)): 115 | os.makedirs(data_path) 116 | 117 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 118 | logdir = os.path.join(data_path, logdir) 119 | params['logdir'] = logdir 120 | if not(os.path.exists(logdir)): 121 | os.makedirs(logdir) 122 | 123 | ################### 124 | ### RUN TRAINING 125 | ################### 126 | 127 | trainer = PG_Trainer(params) 128 | trainer.run_training_loop() 129 | 130 | 131 | if __name__ == "__main__": 132 | main() 133 | -------------------------------------------------------------------------------- /hw2/cs285_hw2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285_hw2.pdf -------------------------------------------------------------------------------- /hw2/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.11 2 | mujoco-py==1.50.1.35 3 | matplotlib==2.2.2 4 | ipython==6.4.0 5 | moviepy==1.0.0 -------------------------------------------------------------------------------- /hw2/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw3/README.txt: -------------------------------------------------------------------------------- 1 | 2 | 1) See hw1 if you'd like to see installation instructions. You do NOT have to redo them. But, you need to install OpenCV for this assignment: 3 | `pip install opencv-python==3.4.0.12` 4 | 5 | You also need to replace `/gym/envs/box2d/lunar_lander.py` with the provided `lunar_lander.py` file. 
To find the file: 6 | $ locate lunar_lander.py 7 | (or if there are multiple options there): 8 | $ source activate cs285_env 9 | $ ipython 10 | $ import gym 11 | $ gym.__file__ 12 | /gym/__init__.py 13 | ############################################## 14 | ############################################## 15 | 16 | 17 | 2) Code: 18 | 19 | ------------------------------------------- 20 | 21 | Files to look at, even though there are no explicit 'TODO' markings: 22 | - scripts/run_hw3_dqn.py 23 | - scripts/run_hw3_actor_critic.py 24 | - infrastructure/models.py 25 | - policies/dqn_utils.py 26 | - policies/MLP_policy.py 27 | 28 | ------------------------------------------- 29 | 30 | Blanks to be filled in now (for this assignment) are marked with 'TODO' 31 | 32 | The following files have these: 33 | - critics/dqn_critic.py 34 | - agents/dqn_agent.py 35 | - policies/argmax_policy.py 36 | - critics/bootstrapped_continuous_critic.py 37 | - agents/ac_agent.py 38 | 39 | ############################################## 40 | ############################################## 41 | 42 | 43 | 3) Run code with the following command: 44 | 45 | $ python cs285/scripts/run_hw3_dqn.py --env_name PongNoFrameskip-v4 --exp_name test_pong 46 | $ python cs285/scripts/run_hw3_actor_critic.py --env_name CartPole-v0 -n 100 -b 1000 --exp_name 100_1 -ntu 100 -ngsptu 1 47 | 48 | Flags of relevance, when running the commands above (see pdf for more info): 49 | -double_q Whether to use double Q learning or not. 50 | 51 | ############################################## 52 | 53 | 54 | 4) Visualize saved tensorboard event file: 55 | 56 | $ cd cs285/data/ 57 | $ tensorboard --logdir . 58 | 59 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) 60 | -------------------------------------------------------------------------------- /hw3/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw3/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/agents/__pycache__/ac_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/agents/__pycache__/ac_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/agents/__pycache__/dqn_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/agents/__pycache__/dqn_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/agents/ac_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from collections import OrderedDict 4 | 5 | from cs285.policies.MLP_policy import MLPPolicyAC 6 | from 
cs285.critics.bootstrapped_continuous_critic import BootstrappedContinuousCritic 7 | from cs285.infrastructure.replay_buffer import ReplayBuffer 8 | from cs285.infrastructure.utils import * 9 | 10 | class ACAgent: 11 | def __init__(self, env, agent_params): 12 | super(ACAgent, self).__init__() 13 | 14 | self.env = env 15 | self.agent_params = agent_params 16 | self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update'] 17 | self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']3 18 | self.device = agent_params['device'] 19 | 20 | self.gamma = self.agent_params['gamma'] 21 | self.standardize_advantages = self.agent_params['standardize_advantages'] 22 | 23 | self.actor = MLPPolicyAC(self.agent_params['ac_dim'], 24 | self.agent_params['ob_dim'], 25 | self.agent_params['n_layers'], 26 | self.agent_params['size'], 27 | self.agent_params['device'], 28 | discrete=self.agent_params['discrete'], 29 | learning_rate=self.agent_params['learning_rate'], 30 | ) 31 | self.critic = BootstrappedContinuousCritic(self.agent_params) 32 | 33 | self.replay_buffer = ReplayBuffer() 34 | 35 | def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): 36 | ob, next_ob, rew, done = map(lambda x: torch.from_numpy(x).to(self.device), [ob_no, next_ob_no, re_n, terminal_n]) 37 | 38 | # TODO Implement the following pseudocode: 39 | # 1) query the critic with ob_no, to get V(s) 40 | # 2) query the critic with next_ob_no, to get V(s') 41 | # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') 42 | # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) 43 | # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) 44 | 45 | adv_n = TODO 46 | 47 | if self.standardize_advantages: 48 | adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) 49 | return adv_n 50 | 51 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 52 | 53 | # TODO Implement the following pseudocode: 54 | # for agent_params['num_critic_updates_per_agent_update'] steps, 55 | # update the critic 56 | 57 | # advantage = estimate_advantage(...) 
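        # Editor's note on estimate_advantage above (a hedged sketch, not the official
        # solution); it assumes the critic's value network is reachable as
        # self.critic.value_func and returns a [N, 1] tensor:
        #
        #     v_s  = self.critic.value_func(ob).squeeze()
        #     v_sp = self.critic.value_func(next_ob).squeeze()
        #     q_n  = rew + self.gamma * v_sp * (1 - done)
        #     adv_n = (q_n - v_s).detach().cpu().numpy()
        #
        # Multiplying V(s') by (1 - done) implements the hint about cutting it off
        # at terminal states.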
58 | 59 | # for agent_params['num_actor_updates_per_agent_update'] steps, 60 | # update the actor 61 | 62 | TODO 63 | 64 | loss = OrderedDict() 65 | loss['Critic_Loss'] = TODO # put final critic loss here 66 | loss['Actor_Loss'] = TODO # put final actor loss here 67 | return loss 68 | 69 | def add_to_replay_buffer(self, paths): 70 | self.replay_buffer.add_rollouts(paths) 71 | 72 | def sample(self, batch_size): 73 | return self.replay_buffer.sample_recent_data(batch_size) 74 | -------------------------------------------------------------------------------- /hw3/cs285/agents/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule 5 | from cs285.policies.argmax_policy import ArgMaxPolicy 6 | from cs285.critics.dqn_critic import DQNCritic 7 | 8 | 9 | class DQNAgent(object): 10 | def __init__(self, env, agent_params): 11 | 12 | print(agent_params['optimizer_spec']) 13 | 14 | self.env = env 15 | self.agent_params = agent_params 16 | self.batch_size = agent_params['batch_size'] 17 | self.device = agent_params['device'] 18 | self.last_obs = self.env.reset() 19 | 20 | self.num_actions = agent_params['ac_dim'] 21 | self.learning_starts = agent_params['learning_starts'] 22 | self.learning_freq = agent_params['learning_freq'] 23 | self.target_update_freq = agent_params['target_update_freq'] 24 | 25 | self.replay_buffer_idx = None 26 | self.exploration = agent_params['exploration_schedule'] 27 | self.optimizer_spec = agent_params['optimizer_spec'] 28 | 29 | self.critic = DQNCritic(agent_params, self.optimizer_spec) 30 | self.actor = ArgMaxPolicy(self.critic, self.device) 31 | 32 | lander = agent_params['env_name'] == 'LunarLander-v2' 33 | self.replay_buffer = MemoryOptimizedReplayBuffer(agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander) 34 | self.t = 0 35 | self.num_param_updates = 0 36 | 37 | def add_to_replay_buffer(self, paths): 38 | pass 39 | 40 | def step_env(self): 41 | 42 | """ 43 | Step the env and store the transition 44 | 45 | At the end of this block of code, the simulator should have been 46 | advanced one step, and the replay buffer should contain one more transition. 47 | 48 | Note that self.last_obs must always point to the new latest observation. 49 | """ 50 | 51 | # TODO store the latest observation into the replay buffer 52 | # HINT: see replay buffer's function store_frame 53 | self.replay_buffer_idx = TODO 54 | 55 | eps = self.exploration.value(self.t) 56 | # TODO use epsilon greedy exploration when selecting action 57 | # HINT: take random action 58 | # with probability eps (see np.random.random()) 59 | # OR if your current step number (see self.t) is less that self.learning_starts 60 | perform_random_action = TODO 61 | 62 | if perform_random_action: 63 | action = TODO 64 | else: 65 | # TODO query the policy to select action 66 | # HINT: you cannot use "self.last_obs" directly as input 67 | # into your network, since it needs to be processed to include context 68 | # from previous frames. 69 | # Check out the replay buffer, which has a function called 70 | # encode_recent_observation that will take the latest observation 71 | # that you pushed into the buffer and compute the corresponding 72 | # input that should be given to a Q network by appending some 73 | # previous frames. 
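            # Editor's note (a hedged sketch, not the official solution): the hint
            # above usually resolves to the replay-buffer call it names, e.g.
            #     enc_last_obs = self.replay_buffer.encode_recent_observation()
            # after which the tensorized observation below is passed to the argmax
            # policy (self.actor) to pick the greedy action. For the epsilon-greedy
            # branch earlier in this method, acting randomly when
            # np.random.random() < eps or self.t < self.learning_starts, via
            # self.env.action_space.sample(), is the standard reading of the hints.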
74 | enc_last_obs = 75 | enc_last_obs = torch.tensor(enc_last_obs[None, :]).to(self.device) 76 | 77 | # TODO query the policy with enc_last_obs to select action 78 | action = TODO 79 | 80 | # TODO take a step in the environment using the action from the policy 81 | # HINT1: remember that self.last_obs must always point to the newest/latest observation 82 | # HINT2: remember the following useful function that you've seen before: 83 | #obs, reward, done, info = env.step(action) 84 | TODO 85 | 86 | # TODO store the result of taking this action into the replay buffer 87 | # HINT1: see replay buffer's store_effect function 88 | # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above 89 | TODO 90 | 91 | # TODO if taking this step resulted in done, reset the env (and the latest observation), otherwise set last obs to obs 92 | TODO 93 | 94 | def sample(self, batch_size): 95 | if self.replay_buffer.can_sample(self.batch_size): 96 | return self.replay_buffer.sample(batch_size) 97 | else: 98 | return [],[],[],[],[] 99 | 100 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 101 | 102 | """ 103 | Here, you should train the DQN agent. 104 | This consists of training the critic, as well as periodically updating the target network. 105 | """ 106 | loss = 0 107 | if (self.t > self.learning_starts and \ 108 | self.t % self.learning_freq == 0 and \ 109 | self.replay_buffer.can_sample(self.batch_size)): 110 | 111 | # TODO populate the parameters and implement critic.update() 112 | loss = self.critic.update(TODO, TODO, TODO, TODO, TODO) 113 | 114 | # TODO: load newest parameters into the target network 115 | if self.num_param_updates % self.target_update_freq == 0: 116 | TODO 117 | 118 | self.num_param_updates += 1 119 | 120 | self.t += 1 121 | return loss 122 | -------------------------------------------------------------------------------- /hw3/cs285/critics/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw3/cs285/critics/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/critics/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/critics/__pycache__/bootstrapped_continuous_critic.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/critics/__pycache__/bootstrapped_continuous_critic.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/critics/__pycache__/dqn_critic.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/critics/__pycache__/dqn_critic.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/critics/bootstrapped_continuous_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from 
cs285.infrastructure.models import MLP 4 | 5 | class BootstrappedContinuousCritic: 6 | def __init__(self, hparams): 7 | self.ob_dim = hparams['ob_dim'] 8 | self.ac_dim = hparams['ac_dim'] 9 | self.discrete = hparams['discrete'] 10 | self.size = hparams['size'] 11 | self.n_layers = hparams['n_layers'] 12 | self.device = hparams['device'] 13 | self.learning_rate = hparams['learning_rate'] 14 | self.num_target_updates = hparams['num_target_updates'] 15 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] 16 | self.gamma = hparams['gamma'] 17 | 18 | self.value_func = MLP(1, self.ob_dim, self.n_layers, self.size, self.device, self.discrete) 19 | # TODO: use the Adam optimizer to optimize the loss with self.learning_rate 20 | self.optimizer = TODO 21 | 22 | def update(self, ob_no, next_ob_no, re_n, terminal_n): 23 | """ 24 | Update the parameters of the critic. 25 | 26 | let sum_of_path_lengths be the sum of the lengths of the sampled paths 27 | let num_paths be the number of sampled paths 28 | 29 | arguments: 30 | ob_no: shape: (sum_of_path_lengths, ob_dim) 31 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 32 | re_n: length: sum_of_path_lengths. Each element in re_n is a scalar containing 33 | the reward for each timestep 34 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 35 | at that timestep of 0 if the episode did not end 36 | 37 | returns: 38 | loss 39 | """ 40 | 41 | # TODO: Implement the pseudocode below: 42 | 43 | # do the following (self.num_grad_steps_per_target_update * self.num_target_updates) times: 44 | # every self.num_grad_steps_per_target_update steps (which includes the first step), 45 | # recompute the target values by 46 | #a) calculating V(s') by querying this critic network (ie calling 'forward') with next_ob_no 47 | #b) and computing the target values as r(s, a) + gamma * V(s') 48 | # HINT: don't forget to use terminal_n to cut off the V(s') (ie set it to 0) when a terminal state is reached 49 | # every time, 50 | # update this critic using the observations and targets 51 | # HINT: use nn.MSE() 52 | 53 | TODO 54 | 55 | return loss 56 | -------------------------------------------------------------------------------- /hw3/cs285/critics/dqn_critic.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.models import * 2 | import torch 3 | from torch import nn 4 | 5 | class DQNCritic: 6 | def __init__(self, hparams, optimizer_spec, **kwargs): 7 | super().__init__(**kwargs) 8 | self.env_name = hparams['env_name'] 9 | self.device = hparams['device'] 10 | self.ob_dim = hparams['ob_dim'] 11 | 12 | if isinstance(self.ob_dim, int): 13 | self.input_shape = self.ob_dim 14 | else: 15 | self.input_shape = hparams['input_shape'] 16 | 17 | self.ac_dim = hparams['ac_dim'] 18 | self.double_q = hparams['double_q'] 19 | self.grad_norm_clipping = hparams['grad_norm_clipping'] 20 | self.gamma = hparams['gamma'] 21 | 22 | self.optimizer_spec = optimizer_spec 23 | 24 | if self.env_name == 'LunarLander-v2': 25 | self.Q_func = LL_DQN(self.ac_dim, self.input_shape, self.device) 26 | self.target_Q_func = LL_DQN(self.ac_dim, self.input_shape, self.device) 27 | 28 | elif self.env_name == 'PongNoFrameskip-v4': 29 | self.Q_func = atari_DQN(self.ac_dim, self.input_shape, self.device) 30 | self.target_Q_func = atari_DQN(self.ac_dim, self.input_shape, self.device) 31 | 32 | else: raise NotImplementedError 33 | 34 
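        # Editor's note on critics/bootstrapped_continuous_critic.py above (a hedged
        # sketch of its TODOs, not the official solution). The optimizer hint maps to
        #     self.optimizer = torch.optim.Adam(self.value_func.parameters(), lr=self.learning_rate)
        # and the update() pseudocode, assuming self.value_func(ob) yields a [N, 1]
        # value tensor and ob/next_ob/rew/done are torch tensors, maps to roughly:
        #
        #     for i in range(self.num_grad_steps_per_target_update * self.num_target_updates):
        #         if i % self.num_grad_steps_per_target_update == 0:
        #             with torch.no_grad():
        #                 target = rew + self.gamma * self.value_func(next_ob).squeeze() * (1 - done)
        #         self.optimizer.zero_grad()
        #         loss = nn.MSELoss()(self.value_func(ob).squeeze(), target)
        #         loss.backward()
        #         self.optimizer.step()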
| self.optimizer = self.optimizer_spec.constructor(self.Q_func.parameters(), lr = 1, **self.optimizer_spec.kwargs) 35 | self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, self.optimizer_spec.lr_schedule) 36 | 37 | def get_loss(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 38 | ob, ac, rew, next_ob, done = map(lambda x: torch.from_numpy(x).to(self.device), [ob_no, ac_na, re_n, next_ob_no, terminal_n]) 39 | 40 | with torch.no_grad(): 41 | if self.double_q: 42 | # You must fill this part for Q2 of the Q-learning potion of the homework. 43 | # In double Q-learning, the best action is selected using the Q-network that 44 | # is being updated, but the Q-value for this action is obtained from the 45 | # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details. 46 | max_ac = TODO 47 | else: 48 | max_ac = TODO 49 | 50 | curr_Q = self.Q_func(ob).gather(-1, ac.long().view(-1, 1)).squeeze() 51 | # TODO calculate the optimal Qs for next_ob using max_ac 52 | # HINT1: similar to how it is done above 53 | best_next_Q = TODO 54 | # TODO calculate the targets for the Bellman error 55 | # HINT1: as you saw in lecture, this would be: 56 | #currentReward + self.gamma * best_next_Q * (1 - self.done_mask_ph) 57 | calc_Q = TODO 58 | 59 | return nn.functional.smooth_l1_loss(curr_Q, calc_Q) #Huber Loss 60 | 61 | 62 | def update(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 63 | self.optimizer.zero_grad() 64 | 65 | loss = self.get_loss(ob_no, ac_na, re_n, next_ob_no, terminal_n) 66 | loss.backward() 67 | 68 | nn.utils.clip_grad_norm_(self.Q_func.parameters(), max_norm = self.grad_norm_clipping) #perform grad clipping 69 | self.optimizer.step() #take step with optimizer 70 | self.lr_scheduler.step() #move forward learning rate 71 | 72 | return loss 73 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/atari_wrappers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/atari_wrappers.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/dqn_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/dqn_utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- 
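Editor's note: the double-Q branch described in DQNCritic.get_loss (hw3/cs285/critics/dqn_critic.py above) can be sketched as follows, reusing the tensors already built there (next_ob, rew, done). It is one possible reading of the hints, not the official solution: the online network picks the argmax action and the target network evaluates it, whereas vanilla DQN takes both from the target network.

    if self.double_q:
        max_ac = self.Q_func(next_ob).argmax(dim=-1)
    else:
        max_ac = self.target_Q_func(next_ob).argmax(dim=-1)
    best_next_Q = self.target_Q_func(next_ob).gather(-1, max_ac.view(-1, 1)).squeeze()
    calc_Q = rew + self.gamma * best_next_Q * (1 - done)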
https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/models.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/models.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | import numpy as np 5 | 6 | class Logger: 7 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 8 | self._log_dir = log_dir 9 | print('########################') 10 | print('logging outputs to ', log_dir) 11 | print('########################') 12 | self._n_logged_samples = n_logged_samples 13 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 14 | 15 | def log_scalar(self, scalar, name, step_): 16 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 17 | 18 | def log_scalars(self, scalar_dict, group_name, step, phase): 19 | """Will log all scalars in the same plot.""" 20 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 21 | 22 | def log_image(self, image, name, step): 23 | assert(len(image.shape) == 3) # [C, H, W] 24 | self._summ_writer.add_image('{}'.format(name), image, step) 25 | 26 | def log_video(self, video_frames, name, step, fps=10): 27 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 
28 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 29 | 30 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 31 | 32 | # reshape the rollouts 33 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 34 | 35 | # max rollout length 36 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 37 | max_length = videos[0].shape[0] 38 | for i in range(max_videos_to_save): 39 | if videos[i].shape[0]>max_length: 40 | max_length = videos[i].shape[0] 41 | 42 | # pad rollouts to all be same length 43 | for i in range(max_videos_to_save): 44 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 55 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 56 | 57 | def log_figure(self, figure, name, step, phase): 58 | """figure: matplotlib.pyplot figure handle""" 59 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 60 | 61 | def log_graph(self, array, name, step, phase): 62 | """figure: matplotlib.pyplot figure handle""" 63 | im = plot_graph(array) 64 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 65 | 66 | def dump_scalars(self, log_path=None): 67 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 68 | self._summ_writer.export_scalars_to_json(log_path) 69 | 70 | def flush(self): 71 | self._summ_writer.flush() 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class MLP(nn.Module): 5 | def __init__(self, 6 | ac_dim, 7 | ob_dim, 8 | n_layers, 9 | size, 10 | device, 11 | discrete, 12 | activation = nn.Tanh()): 13 | super().__init__() 14 | 15 | self.discrete = discrete 16 | 17 | # network architecture 18 | self.mlp = nn.ModuleList() 19 | self.mlp.append(nn.Linear(ob_dim, size)) #first hidden layer 20 | self.mlp.append(activation) 21 | 22 | for h in range(n_layers - 1): #additional hidden layers 23 | self.mlp.append(nn.Linear(size, size)) 24 | self.mlp.append(activation) 25 | 26 | self.mlp.append(nn.Linear(size, ac_dim)) #output layer, no activation function 27 | 28 | #if continuous define logstd variable 29 | if not self.discrete: 30 | self.logstd = nn.Parameter(torch.zeros(ac_dim)) 31 | 32 | self.to(device) 33 | 34 | def forward(self, x): 35 | for layer in self.mlp: 36 | x = layer(x) 37 | if self.discrete: 38 | return x 39 | else: 40 | return (x, self.logstd.exp()) 41 | 42 | def save(self, filepath): 43 | torch.save(self.state_dict(), filepath) 44 | 45 | def restore(self, filepath): 46 | self.load_state_dict(torch.load(filepath)) 47 | 48 | class LL_DQN(MLP): 49 | def __init__(self, ac_dim, ob_dim, device): 50 | super().__init__(ac_dim, ob_dim, 2, 64, device, True, nn.ReLU()) 51 | 52 | class atari_DQN(nn.Module): 53 | def __init__(self, ac_dim, ob_dim, device): 54 | super().__init__() 55 | 56 | self.convnet = nn.Sequential( 57 | nn.Conv2d(ob_dim[2], 32, 8, stride = 4), 58 | nn.ReLU(True), 59 | nn.Conv2d(32, 64, 4, stride = 2), 60 | nn.ReLU(True), 61 | nn.Conv2d(64, 64, 3, stride = 1), 62 | nn.ReLU(True), 63 | ) 64 | self.action_value = nn.Sequential( 65 | nn.Linear(7 * 7 * 64, 512), 66 | nn.ReLU(True), 67 | nn.Linear(512, ac_dim), 68 | ) 69 | self.to(device) 70 | 71 | def forward(self, obs): 72 | out = obs.float() / 255 73 | out = out.permute(0, 3, 1, 2) 
#reshape to [batch size, channels, height, width] 74 | out = self.convnet(out) 75 | out = out.reshape(out.size(0), -1) 76 | out = self.action_value(out) 77 | return out 78 | 79 | def save(self, filepath): 80 | torch.save(self.state_dict(), filepath) 81 | 82 | def restore(self, filepath): 83 | self.load_state_dict(torch.load(filepath)) 84 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure.utils import * 4 | 5 | class ReplayBuffer(object): 6 | 7 | def __init__(self, max_size=1000000): 8 | 9 | self.max_size = max_size 10 | self.paths = [] 11 | self.obs = None 12 | self.acs = None 13 | self.concatenated_rews = None 14 | self.unconcatenated_rews = None 15 | self.next_obs = None 16 | self.terminals = None 17 | 18 | def add_rollouts(self, paths): 19 | 20 | # add new rollouts into our list of rollouts 21 | for path in paths: 22 | self.paths.append(path) 23 | 24 | # convert new rollouts into their component arrays, and append them onto our arrays 25 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 26 | 27 | if self.obs is None: 28 | self.obs = observations[-self.max_size:] 29 | self.acs = actions[-self.max_size:] 30 | self.next_obs = next_observations[-self.max_size:] 31 | self.terminals = terminals[-self.max_size:] 32 | self.concatenated_rews = concatenated_rews[-self.max_size:] 33 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 34 | else: 35 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 36 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 37 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 38 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 39 | self.concatenated_rews = np.concatenate([self.concatenated_rews, concatenated_rews])[-self.max_size:] 40 | if isinstance(unconcatenated_rews, list): 41 | self.unconcatenated_rews += unconcatenated_rews 42 | else: 43 | self.unconcatenated_rews.append(unconcatenated_rews) 44 | 45 | ######################################## 46 | ######################################## 47 | 48 | def sample_random_rollouts(self, num_rollouts): 49 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 50 | return self.paths[rand_indices] 51 | 52 | def sample_recent_rollouts(self, num_rollouts=1): 53 | return self.paths[-num_rollouts:] 54 | 55 | ######################################## 56 | ######################################## 57 | 58 | def sample_random_data(self, batch_size): 59 | 60 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 61 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 62 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 63 | 64 | def sample_recent_data(self, batch_size=1, concat_rew=True): 65 | 66 | if concat_rew: 67 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 68 | else: 69 | num_recent_rollouts_to_return = 0 70 | num_datapoints_so_far = 0 71 | index = -1 72 | while num_datapoints_so_far < batch_size: 73 | 
recent_rollout = self.paths[index] 74 | index -=1 75 | num_recent_rollouts_to_return +=1 76 | num_datapoints_so_far += get_pathlength(recent_rollout) 77 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 78 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 79 | return observations, actions, unconcatenated_rews, next_observations, terminals -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | ############################################ 5 | ############################################ 6 | 7 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 8 | 9 | if render: 10 | env.render(mode = "human") 11 | 12 | # initialize env for the beginning of a new rollout 13 | ob = env.reset() # TODO: GETTHIS from HW1 14 | 15 | # init vars 16 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 17 | steps = 0 18 | while True: 19 | 20 | # render image of the simulated env 21 | if render: 22 | if 'rgb_array' in render_mode: 23 | if hasattr(env, 'sim'): 24 | if 'track' in env.env.model.camera_names: 25 | image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) 26 | else: 27 | image_obs.append(env.sim.render(height=500, width=500)[::-1]) 28 | else: 29 | image_obs.append(env.render(mode=render_mode)) 30 | if 'human' in render_mode: 31 | env.render(mode=render_mode) 32 | time.sleep(env.model.opt.timestep) 33 | 34 | # use the most recent ob to decide what to do 35 | obs.append(ob) 36 | ac = policy.get_action(ob) 37 | acs.append(ac) 38 | 39 | # take that action and record results 40 | ob, rew, done, _ = env.step(ac) 41 | 42 | # record result of taking that action 43 | steps += 1 44 | next_obs.append(ob) 45 | rewards.append(rew) 46 | 47 | # End the rollout if the rollout ended 48 | # Note that the rollout can end due to done, or due to max_path_length 49 | rollout_done = done or steps >= max_path_length 50 | terminals.append(rollout_done) 51 | 52 | if rollout_done: 53 | break 54 | 55 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 56 | 57 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 58 | 59 | timesteps_left = min_timesteps_per_batch 60 | timesteps_this_batch = 0 61 | paths = [] 62 | 63 | while timesteps_this_batch < min_timesteps_per_batch: 64 | paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) 65 | timesteps_this_batch += get_pathlength(paths[-1]) 66 | 67 | return paths, timesteps_this_batch 68 | 69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 70 | paths = [] 71 | for n in range(ntraj): 72 | paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) 73 | 74 | return paths 75 | 76 | 77 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 78 | """ 79 | Take info (separate arrays) from a single rollout 80 | and return it in a single dictionary 81 | """ 82 | if image_obs != []: 83 | image_obs = np.stack(image_obs, axis=0) 84 | return {"observation" : np.array(obs, dtype=np.float32), 85 | "image_obs" : np.array(image_obs, dtype=np.uint8), 86 | "reward" : np.array(rewards, dtype=np.float32), 87 | "action" : np.array(acs, 
dtype=np.float32), 88 | "next_observation": np.array(next_obs, dtype=np.float32), 89 | "terminal": np.array(terminals, dtype=np.float32)} 90 | 91 | 92 | def convert_listofrollouts(paths): 93 | """ 94 | Take a list of rollout dictionaries 95 | and return separate arrays, 96 | where each array is a concatenation of that array from across the rollouts 97 | """ 98 | observations = np.concatenate([path["observation"] for path in paths]) 99 | actions = np.concatenate([path["action"] for path in paths]) 100 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 101 | terminals = np.concatenate([path["terminal"] for path in paths]) 102 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 103 | unconcatenated_rewards = [path["reward"] for path in paths] 104 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 105 | 106 | ############################################ 107 | ############################################ 108 | 109 | def get_pathlength(path): 110 | return len(path["reward"]) 111 | -------------------------------------------------------------------------------- /hw3/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from cs285.infrastructure.models import MLP 5 | 6 | class MLPPolicy: 7 | def __init__(self, 8 | ac_dim, 9 | ob_dim, 10 | n_layers, 11 | size, 12 | device, 13 | learning_rate, 14 | training=True, 15 | discrete=False, 16 | nn_baseline=False, 17 | **kwargs): 18 | super().__init__() 19 | 20 | # init vars 21 | self.device = device 22 | self.discrete = discrete 23 | self.training = training 24 | self.nn_baseline = nn_baseline 25 | 26 | # network architecture 27 | self.policy_mlp = MLP(ac_dim, ob_dim, n_layers, size, device, discrete) 28 | params = list(self.policy_mlp.parameters()) 29 | if self.nn_baseline: 30 | self.baseline_mlp = MLP(1, ob_dim, n_layers, size, device, True) 31 | params += list(self.baseline_mlp.parameters()) 32 | 33 | #optimizer 34 | if self.training: 35 | self.optimizer = torch.optim.Adam(params, lr = learning_rate) 36 | 37 | ################################## 38 | 39 | # update/train this policy 40 | def update(self, observations, actions): 41 | raise NotImplementedError 42 | 43 | # query the neural net that's our 'policy' function, as defined by an mlp above 44 | # query the policy with observation(s) to get selected action(s) 45 | def get_action(self, obs): 46 | output = self.policy_mlp(torch.Tensor(obs).to(self.device)) 47 | if self.discrete: 48 | action_probs = nn.functional.log_softmax(output).exp() 49 | return torch.multinomial(action_probs, num_samples = 1).cpu().detach().numpy()[0] 50 | else: 51 | return torch.normal(output[0], output[1]).cpu().detach().numpy() 52 | 53 | def get_log_prob(self, network_outputs, actions_taken): 54 | actions_taken = torch.Tensor(actions_taken).to(self.device) 55 | if self.discrete: 56 | network_outputs = nn.functional.log_softmax(network_outputs).exp() 57 | return torch.distributions.Categorical(network_outputs).log_prob(actions_taken) 58 | else: 59 | return torch.distributions.Normal(network_outputs[0], network_outputs[1]).log_prob(actions_taken).sum(-1) 60 | 61 | ##################################################### 62 | ##################################################### 63 | 64 | class MLPPolicyPG(MLPPolicy): 65 | 66 | def update(self, observations, acs_na, adv_n = None, acs_labels_na = 
None, qvals = None): 67 | policy_output = self.policy_mlp(torch.Tensor(observations).to(self.device)) 68 | logprob_pi = self.get_log_prob(policy_output, acs_na) 69 | 70 | self.optimizer.zero_grad() 71 | 72 | loss = torch.sum((-logprob_pi * torch.Tensor(adv_n).to(self.device))) 73 | loss.backward() 74 | 75 | if self.nn_baseline: 76 | baseline_prediction = self.baseline_mlp(torch.Tensor(observations).to(self.device)).view(-1) 77 | baseline_target = torch.Tensor((qvals - qvals.mean()) / (qvals.std() + 1e-8)).to(self.device) 78 | baseline_loss = nn.functional.mse_loss(baseline_prediction, baseline_target) 79 | baseline_loss.backward() 80 | 81 | self.optimizer.step() 82 | 83 | return loss 84 | 85 | ##################################################### 86 | ##################################################### 87 | 88 | class MLPPolicyAC(MLPPolicyPG): 89 | """ MLP policy required for actor-critic. 90 | 91 | Note: Your code for this class could in fact the same as MLPPolicyPG, except the neural net baseline 92 | would not be required (i.e. self.nn_baseline would always be false. It is separated here only 93 | to avoid any unintended errors. 94 | """ 95 | def __init__(self, *args, **kwargs): 96 | if 'nn_baseline' in kwargs.keys(): 97 | assert kwargs['nn_baseline'] == False, "MLPPolicyAC should not use the nn_baseline flag" 98 | super().__init__(*args, **kwargs) 99 | -------------------------------------------------------------------------------- /hw3/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw3/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/policies/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/policies/__pycache__/argmax_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/policies/__pycache__/argmax_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/policies/argmax_policy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class ArgMaxPolicy: 4 | 5 | def __init__(self, critic, device): 6 | self.critic = critic 7 | self.device = device 8 | 9 | def get_action(self, obs): 10 | if len(obs.shape) > 1: 11 | observation = torch.tensor(obs).to(self.device) 12 | else: 13 | observation = torch.tensor(obs[None]).to(self.device) 14 | # TODO: pass observation to critic and use argmax of the resulting Q values as the action 15 | return TODO 16 | -------------------------------------------------------------------------------- 
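Note on the `return TODO` at the end of policies/argmax_policy.py above: the blank is meant to return the greedy action, i.e. the index of the largest Q-value the critic assigns to the current observation. The snippet below is only a minimal, self-contained illustration of that argmax pattern, not the official solution; `toy_q_function` is a hypothetical stand-in for the real DQN critic (whose actual interface lives in critics/dqn_critic.py and is not reproduced here).

import torch

# Toy stand-in for the DQN critic: any callable mapping a batch of float
# observations [batch, ob_dim] to Q-values [batch, num_actions] fits this pattern.
toy_q_function = torch.nn.Linear(4, 2)

obs = torch.zeros(1, 4)                     # a single observation, already batched
q_values = toy_q_function(obs)              # shape [1, num_actions]
action = q_values.argmax(dim=1).item()      # greedy action = index of the largest Q-value
print(action)

In the starter file the same idea would be applied to the tensor named `observation`, with the result moved back to the CPU as a NumPy value, mirroring how MLPPolicy.get_action returns actions elsewhere in this homework.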
/hw3/cs285/scripts/run_hw3_actor_critic.py: -------------------------------------------------------------------------------- 1 | #Uncomment next two lines and replace the path if not using anaconda 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import os 6 | import gym 7 | import pdb 8 | import time 9 | import numpy as np 10 | import torch 11 | 12 | from cs285.infrastructure.rl_trainer import RL_Trainer 13 | from cs285.agents.ac_agent import ACAgent 14 | 15 | class AC_Trainer(object): 16 | 17 | def __init__(self, params): 18 | 19 | ##################### 20 | ## SET AGENT PARAMS 21 | ##################### 22 | 23 | computation_graph_args = { 24 | 'n_layers': params['n_layers'], 25 | 'size': params['size'], 26 | 'learning_rate': params['learning_rate'], 27 | 'num_target_updates': params['num_target_updates'], 28 | 'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'], 29 | 'device': params['device'], 30 | } 31 | 32 | estimate_advantage_args = { 33 | 'gamma': params['discount'], 34 | 'standardize_advantages': not(params['dont_standardize_advantages']), 35 | } 36 | 37 | train_args = { 38 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 39 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 40 | 'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'], 41 | } 42 | 43 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} 44 | 45 | self.params = params 46 | self.params['agent_class'] = ACAgent 47 | self.params['agent_params'] = agent_params 48 | self.params['batch_size_initial'] = self.params['batch_size'] 49 | 50 | ################ 51 | ## RL TRAINER 52 | ################ 53 | 54 | self.rl_trainer = RL_Trainer(self.params) 55 | 56 | def run_training_loop(self): 57 | self.rl_trainer.run_training_loop( 58 | self.params['n_iter'], 59 | collect_policy = self.rl_trainer.agent.actor, 60 | eval_policy = self.rl_trainer.agent.actor, 61 | ) 62 | 63 | 64 | def main(): 65 | import argparse 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--env_name', type=str) 68 | parser.add_argument('--ep_len', type=int, default=200) 69 | parser.add_argument('--exp_name', type=str, default='todo') 70 | parser.add_argument('--n_iter', '-n', type=int, default=200) 71 | 72 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 73 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) 74 | parser.add_argument('--num_actor_updates_per_agent_update', type=int, default=1) 75 | 76 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration 77 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 78 | parser.add_argument('--train_batch_size', '-tb', type=int, default=1000) ##steps used per gradient step 79 | 80 | parser.add_argument('--discount', type=float, default=1.0) 81 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) 82 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true') 83 | parser.add_argument('--num_target_updates', '-ntu', type=int, default=10) 84 | parser.add_argument('--num_grad_steps_per_target_update', '-ngsptu', type=int, default=10) 85 | parser.add_argument('--n_layers', '-l', type=int, default=2) 86 | parser.add_argument('--size', '-s', type=int, default=64) 87 | 88 | parser.add_argument('--seed', type=int, default=1) 89 | 
parser.add_argument('--use_gpu', '-gpu', default = True) 90 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 91 | parser.add_argument('--video_log_freq', type=int, default=-1) 92 | parser.add_argument('--scalar_log_freq', type=int, default=1) 93 | 94 | parser.add_argument('--save_params', action='store_true') 95 | 96 | args = parser.parse_args() 97 | 98 | # convert to dictionary 99 | params = vars(args) 100 | 101 | if torch.cuda.is_available() and params["use_gpu"]: 102 | which_gpu = "cuda:" + str(params["which_gpu"]) 103 | params["device"] = torch.device(which_gpu) 104 | print("Pytorch is running on GPU", params["which_gpu"]) 105 | else: 106 | params["device"] = torch.device("cpu") 107 | print("Pytorch is running on the CPU") 108 | 109 | # for policy gradient, we made a design decision 110 | # to force batch_size = train_batch_size 111 | # note that, to avoid confusion, you don't even have a train_batch_size argument anymore (above) 112 | params['train_batch_size'] = params['batch_size'] 113 | 114 | ################################## 115 | ### CREATE DIRECTORY FOR LOGGING 116 | ################################## 117 | 118 | logdir_prefix = 'ac_' 119 | 120 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 121 | 122 | if not (os.path.exists(data_path)): 123 | os.makedirs(data_path) 124 | 125 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 126 | logdir = os.path.join(data_path, logdir) 127 | params['logdir'] = logdir 128 | if not(os.path.exists(logdir)): 129 | os.makedirs(logdir) 130 | 131 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 132 | 133 | ################### 134 | ### RUN TRAINING 135 | ################### 136 | 137 | trainer = AC_Trainer(params) 138 | trainer.run_training_loop() 139 | 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /hw3/cs285/scripts/run_hw3_dqn.py: -------------------------------------------------------------------------------- 1 | #Uncomment next two lines and replace the path if not using anaconda 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import os 6 | import time 7 | import torch 8 | 9 | from cs285.infrastructure.rl_trainer import RL_Trainer 10 | from cs285.agents.dqn_agent import DQNAgent 11 | from cs285.infrastructure.dqn_utils import get_env_kwargs 12 | 13 | 14 | class Q_Trainer: 15 | def __init__(self, params): 16 | self.params = params 17 | 18 | train_args = { 19 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 20 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 21 | 'train_batch_size': params['batch_size'], 22 | 'double_q': params['double_q'], 23 | 'device': params['device'], 24 | } 25 | 26 | env_args = get_env_kwargs(params['env_name']) 27 | 28 | self.agent_params = {**train_args, **env_args, **params} 29 | 30 | self.params['agent_class'] = DQNAgent 31 | self.params['agent_params'] = self.agent_params 32 | self.params['train_batch_size'] = params['batch_size'] 33 | self.params['env_wrappers'] = self.agent_params['env_wrappers'] 34 | 35 | self.rl_trainer = RL_Trainer(self.params) 36 | 37 | def run_training_loop(self): 38 | self.rl_trainer.run_training_loop( 39 | self.agent_params['num_timesteps'], 40 | collect_policy = self.rl_trainer.agent.actor, 41 | eval_policy = self.rl_trainer.agent.actor, 42 | ) 43 | 44 | def main(): 45 | 46 | import argparse 47 | parser = argparse.ArgumentParser() 48 
| parser.add_argument('--env_name', default='PongNoFrameskip-v4', 49 | choices=('PongNoFrameskip-v4', 50 | 'LunarLander-v2') 51 | ) 52 | 53 | parser.add_argument('--ep_len', type=int, default=200) 54 | parser.add_argument('--exp_name', type=str, default='todo') 55 | 56 | parser.add_argument('--eval_batch_size', type=int, default=1000) 57 | 58 | parser.add_argument('--batch_size', type=int, default=32) 59 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 60 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) 61 | parser.add_argument('--double_q', action='store_true') 62 | 63 | parser.add_argument('--seed', type=int, default=1) 64 | parser.add_argument('--use_gpu', '-gpu', default = True) 65 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 66 | parser.add_argument('--scalar_log_freq', type=int, default=int(1e4)) 67 | 68 | parser.add_argument('--save_params', action='store_true') 69 | 70 | args = parser.parse_args() 71 | 72 | # convert to dictionary 73 | params = vars(args) 74 | params['video_log_freq'] = -1 # This param is not used for DQN 75 | 76 | if torch.cuda.is_available() and params["use_gpu"]: 77 | which_gpu = "cuda:" + str(params["which_gpu"]) 78 | params["device"] = torch.device(which_gpu) 79 | print("Pytorch is running on GPU", params["which_gpu"]) 80 | else: 81 | params["device"] = torch.device("cpu") 82 | print("Pytorch is running on the CPU") 83 | 84 | ################################## 85 | ### CREATE DIRECTORY FOR LOGGING 86 | ################################## 87 | 88 | logdir_prefix = 'dqn_' 89 | if args.double_q: 90 | logdir_prefix += 'double_q_' 91 | 92 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 93 | 94 | if not (os.path.exists(data_path)): 95 | os.makedirs(data_path) 96 | 97 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 98 | logdir = os.path.join(data_path, logdir) 99 | params['logdir'] = logdir 100 | if not(os.path.exists(logdir)): 101 | os.makedirs(logdir) 102 | 103 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 104 | 105 | trainer = Q_Trainer(params) 106 | trainer.run_training_loop() 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /hw3/cs285_hw3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285_hw3.pdf -------------------------------------------------------------------------------- /hw3/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.11 2 | mujoco-py==1.50.1.35 3 | matplotlib==2.2.2 4 | ipython==6.4.0 5 | moviepy==1.0.0 6 | box2d-py -------------------------------------------------------------------------------- /hw3/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw4/README.txt: -------------------------------------------------------------------------------- 1 | 1) See hw1 if you'd like to see installation instructions. You do NOT have to redo them. 
2 | 3 | ############################################## 4 | ############################################## 5 | 6 | 7 | 2) Code: 8 | 9 | ------------------------------------------- 10 | 11 | Files to look at, even though there are no explicit 'TODO' markings: 12 | - scripts/run_hw4_mb.py 13 | - infrastructure/rl_trainer.py 14 | 15 | ------------------------------------------- 16 | 17 | Blanks to be filled in now (for this assignment) are marked with 'TODO' 18 | 19 | The following files have these: 20 | - agents/mb_agent.py 21 | - models/ff_model.py 22 | - policies/MPC_policy.py 23 | - infrastructure/utils.py 24 | 25 | ############################################## 26 | ############################################## 27 | 28 | 29 | 3) Commands: 30 | 31 | Please refer to the PDF for the specific commands needed for different questions. 32 | 33 | ############################################## 34 | 35 | 36 | 4) Visualize saved tensorboard event file: 37 | 38 | $ cd cs285/data/ 39 | $ tensorboard --logdir . 40 | 41 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) -------------------------------------------------------------------------------- /hw4/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw4/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/agents/__pycache__/mb_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/agents/__pycache__/mb_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/agents/mb_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from cs285.models.ff_model import FFModel 3 | from cs285.policies.MPC_policy import MPCPolicy 4 | from cs285.infrastructure.replay_buffer import ReplayBuffer 5 | from cs285.infrastructure.utils import * 6 | 7 | class MBAgent: 8 | def __init__(self, env, agent_params): 9 | super(MBAgent, self).__init__() 10 | 11 | self.env = env.unwrapped 12 | self.agent_params = agent_params 13 | self.ensemble_size = self.agent_params['ensemble_size'] 14 | 15 | self.dyn_models = [] 16 | for i in range(self.ensemble_size): 17 | model = FFModel(self.agent_params['ac_dim'], 18 | self.agent_params['ob_dim'], 19 | self.agent_params['n_layers'], 20 | self.agent_params['size'], 21 | self.agent_params['device'], 22 | self.agent_params['learning_rate']) 23 | self.dyn_models.append(model) 24 | 25 | self.actor = MPCPolicy(self.env, 26 | ac_dim = self.agent_params['ac_dim'], 27 | dyn_models = self.dyn_models, 28 | horizon = self.agent_params['mpc_horizon'], 29 | N = self.agent_params['mpc_num_action_sequences']) 30 | 31 | self.replay_buffer = ReplayBuffer() 32 | 33 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 34 | 35 | # training a MB agent refers to updating the predictive 
model using observed state transitions 36 | # NOTE: each model in the ensemble is trained on a different random batch of size batch_size 37 | losses = [] 38 | num_data = ob_no.shape[0] 39 | num_data_per_ens = int(num_data/self.ensemble_size) 40 | 41 | for i in range(self.ensemble_size): 42 | 43 | # select which datapoints to use for this model of the ensemble 44 | # you might find the num_data_per_env variable defined above useful 45 | 46 | observations = # TODO(Q1) 47 | actions = # TODO(Q1) 48 | next_observations = # TODO(Q1) 49 | 50 | # use datapoints to update one of the dyn_models 51 | model = # TODO(Q1) 52 | loss = model.update(observations, actions, next_observations, self.data_statistics) 53 | losses.append(loss) 54 | 55 | avg_loss = np.mean(losses) 56 | return avg_loss 57 | 58 | def add_to_replay_buffer(self, paths, add_sl_noise=False): 59 | 60 | # add data to replay buffer 61 | self.replay_buffer.add_rollouts(paths, noised=add_sl_noise) 62 | 63 | # get updated mean/std of the data in our replay buffer 64 | self.data_statistics = {'obs_mean': np.mean(self.replay_buffer.obs, axis=0), 65 | 'obs_std': np.std(self.replay_buffer.obs, axis=0), 66 | 'acs_mean': np.mean(self.replay_buffer.acs, axis=0), 67 | 'acs_std': np.std(self.replay_buffer.acs, axis=0), 68 | 'delta_mean': np.mean( 69 | self.replay_buffer.next_obs - self.replay_buffer.obs, 70 | axis=0), 71 | 'delta_std': np.std( 72 | self.replay_buffer.next_obs - self.replay_buffer.obs, 73 | axis=0), 74 | } 75 | 76 | # update the actor's data_statistics too, so actor.get_action can be calculated correctly 77 | self.actor.data_statistics = self.data_statistics 78 | 79 | def sample(self, batch_size): 80 | # NOTE: The size of the batch returned here is sampling batch_size * ensemble_size, 81 | # so each model in our ensemble can get trained on batch_size data 82 | return self.replay_buffer.sample_random_data(batch_size * self.ensemble_size) 83 | -------------------------------------------------------------------------------- /hw4/cs285/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs import ant 2 | from cs285.envs import cheetah 3 | from cs285.envs import obstacles 4 | from cs285.envs import reacher -------------------------------------------------------------------------------- /hw4/cs285/envs/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/ant/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='ant-cs285-v0', 5 | entry_point='cs285.envs.ant:AntEnv', 6 | max_episode_steps=1000, 7 | ) 8 | from cs285.envs.ant.ant import AntEnv 9 | -------------------------------------------------------------------------------- /hw4/cs285/envs/ant/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/ant/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- 
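On the TODO(Q1) blanks in agents/mb_agent.py above: since sample() already draws batch_size * ensemble_size random transitions, one plausible reading (a sketch under that assumption, not the official solution) is to give each ensemble member a disjoint slice of size num_data_per_ens from the already-shuffled batch and pair it with self.dyn_models[i]. The toy snippet below only demonstrates that slicing; the variable names mirror the starter code, but the arrays here are fabricated placeholders with arbitrary shapes.

import numpy as np

# Placeholder data standing in for the randomly sampled training batch
# that MBAgent.train receives (ob_no, ac_na, next_ob_no).
ensemble_size = 3
num_data_per_ens = 4
ob_no = np.random.randn(ensemble_size * num_data_per_ens, 5)
ac_na = np.random.randn(ensemble_size * num_data_per_ens, 2)
next_ob_no = np.random.randn(ensemble_size * num_data_per_ens, 5)

for i in range(ensemble_size):
    start, end = i * num_data_per_ens, (i + 1) * num_data_per_ens
    observations = ob_no[start:end]            # disjoint slice for ensemble member i
    actions = ac_na[start:end]
    next_observations = next_ob_no[start:end]
    print(i, observations.shape, actions.shape, next_observations.shape)

Because the upstream batch is sampled uniformly at random, contiguous slices like these already give each dynamics model a different random subset, which is what the NOTE in train() asks for.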
/hw4/cs285/envs/ant/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/ant/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/ant/__pycache__/ant.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/ant/__pycache__/ant.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/ant/__pycache__/ant.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/ant/__pycache__/ant.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='cheetah-cs285-v0', 5 | entry_point='cs285.envs.cheetah:HalfCheetahEnv', 6 | max_episode_steps=1000, 7 | ) 8 | from cs285.envs.cheetah.cheetah import HalfCheetahEnv 9 | -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/cheetah/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/cheetah/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__pycache__/cheetah.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/cheetah/__pycache__/cheetah.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__pycache__/cheetah.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/cheetah/__pycache__/cheetah.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import mujoco_py 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | 8 | def __init__(self): 9 | 10 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 
1) 11 | utils.EzPickle.__init__(self) 12 | 13 | self.skip = self.frame_skip 14 | 15 | self.action_dim = self.ac_dim = self.action_space.shape[0] 16 | self.observation_dim = self.obs_dim = self.observation_space.shape[0] 17 | 18 | def get_reward(self, observations, actions): 19 | 20 | """get reward/s of given (observations, actions) datapoint or datapoints 21 | 22 | Args: 23 | observations: (batchsize, obs_dim) or (obs_dim,) 24 | actions: (batchsize, ac_dim) or (ac_dim,) 25 | 26 | Return: 27 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 28 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 29 | """ 30 | 31 | #initialize and reshape as needed, for batch mode 32 | self.reward_dict = {} 33 | if(len(observations.shape)==1): 34 | observations = np.expand_dims(observations, axis = 0) 35 | actions = np.expand_dims(actions, axis = 0) 36 | batch_mode = False 37 | else: 38 | batch_mode = True 39 | 40 | #get vars 41 | xvel = observations[:, 9].copy() 42 | body_angle = observations[:, 2].copy() 43 | front_leg = observations[:, 6].copy() 44 | front_shin = observations[:, 7].copy() 45 | front_foot = observations[:, 8].copy() 46 | zeros = np.zeros((observations.shape[0],)).copy() 47 | 48 | # ranges 49 | leg_range = 0.2 50 | shin_range = 0 51 | foot_range = 0 52 | penalty_factor = 10 53 | 54 | #calc rew 55 | self.reward_dict['run'] = xvel 56 | 57 | front_leg_rew = zeros.copy() 58 | front_leg_rew[front_leg>leg_range] = -penalty_factor 59 | self.reward_dict['leg'] = front_leg_rew 60 | 61 | front_shin_rew = zeros.copy() 62 | front_shin_rew[front_shin>shin_range] = -penalty_factor 63 | self.reward_dict['shin'] = front_shin_rew 64 | 65 | front_foot_rew = zeros.copy() 66 | front_foot_rew[front_foot>foot_range] = -penalty_factor 67 | self.reward_dict['foot'] = front_foot_rew 68 | 69 | # total reward 70 | self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot'] 71 | 72 | #return 73 | dones = zeros.copy() 74 | if(not batch_mode): 75 | return self.reward_dict['r_total'][0], dones[0] 76 | return self.reward_dict['r_total'], dones 77 | 78 | 79 | def get_score(self, obs): 80 | xposafter = obs[0] 81 | return xposafter 82 | 83 | ############################################## 84 | 85 | def step(self, action): 86 | 87 | #step 88 | self.do_simulation(action, self.frame_skip) 89 | 90 | #obs/reward/done/score 91 | ob = self._get_obs() 92 | rew, done = self.get_reward(ob, action) 93 | score = self.get_score(ob) 94 | 95 | #return 96 | env_info = {'obs_dict': self.obs_dict, 97 | 'rewards': self.reward_dict, 98 | 'score': score} 99 | return ob, rew, done, env_info 100 | 101 | def _get_obs(self): 102 | 103 | self.obs_dict = {} 104 | self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy() 105 | self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy() 106 | self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy() 107 | 108 | return np.concatenate([ 109 | self.obs_dict['joints_pos'], #9 110 | self.obs_dict['joints_vel'], #9 111 | self.obs_dict['com_torso'], #3 112 | ]) 113 | 114 | ############################################## 115 | 116 | def reset_model(self, seed=None): 117 | 118 | # set reset pose/vel 119 | self.reset_pose = self.init_qpos + self.np_random.uniform( 120 | low=-.1, high=.1, size=self.model.nq) 121 | self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 122 | 123 | #reset the env to that pose/vel 124 | return self.do_reset(self.reset_pose.copy(), 
self.reset_vel.copy()) 125 | 126 | 127 | def do_reset(self, reset_pose, reset_vel, reset_goal=None): 128 | 129 | #reset 130 | self.set_state(reset_pose, reset_vel) 131 | 132 | #return 133 | return self._get_obs() 134 | -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='obstacles-cs285-v0', 5 | entry_point='cs285.envs.obstacles:Obstacles', 6 | max_episode_steps=500, 7 | ) 8 | from cs285.envs.obstacles.obstacles_env import Obstacles 9 | -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/obstacles/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/obstacles/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__pycache__/obstacles_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/obstacles/__pycache__/obstacles_env.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__pycache__/obstacles_env.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/obstacles/__pycache__/obstacles_env.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='reacher-cs285-v0', 5 | entry_point='cs285.envs.reacher:Reacher7DOFEnv', 6 | max_episode_steps=500, 7 | ) 8 | from cs285.envs.reacher.reacher_env import Reacher7DOFEnv 9 | -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/reacher/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/reacher/__pycache__/__init__.cpython-37.pyc 
-------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__pycache__/reacher_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/reacher/__pycache__/reacher_env.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__pycache__/reacher_env.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/reacher/__pycache__/reacher_env.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/reacher_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | from mujoco_py import MjViewer 5 | import os 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | 10 | # placeholder 11 | self.hand_sid = -2 12 | self.target_sid = -1 13 | 14 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 15 | mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2) 16 | utils.EzPickle.__init__(self) 17 | self.observation_dim = 26 18 | self.action_dim = 7 19 | 20 | self.hand_sid = self.model.site_name2id("finger") 21 | self.target_sid = self.model.site_name2id("target") 22 | self.skip = self.frame_skip 23 | 24 | 25 | def _get_obs(self): 26 | return np.concatenate([ 27 | self.data.qpos.flat, #[7] 28 | self.data.qvel.flatten() / 10., #[7] 29 | self.data.site_xpos[self.hand_sid], #[3] 30 | self.model.site_pos[self.target_sid], #[3] 31 | ]) 32 | 33 | def step(self, a): 34 | 35 | self.do_simulation(a, self.frame_skip) 36 | ob = self._get_obs() 37 | reward, done = self.get_reward(ob, a) 38 | 39 | score = self.get_score(ob) 40 | 41 | # finalize step 42 | env_info = {'ob': ob, 43 | 'rewards': self.reward_dict, 44 | 'score': score} 45 | 46 | return ob, reward, done, env_info 47 | 48 | def get_score(self, obs): 49 | hand_pos = obs[-6:-3] 50 | target_pos = obs[-3:] 51 | score = -1*np.abs(hand_pos-target_pos) 52 | return score 53 | 54 | def get_reward(self, observations, actions): 55 | 56 | """get reward/s of given (observations, actions) datapoint or datapoints 57 | 58 | Args: 59 | observations: (batchsize, obs_dim) or (obs_dim,) 60 | actions: (batchsize, ac_dim) or (ac_dim,) 61 | 62 | Return: 63 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 64 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 65 | """ 66 | 67 | #initialize and reshape as needed, for batch mode 68 | self.reward_dict = {} 69 | if(len(observations.shape)==1): 70 | observations = np.expand_dims(observations, axis = 0) 71 | actions = np.expand_dims(actions, axis = 0) 72 | batch_mode = False 73 | else: 74 | batch_mode = True 75 | 76 | #get vars 77 | hand_pos = observations[:, -6:-3] 78 | target_pos = observations[:, -3:] 79 | 80 | #calc rew 81 | dist = np.linalg.norm(hand_pos - target_pos, axis=1) 82 | self.reward_dict['r_total'] = -10*dist 83 | 84 | #done is always false for this env 85 | dones = np.zeros((observations.shape[0],)) 86 | 87 | #return 88 | if(not batch_mode): 89 | return self.reward_dict['r_total'][0], 
dones[0] 90 | return self.reward_dict['r_total'], dones 91 | 92 | def reset(self): 93 | _ = self.reset_model() 94 | 95 | self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1] 96 | 97 | observation, _reward, done, _info = self.step(np.zeros(7)) 98 | ob = self._get_obs() 99 | 100 | return ob 101 | 102 | def reset_model(self, seed=None): 103 | if seed is not None: 104 | self.seed(seed) 105 | 106 | self.reset_pose = self.init_qpos.copy() 107 | self.reset_vel = self.init_qvel.copy() 108 | 109 | self.reset_goal = np.zeros(3) 110 | self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3) 111 | self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2) 112 | self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25) 113 | 114 | return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal) 115 | 116 | def do_reset(self, reset_pose, reset_vel, reset_goal): 117 | 118 | self.set_state(reset_pose, reset_vel) 119 | 120 | #reset target 121 | self.reset_goal = reset_goal.copy() 122 | self.model.site_pos[self.target_sid] = self.reset_goal 123 | self.sim.forward() 124 | 125 | #return 126 | return self._get_obs() -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/logger.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure.utils import * 4 | 5 | class ReplayBuffer(object): 6 | 7 | def __init__(self, max_size=1000000): 8 | 9 | self.max_size = max_size 10 | self.paths = [] 11 | self.obs = None 12 | self.acs = None 13 | self.concatenated_rews = None 14 | self.unconcatenated_rews = None 15 | self.next_obs = None 16 | self.terminals = None 17 | 18 | def add_rollouts(self, paths, noised=False): 19 | 20 | # add new rollouts into our list of rollouts 21 | for path in paths: 22 | self.paths.append(path) 23 | 24 | # convert new rollouts into their component arrays, and append them onto our arrays 25 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 26 | 27 | if noised: 28 | observations = add_noise(observations) 29 | next_observations = add_noise(next_observations) 30 | 31 | if self.obs is None: 32 | self.obs = observations[-self.max_size:] 33 | self.acs = actions[-self.max_size:] 34 | self.next_obs = next_observations[-self.max_size:] 35 | self.terminals = terminals[-self.max_size:] 36 | self.concatenated_rews = concatenated_rews[-self.max_size:] 37 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 38 | else: 39 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 40 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 41 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 42 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 43 | self.concatenated_rews = np.concatenate([self.concatenated_rews, concatenated_rews])[-self.max_size:] 44 | if isinstance(unconcatenated_rews, list): 45 | self.unconcatenated_rews += unconcatenated_rews 46 | else: 47 | self.unconcatenated_rews.append(unconcatenated_rews) 48 | 49 | ######################################## 50 | ######################################## 51 | 52 | def sample_random_rollouts(self, num_rollouts): 53 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 54 | return self.paths[rand_indices] 55 | 56 | def sample_recent_rollouts(self, num_rollouts=1): 57 | return self.paths[-num_rollouts:] 58 | 59 | ######################################## 60 | ######################################## 61 | 62 | def sample_random_data(self, batch_size): 63 | 64 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 65 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 66 | return 
self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 67 | 68 | def sample_recent_data(self, batch_size=1, concat_rew=True): 69 | 70 | if concat_rew: 71 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 72 | else: 73 | num_recent_rollouts_to_return = 0 74 | num_datapoints_so_far = 0 75 | index = -1 76 | while num_datapoints_so_far < batch_size: 77 | recent_rollout = self.paths[index] 78 | index -=1 79 | num_recent_rollouts_to_return +=1 80 | num_datapoints_so_far += get_pathlength(recent_rollout) 81 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 82 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 83 | return observations, actions, unconcatenated_rews, next_observations, terminals -------------------------------------------------------------------------------- /hw4/cs285/models/__pycache__/ff_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/models/__pycache__/ff_model.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/models/ff_model.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import normalize, unnormalize, MLP 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | 6 | class FFModel: 7 | def __init__(self, ac_dim, ob_dim, n_layers, size, device, learning_rate = 0.001): 8 | # init vars 9 | self.device = device 10 | 11 | #TODO - specify ouput dim and input dim of delta func MLP 12 | self.delta_func = MLP(input_dim = TODO, 13 | output_dim = TODO, 14 | n_layers = n_layers, 15 | size = size, 16 | device = self.device, 17 | discrete = True) 18 | 19 | #TODO - define the delta func optimizer. Adam optimizer will work well. 
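# (Sketch for the TODO on the next line, not part of the starter code: assuming the
#  hw4 MLP helper exposes .parameters() like the hw3 nn.Module version shown earlier
#  in this repo, one reasonable choice would be
#  torch.optim.Adam(self.delta_func.parameters(), lr=learning_rate).)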
20 | self.optimizer = TODO 21 | 22 | ############################# 23 | 24 | def get_prediction(self, obs, acs, data_statistics): 25 | if len(obs.shape) == 1 or len(acs.shape) == 1: 26 | obs = np.squeeze(obs)[None] 27 | acs = np.squeeze(acs)[None] 28 | 29 | # TODO(Q1) normalize the obs and acs above using the normalize function and data_statistics 30 | norm_obs = TODO 31 | norm_acs = TODO 32 | 33 | norm_input = torch.Tensor(np.concatenate((norm_obs, norm_acs), axis = 1)).to(self.device) 34 | norm_delta = self.delta_func(norm_input).cpu().detach().numpy() 35 | 36 | # TODO(Q1) Unnormalize the the norm_delta above using the unnormalize function and data_statistics 37 | delta = TODO 38 | # TODO(Q1) Return the predited next observation (You will use obs and delta) 39 | return TODO 40 | 41 | def update(self, observations, actions, next_observations, data_statistics): 42 | # TODO(Q1) normalize the obs and acs above using the normalize function and data_statistics (same as above) 43 | norm_obs = TODO 44 | norm_acs = TODO 45 | 46 | pred_delta = self.delta_func(torch.Tensor(np.concatenate((norm_obs, norm_acs), axis = 1)).to(self.device)) 47 | # TODO(Q1) Define a normalized true_delta using observations, next_observations and the delta stats from data_statistics 48 | true_delta = TODO 49 | 50 | # TODO(Q1) Define a loss function that takes as input normalized versions of predicted change in state and true change in state 51 | loss = TODO 52 | self.optimizer.zero_grad() 53 | loss.backward() 54 | self.optimizer.step() 55 | 56 | return loss.item() 57 | -------------------------------------------------------------------------------- /hw4/cs285/policies/MPC_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class MPCPolicy: 4 | def __init__(self, 5 | env, 6 | ac_dim, 7 | dyn_models, 8 | horizon, 9 | N, 10 | **kwargs): 11 | super().__init__(**kwargs) 12 | 13 | # init vars 14 | self.env = env 15 | self.dyn_models = dyn_models 16 | self.horizon = horizon 17 | self.N = N 18 | self.data_statistics = None # NOTE must be updated from elsewhere 19 | 20 | self.ob_dim = self.env.observation_space.shape[0] 21 | 22 | # action space 23 | self.ac_space = self.env.action_space 24 | self.ac_dim = ac_dim 25 | self.low = self.ac_space.low 26 | self.high = self.ac_space.high 27 | 28 | def sample_action_sequences(self, num_sequences, horizon): 29 | # TODO(Q1) uniformly sample trajectories and return an array of 30 | # dimensions (num_sequences, horizon, self.ac_dim) 31 | return random_action_sequences 32 | 33 | def get_action(self, obs): 34 | 35 | if self.data_statistics is None: 36 | # print("WARNING: performing random actions.") 37 | return self.sample_action_sequences(num_sequences=1, horizon=1)[0] 38 | 39 | #sample random actions (Nxhorizon) 40 | candidate_action_sequences = self.sample_action_sequences(num_sequences=self.N, horizon=self.horizon) 41 | 42 | # a list you can use for storing the predicted reward for each candidate sequence 43 | predicted_rewards_per_ens = [] 44 | 45 | for model in self.dyn_models: 46 | pass 47 | # TODO(Q2) 48 | 49 | # for each candidate action sequence, predict a sequence of 50 | # states for each dynamics model in your ensemble 51 | 52 | # once you have a sequence of predicted states from each model in your 53 | # ensemble, calculate the reward for each sequence using self.env.get_reward (See files in envs to see how to call this) 54 | 55 | # calculate mean_across_ensembles(predicted rewards). 
56 | # the matrix dimensions should change as follows: [ens,N] --> N 57 | predicted_rewards = None # TODO(Q2) 58 | 59 | # pick the action sequence and return the 1st element of that sequence 60 | best_index = None #TODO(Q2) 61 | best_action_sequence = None #TODO(Q2) 62 | action_to_take = None # TODO(Q2) 63 | return action_to_take[None] # the None is for matching expected dimensions 64 | -------------------------------------------------------------------------------- /hw4/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw4/cs285/policies/__pycache__/MPC_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/policies/__pycache__/MPC_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/policies/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/scripts/run_hw4_mb.py: -------------------------------------------------------------------------------- 1 | #If not using anaconda use next two lines: 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import torch 6 | import os 7 | import time 8 | from cs285.infrastructure.rl_trainer import RL_Trainer 9 | from cs285.agents.mb_agent import MBAgent 10 | 11 | 12 | class MB_Trainer(object): 13 | 14 | def __init__(self, params): 15 | 16 | ##################### 17 | ## SET AGENT PARAMS 18 | ##################### 19 | 20 | computation_graph_args = { 21 | 'ensemble_size': params['ensemble_size'], 22 | 'n_layers': params['n_layers'], 23 | 'size': params['size'], 24 | 'learning_rate': params['learning_rate'], 25 | 'device': params['device'], 26 | } 27 | 28 | train_args = { 29 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 30 | } 31 | 32 | controller_args = { 33 | 'mpc_horizon': params['mpc_horizon'], 34 | 'mpc_num_action_sequences': params['mpc_num_action_sequences'], 35 | } 36 | 37 | agent_params = {**computation_graph_args, **train_args, **controller_args} 38 | 39 | self.params = params 40 | self.params['agent_class'] = MBAgent 41 | self.params['agent_params'] = agent_params 42 | 43 | ################ 44 | ## RL TRAINER 45 | ################ 46 | 47 | self.rl_trainer = RL_Trainer(self.params) 48 | 49 | def run_training_loop(self): 50 | 51 | self.rl_trainer.run_training_loop( 52 | self.params['n_iter'], 53 | collect_policy = self.rl_trainer.agent.actor, 54 | eval_policy = self.rl_trainer.agent.actor, 55 | ) 56 | 57 | 58 | def main(): 59 | 60 | import argparse 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('--env_name', type=str) #reacher-cs285-v0, ant-cs285-v0, cheetah-cs285-v0, obstacles-cs285-v0 63 | parser.add_argument('--ep_len', type=int, default=200) 64 | parser.add_argument('--exp_name', type=str, default='todo') 65 | parser.add_argument('--n_iter', '-n', type=int, default=20) 66 | 67 | parser.add_argument('--ensemble_size', '-e', type=int, default=3) 68 | 
parser.add_argument('--mpc_horizon', type=int, default=10) 69 | parser.add_argument('--mpc_num_action_sequences', type=int, default=1000) 70 | 71 | parser.add_argument('--add_sl_noise', '-noise', action='store_true') 72 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1000) 73 | parser.add_argument('--batch_size_initial', type=int, default=20000) #(random) steps collected on 1st iteration (put into replay buffer) 74 | parser.add_argument('--batch_size', '-b', type=int, default=8000) #steps collected per train iteration (put into replay buffer) 75 | parser.add_argument('--train_batch_size', '-tb', type=int, default=512) ##steps used per gradient step (used for training) 76 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 77 | 78 | parser.add_argument('--learning_rate', '-lr', type=float, default=0.001) 79 | parser.add_argument('--n_layers', '-l', type=int, default=2) 80 | parser.add_argument('--size', '-s', type=int, default=250) 81 | 82 | parser.add_argument('--seed', type=int, default=1) 83 | parser.add_argument('--use_gpu', '-gpu', default = True) 84 | parser.add_argument('--which_gpu', type=int, default=0) 85 | parser.add_argument('--video_log_freq', type=int, default=1) #-1 to disable 86 | parser.add_argument('--scalar_log_freq', type=int, default=1) #-1 to disable 87 | parser.add_argument('--save_params', action='store_true') 88 | args = parser.parse_args() 89 | 90 | # convert to dictionary 91 | params = vars(args) 92 | 93 | if torch.cuda.is_available() and params["use_gpu"]: 94 | which_gpu = "cuda:" + str(params["which_gpu"]) 95 | params["device"] = torch.device(which_gpu) 96 | print("Pytorch is running on GPU", params["which_gpu"]) 97 | else: 98 | params["device"] = torch.device("cpu") 99 | print("Pytorch is running on the CPU") 100 | 101 | # HARDCODE EPISODE LENGTHS FOR THE ENVS USED IN THIS MB ASSIGNMENT 102 | if params['env_name']=='reacher-cs285-v0': 103 | params['ep_len']=200 104 | if params['env_name']=='cheetah-cs285-v0': 105 | params['ep_len']=500 106 | if params['env_name']=='obstacles-cs285-v0': 107 | params['ep_len']=100 108 | 109 | ################################## 110 | ### CREATE DIRECTORY FOR LOGGING 111 | ################################## 112 | 113 | logdir_prefix = 'mb_' 114 | 115 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 116 | 117 | if not (os.path.exists(data_path)): 118 | os.makedirs(data_path) 119 | 120 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 121 | logdir = os.path.join(data_path, logdir) 122 | params['logdir'] = logdir 123 | if not(os.path.exists(logdir)): 124 | os.makedirs(logdir) 125 | 126 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 127 | 128 | ################### 129 | ### RUN TRAINING 130 | ################### 131 | 132 | trainer = MB_Trainer(params) 133 | trainer.run_training_loop() 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /hw4/cs285_hw4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285_hw4.pdf -------------------------------------------------------------------------------- /hw4/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from 
setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw5/README.txt: -------------------------------------------------------------------------------- 1 | 1) The code structure for this homework was heavily modified in order to match the structure of the previous three homeworks. 2 | Because of this, the PDF does not always give accurate file locations, but it should still be referred to for questions and guidance. 3 | The logging procedure in particular was changed to match the previous assignments. 4 | 5 | 2) Code: 6 | 7 | Code to look at: 8 | 9 | - scripts/train_ac_exploration_f18.py 10 | - envs/pointmass.py 11 | - infrastructure/rl_trainer.py (Has been changed for this homework) 12 | - infrastructure/utils.py (Has been changed for this homework) 13 | 14 | Code to fill in as part of HW: 15 | 16 | - agents/ac_agent.py (new Exploratory_ACAgent class added) 17 | - exploration/exploration.py 18 | - exploration/density_model.py 19 | 20 | 3) Commands to run each experiment: 21 | 22 | ########################## 23 | ### P1 Hist PointMass ### 24 | ########################## 25 | 26 | python cs285/scripts/train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model none -s 8 --exp_name PM_bc0_s8 27 | python cs285/scripts/train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model hist -bc 0.01 -s 8 --exp_name PM_hist_bc0.01_s8 28 | 29 | ########################## 30 | ### P2 RBF PointMass ### 31 | ########################## 32 | 33 | python cs285/scripts/train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model rbf -bc 0.01 -s 8 -sig 0.2 --exp_name PM_rbf_bc0.01_s8_sig0.2 34 | 35 | ########################## 36 | ### P3 EX2 PointMass ### 37 | ########################## 38 | 39 | python cs285/scripts/train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model ex2 -s 8 -bc 0.05 -kl 0.1 -dlr 0.001 -dh 8 --exp_name PM_ex2_s8_bc0.05_kl0.1_dlr0.001_dh8 40 | 41 | ########################### 42 | ### P4 HalfCheetah ### 43 | ########################### 44 | 45 | python cs285/scripts/train_ac_exploration_f18.py sparse-cheetah-cs285-v1 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model none --exp_name HC_bc0 46 | python cs285/scripts/train_ac_exploration_f18.py sparse-cheetah-cs285-v1 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.001 -kl 0.1 -dlr 0.005 -dti 1000 --exp_name HC_bc0.001_kl0.1_dlr0.005_dti1000 47 | python cs285/scripts/train_ac_exploration_f18.py sparse-cheetah-cs285-v1 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.0001 -kl 0.1 -dlr 0.005 -dti 10000 --exp_name HC_bc0.0001_kl0.1_dlr0.005_dti10000 48 | 49 | 50 | 4) Visualize saved tensorboard event file: 51 | 52 | $ cd cs285/data/ 53 | $ tensorboard --logdir .
54 | 55 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) 56 | -------------------------------------------------------------------------------- /hw5/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/agents/__pycache__/ac_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/agents/__pycache__/ac_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/critics/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/critics/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/critics/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/critics/__pycache__/bootstrapped_continuous_critic.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/critics/__pycache__/bootstrapped_continuous_critic.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/critics/bootstrapped_continuous_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from cs285.infrastructure.utils import MLP 4 | 5 | class BootstrappedContinuousCritic: 6 | def __init__(self, hparams): 7 | self.ob_dim = hparams['ob_dim'] 8 | self.ac_dim = hparams['ac_dim'] 9 | self.size = hparams['size'] 10 | self.n_layers = hparams['n_layers'] 11 | self.device = hparams['device'] 12 | self.learning_rate = hparams['learning_rate'] 13 | self.num_target_updates = hparams['num_target_updates'] 14 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] 15 | self.gamma = hparams['gamma'] 16 | 17 | self.value_func = MLP(self.ob_dim, 1, self.n_layers, self.size, self.device, True) 18 | self.optimizer = torch.optim.Adam(self.value_func.parameters(), lr = self.learning_rate) 19 | 20 | def update(self, ob_no, next_ob_no, re_n, terminal_n): 21 | ''' 22 | ts_ob_no, ts_next_ob_no, ts_re_n, ts_terminal_n = map(lambda x: torch.Tensor(x).to(self.device), 23 | [ob_no, next_ob_no, re_n, terminal_n]) 24 | for _ in range(self.num_target_updates): 25 | with torch.no_grad(): 26 | ts_next_V_n = self.value_func(ts_next_ob_no).view(-1) 27 | ts_target_n = 
ts_re_n + (1 - ts_terminal_n) * self.gamma * ts_next_V_n 28 | for _ in range(self.num_grad_steps_per_target_update): 29 | ts_V_n = self.value_func(ts_ob_no).view(-1) 30 | self.optimizer.zero_grad() 31 | loss = nn.functional.mse_loss(ts_V_n, ts_target_n) 32 | loss.backward() 33 | self.optimizer.step() 34 | ''' 35 | ob, next_ob, rew, done = map(lambda x: torch.Tensor(x).to(self.device), [ob_no, next_ob_no, re_n, terminal_n]) 36 | 37 | for update in range(self.num_grad_steps_per_target_update * self.num_target_updates): 38 | if update % self.num_grad_steps_per_target_update == 0: 39 | next_value = self.value_func(next_ob).squeeze() * (1 - done) 40 | target_value = rew + self.gamma * next_value 41 | 42 | self.optimizer.zero_grad() 43 | loss = nn.functional.mse_loss(self.value_func(ob).squeeze(), target_value) 44 | loss.backward() 45 | self.optimizer.step() 46 | target_value.detach_() 47 | #''' 48 | 49 | return loss 50 | -------------------------------------------------------------------------------- /hw5/cs285/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | register( 3 | id='sparse-cheetah-cs285-v1', 4 | entry_point='cs285.envs.sparse_half_cheetah:HalfCheetahEnv', 5 | max_episode_steps=1000, 6 | ) 7 | from cs285.envs.sparse_half_cheetah import HalfCheetahEnv 8 | -------------------------------------------------------------------------------- /hw5/cs285/envs/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/envs/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/envs/__pycache__/pointmass.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/envs/__pycache__/pointmass.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/envs/__pycache__/sparse_half_cheetah.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/envs/__pycache__/sparse_half_cheetah.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/envs/pointmass.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.envs.registration import EnvSpec 3 | import imageio 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import os 7 | import seaborn as sns 8 | from tqdm import tqdm 9 | 10 | class Env(object): 11 | def __init__(self): 12 | super(Env, self).__init__() 13 | 14 | def reset(self): 15 | raise NotImplementedError 16 | 17 | def step(self, action): 18 | raise NotImplementedError 19 | 20 | def seed(self, seed): 21 | raise NotImplementedError 22 | 23 | class PointMass(Env): 24 | def __init__(self, max_episode_steps_coeff=1, scale=20, goal_padding=2.0): 25 | super(PointMass, self).__init__() 26 | # define scale such that the each square in the grid is 1 x 1 27 | self.scale = int(scale) 28 | self.grid_size = self.scale * self.scale 29 | self.observation_space = gym.spaces.Box( 
30 | low=np.array([0.0, 0.0]), 31 | high=np.array([1.0, 1.0])) 32 | self.action_space = gym.spaces.Box( 33 | low=np.array([-np.inf, -np.inf]), 34 | high=np.array([np.inf, np.inf])) 35 | self.goal_padding = goal_padding 36 | self.spec = EnvSpec(id='PointMass-v0', max_episode_steps=int(max_episode_steps_coeff*self.scale)) 37 | 38 | def reset(self): 39 | plt.close() 40 | self.state = np.array([self.goal_padding, self.goal_padding]) 41 | state = self.state/self.scale 42 | return state 43 | 44 | def step(self, action): 45 | x, y = action 46 | 47 | # next state 48 | new_x = self.state[0]+x 49 | new_y = self.state[1]+y 50 | if new_x < 0: 51 | new_x = 0 52 | if new_x > self.scale: 53 | new_x = self.scale 54 | if new_y < 0: 55 | new_y = 0 56 | if new_y > self.scale: 57 | new_y = self.scale 58 | self.state = np.array([new_x, new_y]) 59 | state = self.state/self.scale 60 | 61 | # reward 62 | reg_term = -0.01*np.sum(action**2) 63 | 64 | threshold = self.scale - self.goal_padding 65 | if new_x > threshold and new_y > threshold: 66 | reward = 10 + reg_term 67 | else: 68 | reward = 0 + reg_term 69 | 70 | # done 71 | done = False 72 | 73 | return state, reward, done, None 74 | 75 | def preprocess(self, state): 76 | scaled_state = self.scale * state 77 | x_floor, y_floor = np.floor(scaled_state) 78 | assert x_floor <= self.scale 79 | assert y_floor <= self.scale 80 | if x_floor == self.scale: 81 | x_floor -= 1 82 | if y_floor == self.scale: 83 | y_floor -= 1 84 | index = self.scale*x_floor + y_floor 85 | return index 86 | 87 | def unprocess(self, index): 88 | x_floor = index // self.scale 89 | y_floor = index % self.scale 90 | unscaled_state = np.array([x_floor, y_floor])/self.scale 91 | return unscaled_state 92 | 93 | def seed(self, seed): 94 | pass 95 | 96 | def render(self): 97 | # create a grid 98 | states = [self.state/self.scale] 99 | indices = np.array([int(self.preprocess(s)) for s in states]) 100 | a = np.zeros(self.grid_size) 101 | for i in indices: 102 | a[i] += 1 103 | max_freq = np.max(a) 104 | a/=float(max_freq) # normalize 105 | a = np.reshape(a, (self.scale, self.scale)) 106 | ax = sns.heatmap(a) 107 | plt.draw() 108 | plt.pause(0.001) 109 | plt.clf() 110 | 111 | def visualize(self, states, itr, dirname): 112 | if states is None: 113 | states = np.load(os.path.join(dirname, '{}.npy'.format(itr))) 114 | indices = np.array([int(self.preprocess(s)) for s in states]) 115 | a = np.zeros(int(self.grid_size)) 116 | for i in indices: 117 | a[i] += 1 118 | max_freq = np.max(a) 119 | a/=float(max_freq) # normalize 120 | a = np.reshape(a, (self.scale, self.scale)) 121 | ax = sns.heatmap(a) 122 | plt.savefig(os.path.join(dirname, '{}.png'.format(itr))) 123 | plt.close() 124 | 125 | def create_gif(self, dirname, density=False): 126 | images = [] 127 | if density: 128 | filenames = [x for x in os.listdir(dirname) if '_density.png' in x] 129 | sorted_fnames = sorted(filenames, key=lambda x: int(x.split('_density.png')[0])) 130 | else: 131 | filenames = [x for x in os.listdir(dirname) if ('.png' in x and 'density' not in x)] 132 | sorted_fnames = sorted(filenames, key=lambda x: int(x.split('.png')[0])) 133 | for f in sorted_fnames: 134 | images.append(imageio.imread(os.path.join(dirname, f))) 135 | imageio.mimsave(os.path.join(dirname, 'hist_exploration.gif'), images) 136 | 137 | def create_visualization(self, dirname, density=False): 138 | for s in os.listdir(dirname): 139 | for i in tqdm(range(100)): 140 | self.visualize(None, i, os.path.join(dirname, str(s))) 141 | self.create_gif(os.path.join(dirname, 
str(s))) 142 | 143 | 144 | if __name__ == "__main__": 145 | logdir = 'pm_debug' 146 | os.mkdir(logdir) 147 | num_episodes = 10 148 | num_steps_per_epoch = 20 149 | 150 | env = PointMass() 151 | for epoch in range(num_episodes): 152 | states = [] 153 | state = env.reset() 154 | for i in range(num_steps_per_epoch): 155 | action = np.random.rand(2) 156 | state, reward, done, _ = env.step(action) 157 | states.append(state) 158 | env.visualize(np.array(states), epoch, logdir) 159 | env.create_gif(logdir) 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /hw5/cs285/envs/sparse_half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 8 | utils.EzPickle.__init__(self) 9 | 10 | def step(self, action): 11 | ################################################# 12 | ctrl = False 13 | relu = False 14 | threshold = 10.0 15 | ################################################# 16 | xposbefore = self.sim.data.qpos[0] 17 | self.do_simulation(action, self.frame_skip) 18 | xposafter = self.sim.data.qpos[0] 19 | ob = self._get_obs() 20 | # reward_ctrl = - 0.1 * np.square(action).sum() 21 | # reward_run = (xposafter - xposbefore)/self.dt 22 | ################################################# 23 | if ctrl: 24 | reward_ctrl = - 0.1 * np.square(action).sum() 25 | else: 26 | reward_ctrl = 0 27 | if abs(xposafter) <= threshold: 28 | reward_run = 0.0 29 | else: 30 | if relu: 31 | reward_run = np.sign(xposafter)*(xposafter - xposbefore)/self.dt 32 | else: 33 | reward_run = 1.0 34 | ################################################# 35 | reward = reward_ctrl + reward_run 36 | done = False 37 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 38 | 39 | def _get_obs(self): 40 | return np.concatenate([ 41 | self.sim.data.qpos.flat[1:], 42 | self.sim.data.qvel.flat, 43 | ]) 44 | 45 | def reset_model(self): 46 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 47 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 48 | self.set_state(qpos, qvel) 49 | return self._get_obs() 50 | 51 | def viewer_setup(self): 52 | self.viewer.cam.distance = self.model.stat.extent * 0.5 53 | -------------------------------------------------------------------------------- /hw5/cs285/exploration/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/exploration/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/exploration/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/exploration/__pycache__/density_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/exploration/__pycache__/density_model.cpython-37.pyc 
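A note on the exploration side of this assignment: PointMass.preprocess above already discretizes a normalized state into one of scale*scale grid cells, which is the kind of binning a simple count-based ("hist") bonus is built on. The sketch below is illustrative only -- the graded versions belong in exploration/density_model.py and exploration/exploration.py, the helper name hist_bonus is made up here, and it assumes the plotting dependencies imported by pointmass.py (matplotlib, seaborn, imageio, tqdm) are installed.

import numpy as np
from cs285.envs.pointmass import PointMass

env = PointMass(scale=20)
counts = np.zeros(env.grid_size)              # one visitation counter per grid cell

def hist_bonus(state, bonus_coeff=0.01):
    idx = int(env.preprocess(state))          # bin the normalized state into a cell index
    counts[idx] += 1
    p_hat = counts[idx] / counts.sum()        # empirical visitation probability of that cell
    return -bonus_coeff * np.log(p_hat)       # rarely visited cells receive a larger bonus

state = env.reset()
for _ in range(100):
    state, reward, done, _ = env.step(np.random.rand(2))
    reward += hist_bonus(state)               # exploration-augmented reward the agent would train on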
-------------------------------------------------------------------------------- /hw5/cs285/exploration/__pycache__/exploration.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/exploration/__pycache__/exploration.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/replay.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/replay.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | 
print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0]<max_length: 44 | padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1)) 45 | videos[i] = np.concatenate([videos[i], padding], 0) 46 | 47 | # log videos to tensorboard event file 48 | videos = np.stack(videos[:max_videos_to_save], 0) 49 | self.log_video(videos, video_title, step, fps=fps) 50 | 51 | def log_figures(self, figure, name, step, phase): 52 | """figure: matplotlib.pyplot figure handle""" 53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!" 54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | from cs285.infrastructure.utils import * 4 | 5 | class ReplayBuffer(object): 6 | 7 | def __init__(self, max_size=1000000): 8 | 9 | self.max_size = max_size 10 | self.paths = [] 11 | self.obs = None 12 | self.acs = None 13 | self.concatenated_rews = None 14 | self.unconcatenated_rews = None 15 | self.next_obs = None 16 | self.terminals = None 17 | 18 | def add_rollouts(self, paths, noised = False): 19 | 20 | # add new rollouts into our list of rollouts 21 | for path in paths: 22 | self.paths.append(path) 23 | 24 | # convert new rollouts into their component arrays, and append them onto our arrays 25 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 26 | 27 | if noised: 28 | observations = add_noise(observations) 29 | next_observations = add_noise(next_observations) 30 | 31 | if self.obs is None: 32 | self.obs = observations[-self.max_size:] 33 | self.acs = actions[-self.max_size:] 34 | self.next_obs =
next_observations[-self.max_size:] 35 | self.terminals = terminals[-self.max_size:] 36 | self.concatenated_rews = concatenated_rews[-self.max_size:] 37 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 38 | else: 39 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 40 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 41 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 42 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 43 | self.concatenated_rews = np.concatenate([self.concatenated_rews, concatenated_rews])[-self.max_size:] 44 | if isinstance(unconcatenated_rews, list): 45 | self.unconcatenated_rews += unconcatenated_rews 46 | else: 47 | self.unconcatenated_rews.append(unconcatenated_rews) 48 | 49 | ######################################## 50 | ######################################## 51 | 52 | def sample_random_rollouts(self, num_rollouts): 53 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 54 | return self.paths[rand_indices] 55 | 56 | def sample_recent_rollouts(self, num_rollouts=1): 57 | return self.paths[-num_rollouts:] 58 | 59 | def get_all_obs(self): 60 | return copy.deepcopy(self.obs) 61 | 62 | ######################################## 63 | ######################################## 64 | 65 | def sample_random_data(self, batch_size): 66 | 67 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 68 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 69 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 70 | 71 | def sample_recent_data(self, batch_size=1, concat_rew=True): 72 | 73 | if concat_rew: 74 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 75 | else: 76 | num_recent_rollouts_to_return = 0 77 | num_datapoints_so_far = 0 78 | index = -1 79 | while num_datapoints_so_far < batch_size: 80 | recent_rollout = self.paths[index] 81 | index -=1 82 | num_recent_rollouts_to_return +=1 83 | num_datapoints_so_far += get_pathlength(recent_rollout) 84 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 85 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 86 | return observations, actions, unconcatenated_rews, next_observations, terminals 87 | 88 | def __len__(self): 89 | return len(self.obs) 90 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import torch 4 | from torch import nn 5 | 6 | class MLP(nn.Module): 7 | def __init__(self, 8 | input_dim, 9 | output_dim, 10 | n_layers, 11 | size, 12 | device, 13 | discrete, 14 | activation = nn.Tanh()): 15 | super().__init__() 16 | 17 | self.discrete = discrete 18 | 19 | # network architecture 20 | self.mlp = nn.ModuleList() 21 | self.mlp.append(nn.Linear(input_dim, size)) #first hidden layer 22 | self.mlp.append(activation) 23 | 24 | for h in range(n_layers - 1): #additional hidden layers 25 | self.mlp.append(nn.Linear(size, size)) 26 | self.mlp.append(activation) 27 | 28 | self.mlp.append(nn.Linear(size, 
output_dim)) #output layer, no activation function 29 | 30 | #if continuous define logstd variable 31 | if not self.discrete: 32 | self.logstd = nn.Parameter(torch.zeros(output_dim)) 33 | 34 | self.to(device) 35 | 36 | def forward(self, x): 37 | for layer in self.mlp: 38 | x = layer(x) 39 | if self.discrete: 40 | return x 41 | else: 42 | return (x, self.logstd.exp()) 43 | 44 | def save(self, filepath): 45 | torch.save(self.state_dict(), filepath) 46 | 47 | def restore(self, filepath): 48 | self.load_state_dict(torch.load(filepath)) 49 | 50 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, animate, itr): 51 | # Collect paths until we have enough timesteps 52 | timesteps_this_batch = 0 53 | paths = [] 54 | while True: 55 | #animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) 56 | animate_this_episode = (len(paths) == 0 and animate) 57 | path = sample_trajectory(env, policy, max_path_length, animate_this_episode) 58 | paths.append(path) 59 | timesteps_this_batch += get_pathlength(path) 60 | if timesteps_this_batch > min_timesteps_per_batch: 61 | break 62 | return paths, timesteps_this_batch 63 | 64 | def sample_trajectory(env, policy, max_path_length, animate_this_episode): 65 | ob = env.reset() 66 | obs, acs, rewards, next_obs, terminals = [], [], [], [], [] 67 | steps = 0 68 | while True: 69 | if animate_this_episode: 70 | env.render() 71 | time.sleep(0.1) 72 | 73 | obs.append(ob) 74 | ac = policy.get_action(ob) 75 | acs.append(ac) 76 | 77 | ob, rew, done, _ = env.step(ac) 78 | 79 | steps += 1 80 | next_obs.append(ob) 81 | rewards.append(rew) 82 | 83 | if done or steps > max_path_length: 84 | terminals.append(1) 85 | break 86 | else: 87 | terminals.append(0) 88 | 89 | path = {"observation" : np.array(obs, dtype=np.float32), 90 | "reward" : np.array(rewards, dtype=np.float32), 91 | "action" : np.array(acs, dtype=np.float32), 92 | "next_observation": np.array(next_obs, dtype=np.float32), 93 | "terminal": np.array(terminals, dtype=np.float32)} 94 | 95 | return path 96 | 97 | def convert_listofrollouts(paths): 98 | """ 99 | Take a list of rollout dictionaries 100 | and return separate arrays, 101 | where each array is a concatenation of that array from across the rollouts 102 | """ 103 | observations = np.concatenate([path["observation"] for path in paths]) 104 | actions = np.concatenate([path["action"] for path in paths]) 105 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 106 | terminals = np.concatenate([path["terminal"] for path in paths]) 107 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 108 | unconcatenated_rewards = [path["reward"] for path in paths] 109 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 110 | 111 | def get_pathlength(path): 112 | return len(path["reward"]) 113 | -------------------------------------------------------------------------------- /hw5/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from cs285.infrastructure.utils import MLP 5 | 6 | class MLPPolicy: 7 | def __init__(self, 8 | ac_dim, 9 | ob_dim, 10 | n_layers, 11 | size, 12 | device, 13 | learning_rate, 14 | training=True, 15 | discrete=False, 16 | nn_baseline=False, 17 | **kwargs): 18 | super().__init__() 19 | 20 | # init vars 21 | self.device = device 22 | self.discrete = discrete 23 | self.training = 
training 24 | self.nn_baseline = nn_baseline 25 | 26 | # network architecture 27 | self.policy_mlp = MLP(ob_dim, ac_dim, n_layers, size, device, discrete) 28 | params = list(self.policy_mlp.parameters()) 29 | if self.nn_baseline: 30 | self.baseline_mlp = MLP(ob_dim, 1, n_layers, size, device, True) 31 | params += list(self.baseline_mlp.parameters()) 32 | 33 | #optimizer 34 | if self.training: 35 | self.optimizer = torch.optim.Adam(params, lr = learning_rate) 36 | 37 | ################################## 38 | 39 | # update/train this policy 40 | def update(self, observations, actions): 41 | raise NotImplementedError 42 | 43 | # query the neural net that's our 'policy' function, as defined by an mlp above 44 | # query the policy with observation(s) to get selected action(s) 45 | def get_action(self, obs): 46 | output = self.policy_mlp(torch.Tensor(obs).to(self.device)) 47 | if self.discrete: 48 | action_probs = nn.functional.log_softmax(output).exp() 49 | return torch.multinomial(action_probs, num_samples = 1).cpu().detach().numpy()[0] 50 | else: 51 | return torch.normal(output[0], output[1]).cpu().detach().numpy() 52 | 53 | def get_log_prob(self, network_outputs, actions_taken): 54 | actions_taken = torch.Tensor(actions_taken).to(self.device) 55 | if self.discrete: 56 | network_outputs = nn.functional.log_softmax(network_outputs).exp() 57 | return torch.distributions.Categorical(network_outputs).log_prob(actions_taken) 58 | else: 59 | return torch.distributions.Normal(network_outputs[0], network_outputs[1]).log_prob(actions_taken).sum(-1) 60 | 61 | ##################################################### 62 | ##################################################### 63 | 64 | class MLPPolicyPG(MLPPolicy): 65 | 66 | def update(self, observations, acs_na, adv_n = None, acs_labels_na = None, qvals = None): 67 | policy_output = self.policy_mlp(torch.Tensor(observations).to(self.device)) 68 | logprob_pi = self.get_log_prob(policy_output, acs_na) 69 | 70 | self.optimizer.zero_grad() 71 | 72 | loss = torch.sum((-logprob_pi * torch.Tensor(adv_n).to(self.device))) 73 | loss.backward() 74 | 75 | if self.nn_baseline: 76 | baseline_prediction = self.baseline_mlp(torch.Tensor(observations).to(self.device)).view(-1) 77 | baseline_target = torch.Tensor((qvals - qvals.mean()) / (qvals.std() + 1e-8)).to(self.device) 78 | baseline_loss = nn.functional.mse_loss(baseline_prediction, baseline_target) 79 | baseline_loss.backward() 80 | 81 | self.optimizer.step() 82 | 83 | return loss 84 | 85 | ##################################################### 86 | ##################################################### 87 | 88 | class MLPPolicyAC(MLPPolicyPG): 89 | """ MLP policy required for actor-critic. 90 | 91 | Note: Your code for this class could in fact be the same as MLPPolicyPG, except that the neural net baseline 92 | would not be required (i.e. self.nn_baseline would always be False). It is separated here only 93 | to avoid any unintended errors.
94 | """ 95 | def __init__(self, *args, **kwargs): 96 | if 'nn_baseline' in kwargs.keys(): 97 | assert kwargs['nn_baseline'] == False, "MLPPolicyAC should not use the nn_baseline flag" 98 | super().__init__(*args, **kwargs) 99 | -------------------------------------------------------------------------------- /hw5/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/policies/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285_hw5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285_hw5.pdf -------------------------------------------------------------------------------- /hw5/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | mujoco-py==1.50.1.56 3 | numpy 4 | seaborn 5 | tqdm -------------------------------------------------------------------------------- /hw5/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) --------------------------------------------------------------------------------