├── .gitattributes ├── README.md ├── hw1 ├── README.txt ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── base_agent.cpython-37.pyc │ │ │ └── bc_agent.cpython-37.pyc │ │ └── bc_agent.py │ ├── expert_data │ │ ├── expert_data_Ant-v2.pkl │ │ ├── expert_data_HalfCheetah-v2.pkl │ │ ├── expert_data_Hopper-v2.pkl │ │ ├── expert_data_Humanoid-v2.pkl │ │ └── expert_data_Walker2d-v2.pkl │ ├── infrastructure │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── logger.cpython-37.pyc │ │ │ ├── replay_buffer.cpython-37.pyc │ │ │ ├── rl_trainer.cpython-37.pyc │ │ │ ├── tf_utils.cpython-37.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── logger.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── MLP_policy.cpython-37.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── base_policy.cpython-37.pyc │ │ │ └── loaded_gaussian_policy.cpython-37.pyc │ │ ├── experts │ │ │ ├── Ant.pkl │ │ │ ├── HalfCheetah.pkl │ │ │ ├── Hopper.pkl │ │ │ ├── Humanoid.pkl │ │ │ └── Walker2d.pkl │ │ └── loaded_gaussian_policy.py │ └── scripts │ │ └── run_hw1_behavior_cloning.py ├── cs285_hw1.pdf ├── downloads │ └── mjpro150 │ │ ├── bin │ │ ├── basic │ │ ├── compile │ │ ├── derivative │ │ ├── libglew.so │ │ ├── libglewegl.so │ │ ├── libglewosmesa.so │ │ ├── libglfw.so.3 │ │ ├── libmujoco150.so │ │ ├── libmujoco150nogl.so │ │ ├── record │ │ ├── simulate │ │ └── test │ │ ├── doc │ │ ├── README.txt │ │ └── REFERENCE.txt │ │ ├── include │ │ ├── glfw3.h │ │ ├── mjdata.h │ │ ├── mjmodel.h │ │ ├── mjrender.h │ │ ├── mjvisualize.h │ │ ├── mjxmacro.h │ │ └── mujoco.h │ │ ├── model │ │ ├── humanoid.xml │ │ └── humanoid100.xml │ │ └── sample │ │ ├── basic.cpp │ │ ├── compile.cpp │ │ ├── derivative.cpp │ │ ├── makefile │ │ ├── record.cpp │ │ ├── simulate.cpp │ │ └── test.cpp ├── requirements.txt └── setup.py ├── hw2 ├── README.txt ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── pg_agent.cpython-37.pyc │ │ └── pg_agent.py │ ├── infrastructure │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── logger.cpython-37.pyc │ │ │ ├── replay_buffer.cpython-37.pyc │ │ │ ├── rl_trainer.cpython-37.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── logger.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ │ ├── MLP_policy.cpython-37.pyc │ │ │ └── __init__.cpython-37.pyc │ └── scripts │ │ └── run_hw2_policy_gradient.py ├── cs285_hw2.pdf ├── requirements.txt └── setup.py ├── hw3 ├── README.txt ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── ac_agent.cpython-37.pyc │ │ │ └── dqn_agent.cpython-37.pyc │ │ ├── ac_agent.py │ │ └── dqn_agent.py │ ├── critics │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── bootstrapped_continuous_critic.cpython-37.pyc │ │ │ └── dqn_critic.cpython-37.pyc │ │ ├── bootstrapped_continuous_critic.py │ │ └── dqn_critic.py │ ├── infrastructure │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── atari_wrappers.cpython-37.pyc │ │ │ ├── dqn_utils.cpython-37.pyc │ │ │ ├── logger.cpython-37.pyc │ │ │ ├── models.cpython-37.pyc │ │ │ ├── replay_buffer.cpython-37.pyc │ │ │ ├── rl_trainer.cpython-37.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── atari_wrappers.py │ │ ├── 
dqn_utils.py │ │ ├── logger.py │ │ ├── models.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── policies │ │ ├── MLP_policy.py │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── MLP_policy.cpython-37.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── argmax_policy.cpython-37.pyc │ │ └── argmax_policy.py │ └── scripts │ │ ├── run_hw3_actor_critic.py │ │ └── run_hw3_dqn.py ├── cs285_hw3.pdf ├── lunar_lander.py ├── requirements.txt └── setup.py ├── hw4 ├── README.txt ├── cs285 │ ├── agents │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ └── mb_agent.cpython-37.pyc │ │ └── mb_agent.py │ ├── envs │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ └── __init__.cpython-37.pyc │ │ ├── ant │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── ant.cpython-35.pyc │ │ │ │ └── ant.cpython-37.pyc │ │ │ └── ant.py │ │ ├── cheetah │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── cheetah.cpython-35.pyc │ │ │ │ └── cheetah.cpython-37.pyc │ │ │ └── cheetah.py │ │ ├── obstacles │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ │ ├── __init__.cpython-35.pyc │ │ │ │ ├── __init__.cpython-37.pyc │ │ │ │ ├── obstacles_env.cpython-35.pyc │ │ │ │ └── obstacles_env.cpython-37.pyc │ │ │ └── obstacles_env.py │ │ └── reacher │ │ │ ├── __init__.py │ │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-35.pyc │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── reacher_env.cpython-35.pyc │ │ │ └── reacher_env.cpython-37.pyc │ │ │ ├── assets │ │ │ └── sawyer.xml │ │ │ └── reacher_env.py │ ├── infrastructure │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── logger.cpython-37.pyc │ │ │ ├── replay_buffer.cpython-37.pyc │ │ │ ├── rl_trainer.cpython-37.pyc │ │ │ └── utils.cpython-37.pyc │ │ ├── logger.py │ │ ├── replay_buffer.py │ │ ├── rl_trainer.py │ │ └── utils.py │ ├── models │ │ ├── __pycache__ │ │ │ └── ff_model.cpython-37.pyc │ │ └── ff_model.py │ ├── policies │ │ ├── MPC_policy.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ │ ├── MPC_policy.cpython-37.pyc │ │ │ └── __init__.cpython-37.pyc │ └── scripts │ │ └── run_hw4_mb.py ├── cs285_hw4.pdf └── setup.py └── hw5 ├── README.txt ├── cs285 ├── agents │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── ac_agent.cpython-37.pyc │ └── ac_agent.py ├── critics │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ └── bootstrapped_continuous_critic.cpython-37.pyc │ └── bootstrapped_continuous_critic.py ├── envs │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── pointmass.cpython-37.pyc │ │ └── sparse_half_cheetah.cpython-37.pyc │ ├── pointmass.py │ └── sparse_half_cheetah.py ├── exploration │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── density_model.cpython-37.pyc │ │ └── exploration.cpython-37.pyc │ ├── density_model.py │ └── exploration.py ├── infrastructure │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── logger.cpython-37.pyc │ │ ├── replay.cpython-37.pyc │ │ ├── replay_buffer.cpython-37.pyc │ │ ├── rl_trainer.cpython-37.pyc │ │ └── utils.cpython-37.pyc │ ├── logger.py │ ├── replay_buffer.py │ ├── rl_trainer.py │ └── utils.py ├── policies │ ├── MLP_policy.py │ ├── __init__.py │ └── __pycache__ │ │ ├── MLP_policy.cpython-37.pyc │ │ └── __init__.cpython-37.pyc └── scripts │ └── train_ac_exploration_f18.py ├── cs285_hw5.pdf ├── 
requirements.txt └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | hw1/downloads/* linguist-detectable=false 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # UC Berkeley Deep RL in Pytorch 2 | 3 | Pytorch starter code for [UC Berkeley's CS285 Deep RL course](http://rail.eecs.berkeley.edu/deeprlcourse/). The code is meant to be used as a direct alternative to the [official HW repository](https://github.com/berkeleydeeprlcourse/homework_fall2019) for those who would rather complete the course assignments in pytorch. [Solutions to this starter code are available](https://github.com/mdeib/berkeley-deep-RL-pytorch-solutions). 4 | 5 | # Changes 6 | 7 | All tensorflow in the starter code was converted to pytorch and/or numpy, and all solutions are to be written in pytorch. The overall structure of the HW starter code was kept mostly the same, although with the move to pytorch it made sense to delete some files and move their contents elsewhere. The README.txt in each HW folder has been modified where necessary but the pdf has not - refer to the README.txt for any changes made in the pytorch version. Although tensorflow is not needed within the main code, the logging is still done with tensorboard, so tensorflow is still needed to easily use and view tensorboard in your browser. 8 | 9 | Please note that while this starter code has been shown to produce reasonable results when filled in correctly, there may still be small bugs/errors. 10 | -------------------------------------------------------------------------------- /hw1/README.txt: -------------------------------------------------------------------------------- 1 | 2 | 1) install package by running: 3 | 4 | $ python setup.py develop 5 | 6 | ############################################## 7 | ############################################## 8 | 9 | 2) install mujoco: 10 | $ cd ~ 11 | $ mkdir .mujoco 12 | $ cd 13 | $ cp mjkey.txt ~/.mujoco/ 14 | $ cd /downloads 15 | $ cp -r mjpro150 ~/.mujoco/ 16 | 17 | add the following to the bottom of your bashrc: 18 | export LD_LIBRARY_PATH=~/.mujoco/mjpro150/bin/ 19 | 20 | NOTE IF YOU'RE USING A MAC: 21 | The provided mjpro150 folder is for Linux. 22 | Please download the OSX version yourself, from https://www.roboti.us/index.html 23 | 24 | ############################################## 25 | ############################################## 26 | 27 | 3) install other dependencies 28 | 29 | ------------------- 30 | 31 | a) [PREFERRED] Option A: 32 | 33 | i) install anaconda, if you don't already have it: 34 | Download Anaconda2 (suggested v5.2 for linux): https://www.continuum.io/downloads 35 | $ cd Downloads 36 | $ bash Anaconda2-5.2.0-Linux-x86_64.sh #file name might be slightly different, but follows this format 37 | 38 | Note that this install will modify the PATH variable in your bashrc. 39 | You need to open a new terminal for that path change to take place (to be able to find 'conda' in the next step). 
40 | 41 | ii) create a conda env that will contain python 3: 42 | $ conda create -n cs285_env python=3.5 43 | 44 | iii) activate the environment (do this every time you open a new terminal and want to run code): 45 | $ source activate cs285_env 46 | 47 | iv) install the requirements into this conda env 48 | $ pip install --user --requirement requirements.txt 49 | 50 | v) get the appropriate version of pytorch (1.5.0+cu101 was used here, but your version will vary based on your device) and some version of tensorflow to run tensorboard 51 | 52 | vi) allow your code to see 'cs285' 53 | $ cd 54 | $ pip install -e . 55 | 56 | Note: This conda environment must be activated every time you open a new terminal (in order to run code), but the benefit is that the required dependencies for this codebase will not affect existing/other versions of things on your computer. This stand-alone environment will have everything that is necessary. 57 | 58 | ------------------- 59 | 60 | b) Option B: 61 | 62 | i) install dependencies locally, by running: 63 | $ pip install -r requirements.txt 64 | 65 | ii) get the appropriate version of pytorch (1.5.0+cu101 was used here, but your version will vary based on your device) and some version of tensorflow to run tensorboard 66 | 67 | iii) set the path to the cs285 folder in run_hw1_behavior_cloning.py 68 | 69 | ############################################## 70 | ############################################## 71 | 72 | 4) code: 73 | 74 | Blanks to be filled in are marked with "TODO" 75 | The following files have blanks in them: 76 | - scripts/run_hw1_behavior_cloning.py 77 | - infrastructure/rl_trainer.py 78 | - agents/bc_agent.py 79 | - policies/MLP_policy.py 80 | - infrastructure/replay_buffer.py 81 | - infrastructure/utils.py 82 | 83 | NOTE - tf_utils.py was deleted in the pytorch version 84 | 85 | See the code + the hw pdf for more details. 
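As a concrete example of the kind of blank you will fill in, here is a minimal sketch of the random-batch sampler hinted at in infrastructure/replay_buffer.py (one possible approach, not necessarily the official solution; the name rand_indices is just illustrative). It follows the HINT comments in that file: draw indices with np.random.permutation and apply the same indices to each of the five component arrays, mirroring sample_recent_data:

    rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size]
    return (self.obs[rand_indices], self.acs[rand_indices], self.rews[rand_indices],
            self.next_obs[rand_indices], self.terminals[rand_indices])

The other blanks follow the same pattern: read the surrounding HINT comments and mirror the neighboring functions that are already implemented.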
86 | 87 | ############################################## 88 | ############################################## 89 | 90 | 5) run code: 91 | 92 | Run the following command(s) for Section 1 (Behavior Cloning): 93 | (All identical, one for each env) 94 | 95 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Ant.pkl --env_name Ant-v2 --exp_name test_bc_ant --n_iter 1 --expert_data cs285/expert_data/expert_data_Ant-v2.pkl 96 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/HalfCheetah.pkl --env_name HalfCheetah-v2 --exp_name test_bc_halfcheetah --n_iter 1 --expert_data cs285/expert_data/expert_data_HalfCheetah-v2.pkl 97 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Hopper.pkl --env_name Hopper-v2 --exp_name test_bc_hopper --n_iter 1 --expert_data cs285/expert_data/expert_data_Hopper-v2.pkl 98 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Humanoid.pkl --env_name Humanoid-v2 --exp_name test_bc_humanoid --n_iter 1 --expert_data cs285/expert_data/expert_data_Humanoid-v2.pkl 99 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Walker2d.pkl --env_name Walker2d-v2 --exp_name test_bc_walker2d --n_iter 1 --expert_data cs285/expert_data/expert_data_Walker2d-v2.pkl 100 | 101 | Run the following command for Section 2 (DAGGER): 102 | (NOTE: the --do_dagger flag, and the higher value for n_iter) 103 | 104 | $ python cs285/scripts/run_hw1_behavior_cloning.py --expert_policy_file cs285/policies/experts/Walker2d.pkl --env_name Walker2d-v2 --exp_name test_dagger_walker --n_iter 10 --do_dagger --expert_data cs285/expert_data/expert_data_Walker2d-v2.pkl 105 | 106 | ############################################## 107 | 108 | 6) visualize saved tensorboard event file: 109 | 110 | $ cd cs285/data/ 111 | $ tensorboard --logdir . 
112 | 113 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) 114 | -------------------------------------------------------------------------------- /hw1/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw1/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/agents/__pycache__/base_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/agents/__pycache__/base_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/agents/__pycache__/bc_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/agents/__pycache__/bc_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/agents/bc_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | from cs285.policies.MLP_policy import * 5 | from cs285.infrastructure.replay_buffer import ReplayBuffer 6 | from cs285.infrastructure.utils import * 7 | 8 | class BCAgent: 9 | def __init__(self, env, agent_params): 10 | # init vars 11 | self.env = env 12 | self.agent_params = agent_params 13 | 14 | # actor/policy 15 | self.actor = MLPPolicySL(self.agent_params['ac_dim'], 16 | self.agent_params['ob_dim'], 17 | self.agent_params['n_layers'], 18 | self.agent_params['size'], 19 | self.agent_params['device'], 20 | discrete = self.agent_params['discrete'], 21 | learning_rate = self.agent_params['learning_rate'], 22 | ) ## TODO: look in here and implement this 23 | 24 | # replay buffer 25 | self.replay_buffer = ReplayBuffer(self.agent_params['max_replay_buffer_size']) 26 | 27 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 28 | # training a BC agent refers to updating its actor using 29 | # the given observations and corresponding action labels 30 | self.actor.update(ob_no, ac_na) ## TODO: look in here and implement this 31 | 32 | def add_to_replay_buffer(self, paths): 33 | self.replay_buffer.add_rollouts(paths) 34 | 35 | def sample(self, batch_size): 36 | return self.replay_buffer.sample_random_data(batch_size) ## TODO: look in here and implement this 37 | -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Ant-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_Ant-v2.pkl -------------------------------------------------------------------------------- 
/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_HalfCheetah-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Hopper-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_Hopper-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_Humanoid-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/expert_data/expert_data_Walker2d-v2.pkl -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/tf_utils.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/tf_utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | import numpy as np 5 | 6 | class Logger: 7 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 8 | self._log_dir = log_dir 9 | print('########################') 10 | print('logging outputs to ', log_dir) 11 | print('########################') 12 | self._n_logged_samples = n_logged_samples 13 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 14 | 15 | def log_scalar(self, scalar, name, step_): 16 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 17 | 18 | def log_scalars(self, scalar_dict, group_name, step, phase): 19 | """Will log all scalars in the same plot.""" 20 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 21 | 22 | def log_image(self, image, name, step): 23 | assert(len(image.shape) == 3) # [C, H, W] 24 | self._summ_writer.add_image('{}'.format(name), image, step) 25 | 26 | def log_video(self, video_frames, name, step, fps=10): 27 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 28 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 29 | 30 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 31 | 32 | # reshape the rollouts 33 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 34 | 35 | # max rollout length 36 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 37 | max_length = videos[0].shape[0] 38 | for i in range(max_videos_to_save): 39 | if videos[i].shape[0]>max_length: 40 | max_length = videos[i].shape[0] 41 | 42 | # pad rollouts to all be same length 43 | for i in range(max_videos_to_save): 44 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
56 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 57 | 58 | def log_figure(self, figure, name, step, phase): 59 | """figure: matplotlib.pyplot figure handle""" 60 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 61 | 62 | def log_graph(self, array, name, step, phase): 63 | """figure: matplotlib.pyplot figure handle""" 64 | im = plot_graph(array) 65 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 66 | 67 | def dump_scalars(self, log_path=None): 68 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 69 | self._summ_writer.export_scalars_to_json(log_path) 70 | 71 | def flush(self): 72 | self._summ_writer.flush() 73 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import time 2 | import numpy as np 3 | import gym 4 | import os 5 | 6 | from cs285.infrastructure.utils import * 7 | 8 | class ReplayBuffer(object): 9 | 10 | def __init__(self, max_size=1000000): 11 | 12 | self.max_size = max_size 13 | 14 | # store each rollout 15 | self.paths = [] 16 | 17 | # store (concatenated) component arrays from each rollout 18 | self.obs = None 19 | self.acs = None 20 | self.rews = None 21 | self.next_obs = None 22 | self.terminals = None 23 | 24 | def __len__(self): 25 | if self.obs is not None: 26 | return self.obs.shape[0] 27 | else: 28 | return 0 29 | 30 | def add_rollouts(self, paths, concat_rew=True): 31 | 32 | # add new rollouts into our list of rollouts 33 | for path in paths: 34 | self.paths.append(path) 35 | 36 | # convert new rollouts into their component arrays, and append them onto our arrays 37 | observations, actions, rewards, next_observations, terminals = convert_listofrollouts(paths, concat_rew) 38 | 39 | if self.obs is None: 40 | self.obs = observations[-self.max_size:] 41 | self.acs = actions[-self.max_size:] 42 | self.rews = rewards[-self.max_size:] 43 | self.next_obs = next_observations[-self.max_size:] 44 | self.terminals = terminals[-self.max_size:] 45 | else: 46 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 47 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 48 | if concat_rew: 49 | self.rews = np.concatenate([self.rews, rewards])[-self.max_size:] 50 | else: 51 | if isinstance(rewards, list): 52 | self.rews += rewards 53 | else: 54 | self.rews.append(rewards) 55 | self.rews = self.rews[-self.max_size:] 56 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 57 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 58 | 59 | ######################################## 60 | ######################################## 61 | 62 | def sample_random_data(self, batch_size): 63 | assert self.obs.shape[0] == self.acs.shape[0] == self.rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 64 | 65 | ## TODO return batch_size number of random entries from each of the 5 component arrays above 66 | ## HINT 1: use np.random.permutation to sample random indices 67 | ## HINT 2: return corresponding data points from each array (i.e., not different indices from each array) 68 | ## HINT 3: look at the sample_recent_data function below 69 | return TODO, TODO, TODO, TODO, TODO 70 | 71 | def sample_recent_data(self, batch_size=1): 72 | return self.obs[-batch_size:], self.acs[-batch_size:], self.rews[-batch_size:], 
self.next_obs[-batch_size:], self.terminals[-batch_size:] 73 | -------------------------------------------------------------------------------- /hw1/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import scipy 4 | 5 | ############################################ 6 | ############################################ 7 | 8 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=("rgb_array")): 9 | 10 | #next two lines is a fix for the error: "GLEW initalization error: Missing GL version" 11 | #ignore if you do not recieve this error 12 | #if render: 13 | # env.render(mode = "human") 14 | 15 | # initialize env for the beginning of a new rollout 16 | ob = TODO # HINT: should be the output of resetting the env 17 | 18 | # init vars 19 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 20 | steps = 0 21 | while True: 22 | 23 | # render image of the simulated env 24 | if render: 25 | if 'rgb_array' in render_mode: 26 | if hasattr(env, 'sim'): 27 | image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) 28 | else: 29 | image_obs.append(env.render(mode=render_mode)) 30 | if 'human' in render_mode: 31 | env.render(mode=render_mode) 32 | time.sleep(env.model.opt.timestep) 33 | 34 | # use the most recent ob to decide what to do 35 | obs.append(ob) 36 | ac = TODO # HINT: query the policy's get_action function 37 | ac = ac[0] 38 | acs.append(ac) 39 | 40 | # take that action and record results 41 | ob, rew, done, _ = env.step(ac) 42 | 43 | # record result of taking that action 44 | steps += 1 45 | next_obs.append(ob) 46 | rewards.append(rew) 47 | 48 | # TODO end the rollout if the rollout ended 49 | # HINT: rollout can end due to done, or due to max_path_length 50 | rollout_done = TODO # HINT: this is either 0 or 1 51 | terminals.append(rollout_done) 52 | 53 | if rollout_done: 54 | break 55 | 56 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 57 | 58 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 59 | """ 60 | Collect rollouts until we have collected min_timesteps_per_batch steps. 61 | TODO implement this function 62 | Hint1: use sample_trajectory to get each path (i.e. rollout) that goes into paths 63 | Hint2: use get_pathlength to count the timesteps collected in each path 64 | """ 65 | timesteps_this_batch = 0 66 | paths = [] 67 | while timesteps_this_batch < min_timesteps_per_batch: 68 | 69 | TODO 70 | 71 | return paths, timesteps_this_batch 72 | 73 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 74 | """ 75 | Collect ntraj rollouts. 76 | TODO implement this function 77 | Hint1: use sample_trajectory to get each path (i.e. 
rollout) that goes into paths 78 | """ 79 | paths = [] 80 | 81 | TODO 82 | 83 | return paths 84 | 85 | ############################################ 86 | ############################################ 87 | 88 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 89 | """ 90 | Take info (separate arrays) from a single rollout 91 | and return it in a single dictionary 92 | """ 93 | if image_obs != []: 94 | image_obs = np.stack(image_obs, axis=0) 95 | return {"observation" : np.array(obs, dtype=np.float32), 96 | "image_obs" : np.array(image_obs, dtype=np.uint8), 97 | "reward" : np.array(rewards, dtype=np.float32), 98 | "action" : np.array(acs, dtype=np.float32), 99 | "next_observation": np.array(next_obs, dtype=np.float32), 100 | "terminal": np.array(terminals, dtype=np.float32)} 101 | 102 | 103 | def convert_listofrollouts(paths, concat_rew=True): 104 | """ 105 | Take a list of rollout dictionaries 106 | and return separate arrays, 107 | where each array is a concatenation of that array from across the rollouts 108 | """ 109 | observations = np.concatenate([path["observation"] for path in paths]) 110 | actions = np.concatenate([path["action"] for path in paths]) 111 | if concat_rew: 112 | rewards = np.concatenate([path["reward"] for path in paths]) 113 | else: 114 | rewards = [path["reward"] for path in paths] 115 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 116 | terminals = np.concatenate([path["terminal"] for path in paths]) 117 | return observations, actions, rewards, next_observations, terminals 118 | 119 | ############################################ 120 | ############################################ 121 | 122 | def get_pathlength(path): 123 | return len(path["reward"]) 124 | -------------------------------------------------------------------------------- /hw1/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | class MLPPolicy(nn.Module): 6 | 7 | def __init__(self, 8 | ac_dim, 9 | ob_dim, 10 | n_layers, 11 | size, 12 | device, 13 | lr = 1e-4, 14 | training=True, 15 | discrete=False, # unused for now 16 | nn_baseline=False, # unused for now 17 | **kwargs): 18 | super().__init__() 19 | 20 | # init vars 21 | self.training = training 22 | self.device = device 23 | 24 | # network architecture 25 | #TODO -build the network architecture 26 | #HINT -build an nn.Modulelist() using the passed in parameters 27 | 28 | #loss and optimizer 29 | if self.training: 30 | # TODO define the loss that will be used to train this policy 31 | self.loss_func = TODO 32 | self.optimizer = torch.optim.Adam(self.parameters(), lr) 33 | 34 | self.to(device) 35 | 36 | ################################## 37 | 38 | def forward(self, x): 39 | for layer in self.mlp: 40 | x = layer(x) 41 | return x 42 | 43 | ################################## 44 | 45 | def save(self, filepath): 46 | torch.save(self.state_dict(), filepath) 47 | 48 | def restore(self, filepath): 49 | self.load_state_dict(torch.load(filepath)) 50 | 51 | ################################## 52 | 53 | # query this policy with observation(s) to get selected action(s) 54 | def get_action(self, obs): 55 | if len(obs.shape)>1: 56 | observation = obs 57 | else: 58 | observation = obs[None] 59 | 60 | # TODO return the action that the policy prescribes 61 | return TODO 62 | 63 | # update/train this policy 64 | def update(self, observations, actions): 65 | raise NotImplementedError 66 | 67 | 
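# --------------------------------------------------------------------
# Illustrative sketch (not part of the original starter code): one
# possible way to fill in the architecture TODO in __init__ above,
# assuming a plain fully connected network with Tanh activations. The
# activation choice and output head are design decisions left to you,
# so the official solution may differ:
#
#     self.mlp = nn.ModuleList()
#     in_dim = ob_dim
#     for _ in range(n_layers):
#         self.mlp.append(nn.Linear(in_dim, size))
#         self.mlp.append(nn.Tanh())
#         in_dim = size
#     self.mlp.append(nn.Linear(in_dim, ac_dim))
#
# With this layout the forward() loop above works unchanged, and a
# regression loss such as nn.MSELoss() is a natural candidate for
# self.loss_func in the supervised (behavior cloning) case.
# --------------------------------------------------------------------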
##################################################### 68 | ##################################################### 69 | 70 | class MLPPolicySL(MLPPolicy): 71 | 72 | """ 73 | This class is a special case of MLPPolicy, 74 | which is trained using supervised learning. 75 | The relevant functions to define are included below. 76 | """ 77 | 78 | def update(self, observations, actions): 79 | assert self.training, 'Policy must be created with training = true in order to perform training updates...' 80 | 81 | # TODO define network update 82 | #HINT - you need to calculate the prediction loss and then use optimizer.step() 83 | -------------------------------------------------------------------------------- /hw1/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw1/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/policies/__pycache__/base_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/__pycache__/base_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/policies/__pycache__/loaded_gaussian_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/__pycache__/loaded_gaussian_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Ant.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/Ant.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/HalfCheetah.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/HalfCheetah.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Hopper.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/Hopper.pkl 
-------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Humanoid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/Humanoid.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/experts/Walker2d.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285/policies/experts/Walker2d.pkl -------------------------------------------------------------------------------- /hw1/cs285/policies/loaded_gaussian_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import pickle 5 | 6 | class Loaded_Gaussian_Policy(nn.Module): 7 | def __init__(self, filename, **kwargs): 8 | super().__init__() 9 | with open(filename, 'rb') as f: 10 | data = pickle.loads(f.read()) 11 | 12 | self.nonlin_type = data['nonlin_type'] 13 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 14 | 15 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 16 | self.policy_params = data[policy_type] 17 | 18 | assert set(self.policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 19 | 20 | self.obsnorm_mean = self.policy_params['obsnorm']['Standardizer']['mean_1_D'] 21 | self.obsnorm_meansq = self.policy_params['obsnorm']['Standardizer']['meansq_1_D'] 22 | layer_params = self.policy_params['hidden']['FeedforwardNet'] 23 | 24 | self.mlp = nn.ModuleList() 25 | for layer_name in sorted(layer_params.keys()): 26 | W = layer_params[layer_name]['AffineLayer']['W'].astype(np.float32) 27 | b = layer_params[layer_name]['AffineLayer']['b'].astype(np.float32) 28 | r, h = W.shape 29 | 30 | layer = nn.Linear(r,h) 31 | layer.weight.data.copy_(torch.from_numpy(W.transpose())) 32 | layer.bias.data.copy_(torch.from_numpy(b.squeeze(0))) 33 | self.mlp.append(layer) 34 | 35 | if self.nonlin_type == 'lrelu': 36 | self.mlp.append(nn.LeakyReLU()) 37 | elif self.nonlin_type == 'tanh': 38 | self.mlp.append(nn.Tanh()) 39 | else: 40 | raise NotImplementedError(self.nonlin_type) 41 | 42 | #output layer 43 | W = self.policy_params['out']['AffineLayer']['W'].astype(np.float32) 44 | b = self.policy_params['out']['AffineLayer']['b'].astype(np.float32) 45 | r, h = W.shape 46 | layer = nn.Linear(r, h) 47 | layer.weight.data.copy_(torch.from_numpy(W.transpose())) 48 | layer.bias.data.copy_(torch.from_numpy(b.squeeze(0))) 49 | self.mlp.append(layer) 50 | 51 | ################################## 52 | 53 | def obs_norm(self, obs_bo, obsnorm_mean, obsnorm_meansq): 54 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 55 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) 56 | return torch.FloatTensor(normedobs_bo).squeeze(0) 57 | 58 | ################################## 59 | 60 | def forward(self, obs): 61 | x = self.obs_norm(obs, self.obsnorm_mean, self.obsnorm_meansq) 62 | for layer in self.mlp: 63 | x = layer(x) 64 | return x 65 | 66 | ################################## 67 | 68 | def update(self, obs_no, acs_na, adv_n=None, acs_labels_na=None): 69 | print("\n\nThis policy class simply loads in a particular type of policy and 
queries it.") 70 | print("Not training procedure has been written, so do not try to train it.\n\n") 71 | raise NotImplementedError 72 | 73 | def get_action(self, obs): 74 | if len(obs.shape) > 1: 75 | observation = obs 76 | else: 77 | observation = obs[None, :] 78 | return self(obs) 79 | -------------------------------------------------------------------------------- /hw1/cs285/scripts/run_hw1_behavior_cloning.py: -------------------------------------------------------------------------------- 1 | #Uncomment next two lines and replace the path if not using anaconda 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import torch 6 | import os 7 | import time 8 | import numpy as np 9 | 10 | from cs285.infrastructure.rl_trainer import RL_Trainer 11 | from cs285.agents.bc_agent import BCAgent 12 | from cs285.policies.loaded_gaussian_policy import Loaded_Gaussian_Policy 13 | 14 | class BC_Trainer(object): 15 | def __init__(self, params): 16 | 17 | ####################### 18 | ## AGENT PARAMS 19 | ####################### 20 | 21 | agent_params = { 22 | 'n_layers': params['n_layers'], 23 | 'size': params['size'], 24 | 'learning_rate': params['learning_rate'], 25 | 'max_replay_buffer_size': params['max_replay_buffer_size'], 26 | } 27 | 28 | self.params = params 29 | self.params['agent_class'] = BCAgent ## TODO: look in here and implement this 30 | self.params['agent_params'] = agent_params 31 | 32 | ################ 33 | ## RL TRAINER 34 | ################ 35 | 36 | self.rl_trainer = RL_Trainer(self.params) ## TODO: look in here and implement this 37 | 38 | ####################### 39 | ## LOAD EXPERT POLICY 40 | ####################### 41 | 42 | print('Loading expert policy from...', self.params['expert_policy_file']) 43 | self.loaded_expert_policy = Loaded_Gaussian_Policy(self.params['expert_policy_file']) 44 | print('Done restoring expert policy...') 45 | 46 | def run_training_loop(self): 47 | 48 | self.rl_trainer.run_training_loop( 49 | n_iter=self.params['n_iter'], 50 | initial_expertdata=self.params['expert_data'], 51 | collect_policy=self.rl_trainer.agent.actor, 52 | eval_policy=self.rl_trainer.agent.actor, 53 | relabel_with_expert=self.params['do_dagger'], 54 | expert_policy=self.loaded_expert_policy, 55 | ) 56 | 57 | 58 | def main(): 59 | import argparse 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--expert_policy_file', '-epf', type=str, required=True) # relative to where you're running this script from 62 | parser.add_argument('--expert_data', '-ed', type=str, required=True) #relative to where you're running this script from 63 | parser.add_argument('--env_name', '-env', type=str, help='choices: Ant-v2, Humanoid-v2, Walker-v2, HalfCheetah-v2, Hopper-v2', required=True) 64 | parser.add_argument('--exp_name', '-exp', type=str, default='pick an experiment name', required=True) 65 | parser.add_argument('--do_dagger', action='store_true') 66 | parser.add_argument('--ep_len', type=int) 67 | 68 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=10000) # number of gradient steps for training policy (per iter in n_iter) 69 | parser.add_argument('--n_iter', '-n', type=int, default=1) 70 | 71 | parser.add_argument('--batch_size', type=int, default=1000) # training data collected (in the env) during each iteration 72 | parser.add_argument('--eval_batch_size', type=int, 73 | default=10000) # eval data collected (in the env) for logging metrics 74 | parser.add_argument('--train_batch_size', type=int, 75 | default=100) # number of sampled data points to be 
used per gradient/train step 76 | 77 | parser.add_argument('--n_layers', type=int, default=2) # depth, of policy to be learned 78 | parser.add_argument('--size', type=int, default=64) # width of each layer, of policy to be learned 79 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) # LR for supervised learning 80 | 81 | parser.add_argument('--video_log_freq', type=int, default=5) 82 | parser.add_argument('--scalar_log_freq', type=int, default=1) 83 | parser.add_argument('--use_gpu', action='store_true', default=True) 84 | parser.add_argument('--which_gpu', type=int, default=0) 85 | parser.add_argument('--max_replay_buffer_size', type=int, default=1000000) 86 | parser.add_argument('--seed', type=int, default=1) 87 | args = parser.parse_args() 88 | 89 | # convert args to dictionary 90 | params = vars(args) 91 | 92 | if torch.cuda.is_available() and params["use_gpu"]: 93 | which_gpu = "cuda:" + str(params["which_gpu"]) 94 | params["device"] = torch.device(which_gpu) 95 | print("Pytorch is running on GPU", params["which_gpu"]) 96 | else: 97 | params["device"] = torch.device("cpu") 98 | print("Pytorch is running on the CPU") 99 | 100 | ################################## 101 | ### CREATE DIRECTORY FOR LOGGING 102 | ################################## 103 | 104 | logdir_prefix = 'bc_' 105 | if args.do_dagger: 106 | logdir_prefix = 'dagger_' 107 | assert args.n_iter>1, ('DAGGER needs more than 1 iteration (n_iter>1) of training, to iteratively query the expert and train (after 1st warmstarting from behavior cloning).') 108 | else: 109 | assert args.n_iter==1, ('Vanilla behavior cloning collects expert data just once (n_iter=1)') 110 | 111 | ## directory for logging 112 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 113 | if not (os.path.exists(data_path)): 114 | os.makedirs(data_path) 115 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 116 | logdir = os.path.join(data_path, logdir) 117 | params['logdir'] = logdir 118 | if not(os.path.exists(logdir)): 119 | os.makedirs(logdir) 120 | 121 | ################### 122 | ### RUN TRAINING 123 | ################### 124 | 125 | trainer = BC_Trainer(params) 126 | trainer.run_training_loop() 127 | 128 | if __name__ == "__main__": 129 | main() 130 | -------------------------------------------------------------------------------- /hw1/cs285_hw1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/cs285_hw1.pdf -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/basic: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/basic -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/compile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/compile -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/derivative: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/derivative -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libglew.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libglew.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libglewegl.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libglewegl.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libglewosmesa.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libglewosmesa.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libglfw.so.3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libglfw.so.3 -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libmujoco150.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libmujoco150.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/libmujoco150nogl.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/libmujoco150nogl.so -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/record: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/record -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/simulate: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/simulate -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/bin/test: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw1/downloads/mjpro150/bin/test -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/doc/README.txt: 
-------------------------------------------------------------------------------- 1 | Welcome to MuJoCo Pro version 1.50. 2 | 3 | The full documentation is available at http://www.mujoco.org/book 4 | The most relevant chapters are Overview, MJCF Models, and MuJoCo Pro. 5 | 6 | Here we provide brief notes to get you started: 7 | 8 | 9 | The activation key (which you should have received with your license) is a 10 | plain-text file whose path must be passed to the mj_activate() function. 11 | The code samples assume that it is called mjkey.txt in the bin directory. 12 | 13 | Once you have mjkey.txt in the bin directory, run: 14 | simulate ../model/humanoid.xml (or ./simulate on Linux and OSX) 15 | to see MuJoCo Pro in action. 16 | 17 | On Linux, you can use LD_LIBRARY_PATH to point the dynamic linker to the 18 | .so files, or copy them to a directory that is already in the linker path. 19 | On OSX, the MuJoCo Pro dynamic library is compiled with @executable_path/ 20 | to avoid the need for installation in a predefined directory. 21 | 22 | In general, the directory structure we have provided is merely a suggestion; 23 | feel free to re-organize it if needed. MuJoCo Pro does not have an installer 24 | and does not write any files outside the executable directory. 25 | 26 | The makefile in the sample directory generates binaries in the bin directory. 27 | These binaries are pre-compiled and included in the software distribution. 28 | 29 | While the software distribution contains only one model (humanoid.xml), 30 | additional models are available at http://www.mujoco.org/forum under Resources. 31 | -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/include/mjrender.h: -------------------------------------------------------------------------------- 1 | //---------------------------------// 2 | // This file is part of MuJoCo // 3 | // Written by Emo Todorov // 4 | // Copyright (C) 2017 Roboti LLC // 5 | //---------------------------------// 6 | 7 | 8 | #pragma once 9 | 10 | 11 | typedef enum _mjtGridPos // grid position for overlay 12 | { 13 | mjGRID_TOPLEFT = 0, // top left 14 | mjGRID_TOPRIGHT, // top right 15 | mjGRID_BOTTOMLEFT, // bottom left 16 | mjGRID_BOTTOMRIGHT // bottom right 17 | } mjtGridPos; 18 | 19 | 20 | typedef enum _mjtFramebuffer // OpenGL framebuffer option 21 | { 22 | mjFB_WINDOW = 0, // default/window buffer 23 | mjFB_OFFSCREEN // offscreen buffer 24 | } mjtFramebuffer; 25 | 26 | 27 | typedef enum _mjtFontScale // font scale, used at context creation 28 | { 29 | mjFONTSCALE_100 = 100, // normal scale, suitable in the absence of DPI scaling 30 | mjFONTSCALE_150 = 150, // 150% scale 31 | mjFONTSCALE_200 = 200 // 200% scale 32 | } mjtFontScale; 33 | 34 | 35 | typedef enum _mjtFont // font type, used at each text operation 36 | { 37 | mjFONT_NORMAL = 0, // normal font 38 | mjFONT_SHADOW, // normal font with shadow (for higher contrast) 39 | mjFONT_BIG // big font (for user alerts) 40 | } mjtFont; 41 | 42 | 43 | struct _mjrRect // OpenGL rectangle 44 | { 45 | int left; // left (usually 0) 46 | int bottom; // bottom (usually 0) 47 | int width; // width (usually buffer width) 48 | int height; // height (usually buffer height) 49 | }; 50 | typedef struct _mjrRect mjrRect; 51 | 52 | 53 | struct _mjrContext // custom OpenGL context 54 | { 55 | // parameters copied from mjVisual 56 | float lineWidth; // line width for wireframe rendering 57 | float shadowClip; // clipping radius for directional lights 58 | float shadowScale; // 
fraction of light cutoff for spot lights 59 | int shadowSize; // size of shadow map texture 60 | int offWidth; // width of offscreen buffer 61 | int offHeight; // height of offscreen buffer 62 | int offSamples; // number of offscreen buffer multisamples 63 | 64 | // offscreen rendering objects 65 | unsigned int offFBO; // offscreen framebuffer object 66 | unsigned int offFBO_r; // offscreen framebuffer for resolving multisamples 67 | unsigned int offColor; // offscreen color buffer 68 | unsigned int offColor_r; // offscreen color buffer for resolving multisamples 69 | unsigned int offDepthStencil; // offscreen depth and stencil buffer 70 | unsigned int offDepthStencil_r; // offscreen depth and stencil buffer for resolving multisamples 71 | 72 | // shadow rendering objects 73 | unsigned int shadowFBO; // shadow map framebuffer object 74 | unsigned int shadowTex; // shadow map texture 75 | 76 | // texture objects and info 77 | int ntexture; // number of allocated textures 78 | int textureType[100]; // type of texture (mjtTexture) 79 | unsigned int texture[100]; // texture names 80 | 81 | // displaylist starting positions 82 | unsigned int basePlane; // all planes from model 83 | unsigned int baseMesh; // all meshes from model 84 | unsigned int baseHField; // all hfields from model 85 | unsigned int baseBuiltin; // all buildin geoms, with quality from model 86 | unsigned int baseFontNormal; // normal font 87 | unsigned int baseFontShadow; // shadow font 88 | unsigned int baseFontBig; // big font 89 | 90 | // displaylist ranges 91 | int rangePlane; // all planes from model 92 | int rangeMesh; // all meshes from model 93 | int rangeHField; // all hfields from model 94 | int rangeBuiltin; // all builtin geoms, with quality from model 95 | int rangeFont; // all characters in font 96 | 97 | // character info 98 | int charWidth[127]; // character widths: normal and shadow 99 | int charWidthBig[127]; // chacarter widths: big 100 | int charHeight; // character heights: normal and shadow 101 | int charHeightBig; // character heights: big 102 | 103 | // capabilities 104 | int glewInitialized; // is glew initialized 105 | int windowAvailable; // is default/window framebuffer available 106 | int windowSamples; // number of samples for default/window framebuffer 107 | int windowStereo; // is stereo available for default/window framebuffer 108 | int windowDoublebuffer; // is default/window framebuffer double buffered 109 | 110 | // only field that changes after mjr_makeContext 111 | int currentBuffer; // currently active framebuffer: mjFB_WINDOW or mjFB_OFFSCREEN 112 | }; 113 | typedef struct _mjrContext mjrContext; 114 | 115 | -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/sample/compile.cpp: -------------------------------------------------------------------------------- 1 | //---------------------------------// 2 | // This file is part of MuJoCo // 3 | // Written by Emo Todorov // 4 | // Copyright (C) 2017 Roboti LLC // 5 | //---------------------------------// 6 | 7 | 8 | #include "mujoco.h" 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | // help 16 | const char helpstring[] = 17 | "\n Usage: compile infile outfile\n" 18 | " infile can be in mjcf, urdf, mjb format\n" 19 | " outfile can be in mjcf, mjb, txt format\n\n" 20 | " Example: compile model.xml model.mjb\n"; 21 | 22 | 23 | // deallocate and print message 24 | int finish(const char* msg = 0, mjModel* m = 0) 25 | { 26 | // deallocated everything 27 | if( m ) 28 | 
mj_deleteModel(m); 29 | mj_deactivate(); 30 | 31 | // print message 32 | if( msg ) 33 | printf("%s\n", msg); 34 | 35 | return 0; 36 | } 37 | 38 | 39 | // possible file types 40 | enum 41 | { 42 | typeUNKNOWN = 0, 43 | typeXML, 44 | typeMJB, 45 | typeTXT 46 | }; 47 | 48 | 49 | // determine file type 50 | int filetype(const char* filename) 51 | { 52 | // convert to lower case for string comparison 53 | char lower[1000]; 54 | size_t i=0; 55 | while( i=0 && lower[dot]!='.' ) 65 | dot--; 66 | 67 | // no dot found 68 | if( dot<0 ) 69 | return typeUNKNOWN; 70 | 71 | // check extension 72 | if( !strcmp(lower+dot, ".xml") || !strcmp(lower+dot, ".urdf") ) 73 | return typeXML; 74 | else if( !strcmp(lower+dot, ".mjb") ) 75 | return typeMJB; 76 | else if( !strcmp(lower+dot, ".txt") ) 77 | return typeTXT; 78 | else 79 | return typeUNKNOWN; 80 | } 81 | 82 | 83 | 84 | // main function 85 | int main(int argc, const char** argv) 86 | { 87 | // model and error 88 | mjModel* m = 0; 89 | char error[1000]; 90 | 91 | // print help if arguments are missing 92 | if( argc!=3 ) 93 | return finish(helpstring); 94 | 95 | // activate MuJoCo Pro license (this must be *your* activation key) 96 | mj_activate("mjkey.txt"); 97 | 98 | // determine file types 99 | int type1 = filetype(argv[1]); 100 | int type2 = filetype(argv[2]); 101 | 102 | // check types 103 | if( type1==typeUNKNOWN || type1==typeTXT || 104 | type2==typeUNKNOWN || (type1==typeMJB && type2==typeXML) ) 105 | return finish("Illegal combination of file formats"); 106 | 107 | // make sure output file does not exist 108 | FILE* fp = fopen(argv[2], "r"); 109 | if( fp ) 110 | { 111 | fclose(fp); 112 | return finish("Output file already exists"); 113 | } 114 | 115 | // load model 116 | if( type1==typeXML ) 117 | m = mj_loadXML(argv[1], 0, error, 1000); 118 | else 119 | m = mj_loadModel(argv[1], 0); 120 | 121 | // check error 122 | if( !m ) 123 | { 124 | if( type1==typeXML ) 125 | return finish(error, 0); 126 | else 127 | return finish("Could not load model", 0); 128 | } 129 | 130 | // save model 131 | if( type2==typeXML ) 132 | { 133 | if( mj_saveLastXML(argv[2], m, error, 1000) ) 134 | return finish(error, m); 135 | } 136 | else if( type2==typeMJB ) 137 | mj_saveModel(m, argv[2], 0, 0); 138 | else 139 | mj_printModel(m, argv[2]); 140 | 141 | // finalize 142 | return finish("Done", m); 143 | } 144 | -------------------------------------------------------------------------------- /hw1/downloads/mjpro150/sample/makefile: -------------------------------------------------------------------------------- 1 | COMMON=-O2 -I../include -L../bin -std=c++11 -mavx 2 | 3 | default: 4 | g++ $(COMMON) test.cpp -lmujoco150nogl -o ../bin/test 5 | g++ $(COMMON) compile.cpp -lmujoco150nogl -o ../bin/compile 6 | g++ $(COMMON) derivative.cpp -lmujoco150nogl -fopenmp -o ../bin/derivative 7 | g++ $(COMMON) simulate.cpp -lmujoco150 -lGL -lglew ../bin/libglfw.so.3 -o ../bin/simulate 8 | g++ $(COMMON) record.cpp -lmujoco150 -lGL -lglew ../bin/libglfw.so.3 -o ../bin/record 9 | g++ $(COMMON) basic.cpp -lmujoco150 -lGL -lglew ../bin/libglfw.so.3 -o ../bin/basic 10 | 11 | egl: 12 | g++ $(COMMON) -DMJ_EGL record.cpp -lmujoco150 -lOpenGL -lEGL -lglewegl -o ../bin/recordegl 13 | 14 | osmesa: 15 | g++ $(COMMON) -DMJ_OSMESA record.cpp -lmujoco150 -lOSMesa -lglewosmesa -o ../bin/recordosmesa 16 | 17 | all: default egl osmesa 18 | -------------------------------------------------------------------------------- /hw1/requirements.txt: 
-------------------------------------------------------------------------------- 1 | gym==0.10.11 2 | mujoco-py==1.50.1.35 3 | matplotlib==2.2.2 4 | ipython==6.4.0 5 | moviepy==1.0.0 -------------------------------------------------------------------------------- /hw1/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw2/README.txt: -------------------------------------------------------------------------------- 1 | 2 | 1) See hw1 if you'd like to see installation instructions. You do NOT have to redo them. 3 | 4 | 5 | ############################################## 6 | ############################################## 7 | 8 | 9 | 2) Code: 10 | 11 | ------------------------------------------- 12 | 13 | Files to look at, even though there are no explicit 'TODO' markings: 14 | - scripts/run_hw2_policy_gradient.py 15 | 16 | ------------------------------------------- 17 | 18 | Relevant Code from the first HW has already been filled in in the following files: 19 | - infrastructure/rl_trainer.py 20 | - infrastructure/utils.py 21 | - policies/MLP_policy.py 22 | 23 | ------------------------------------------- 24 | 25 | Blanks to be filled in now (for this assignment) are marked with 'TODO' 26 | 27 | The following files have these: 28 | - agents/pg_agent.py 29 | - policies/MLP_policy.py 30 | 31 | 32 | ############################################## 33 | ############################################## 34 | 35 | 36 | 3) Run code with the following command: 37 | 38 | $ python cs285/scripts/run_hw2_policy_gradient.py --env_name CartPole-v1 --exp_name test_pg_cartpole 39 | $ python cs285/scripts/run_hw2_policy_gradient.py --env_name InvertedPendulum-v2 --exp_name test_pg_pendulum 40 | 41 | Flags of relevance, when running the commands above (see pdf for more info): 42 | -n number of policy training iterations 43 | -rtg use reward_to_go for the value 44 | -dsa do not standardize the advantage values 45 | 46 | ############################################## 47 | 48 | 49 | 4) Visualize saved tensorboard event file: 50 | 51 | $ cd cs285/data/ 52 | $ tensorboard --logdir . 
53 | 54 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) 55 | 56 | -------------------------------------------------------------------------------- /hw2/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw2/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/agents/__pycache__/pg_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/agents/__pycache__/pg_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/logger.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | import numpy as np 5 | 6 | class Logger: 7 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 8 | self._log_dir = log_dir 9 | print('########################') 10 | print('logging outputs to ', log_dir) 11 | print('########################') 12 | self._n_logged_samples = n_logged_samples 13 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 14 | 15 | def log_scalar(self, scalar, name, step_): 16 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 17 | 18 | def log_scalars(self, scalar_dict, group_name, step, phase): 19 | """Will log all scalars in the same plot.""" 20 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 21 | 22 | def log_image(self, image, name, step): 23 | assert(len(image.shape) == 3) # [C, H, W] 24 | self._summ_writer.add_image('{}'.format(name), image, step) 25 | 26 | def log_video(self, video_frames, name, step, fps=10): 27 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 28 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 29 | 30 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 31 | 32 | # reshape the rollouts 33 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 34 | 35 | # max rollout length 36 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 37 | max_length = videos[0].shape[0] 38 | for i in range(max_videos_to_save): 39 | if videos[i].shape[0]>max_length: 40 | max_length = videos[i].shape[0] 41 | 42 | # pad rollouts to all be same length 43 | for i in range(max_videos_to_save): 44 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
55 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 56 | 57 | def log_figure(self, figure, name, step, phase): 58 | """figure: matplotlib.pyplot figure handle""" 59 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 60 | 61 | def log_graph(self, array, name, step, phase): 62 | """figure: matplotlib.pyplot figure handle""" 63 | im = plot_graph(array) 64 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 65 | 66 | def dump_scalars(self, log_path=None): 67 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 68 | self._summ_writer.export_scalars_to_json(log_path) 69 | 70 | def flush(self): 71 | self._summ_writer.flush() 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure.utils import * 4 | 5 | class ReplayBuffer(object): 6 | 7 | def __init__(self, max_size=1000000): 8 | 9 | self.max_size = max_size 10 | self.paths = [] 11 | self.obs = None 12 | self.acs = None 13 | self.concatenated_rews = None 14 | self.unconcatenated_rews = None 15 | self.next_obs = None 16 | self.terminals = None 17 | 18 | def add_rollouts(self, paths): 19 | 20 | # add new rollouts into our list of rollouts 21 | for path in paths: 22 | self.paths.append(path) 23 | 24 | # convert new rollouts into their component arrays, and append them onto our arrays 25 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 26 | 27 | if self.obs is None: 28 | self.obs = observations[-self.max_size:] 29 | self.acs = actions[-self.max_size:] 30 | self.next_obs = next_observations[-self.max_size:] 31 | self.terminals = terminals[-self.max_size:] 32 | self.concatenated_rews = concatenated_rews[-self.max_size:] 33 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 34 | else: 35 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 36 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 37 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 38 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 39 | self.concatenated_rews = np.concatenate([self.concatenated_rews, concatenated_rews])[-self.max_size:] 40 | if isinstance(unconcatenated_rews, list): 41 | self.unconcatenated_rews += unconcatenated_rews 42 | else: 43 | self.unconcatenated_rews.append(unconcatenated_rews) 44 | 45 | ######################################## 46 | ######################################## 47 | 48 | def sample_random_rollouts(self, num_rollouts): 49 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 50 | return self.paths[rand_indices] 51 | 52 | def sample_recent_rollouts(self, num_rollouts=1): 53 | return self.paths[-num_rollouts:] 54 | 55 | ######################################## 56 | ######################################## 57 | 58 | def sample_random_data(self, batch_size): 59 | 60 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 61 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 62 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], 
self.terminals[rand_indices] 63 | 64 | def sample_recent_data(self, batch_size=1, concat_rew=True): 65 | 66 | if concat_rew: 67 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 68 | else: 69 | num_recent_rollouts_to_return = 0 70 | num_datapoints_so_far = 0 71 | index = -1 72 | while num_datapoints_so_far < batch_size: 73 | recent_rollout = self.paths[index] 74 | index -=1 75 | num_recent_rollouts_to_return +=1 76 | num_datapoints_so_far += get_pathlength(recent_rollout) 77 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 78 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 79 | return observations, actions, unconcatenated_rews, next_observations, terminals -------------------------------------------------------------------------------- /hw2/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | ############################################ 5 | ############################################ 6 | 7 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 8 | 9 | if render: 10 | env.render(mode = "human") 11 | 12 | # initialize env for the beginning of a new rollout 13 | ob = env.reset() # TODO: GETTHIS from HW1 14 | 15 | # init vars 16 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 17 | steps = 0 18 | while True: 19 | 20 | # render image of the simulated env 21 | if render: 22 | if 'rgb_array' in render_mode: 23 | if hasattr(env, 'sim'): 24 | if 'track' in env.env.model.camera_names: 25 | image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) 26 | else: 27 | image_obs.append(env.sim.render(height=500, width=500)[::-1]) 28 | else: 29 | image_obs.append(env.render(mode=render_mode)) 30 | if 'human' in render_mode: 31 | env.render(mode=render_mode) 32 | time.sleep(env.model.opt.timestep) 33 | 34 | # use the most recent ob to decide what to do 35 | obs.append(ob) 36 | ac = policy.get_action(ob) 37 | acs.append(ac) 38 | 39 | # take that action and record results 40 | ob, rew, done, _ = env.step(ac) 41 | 42 | # record result of taking that action 43 | steps += 1 44 | next_obs.append(ob) 45 | rewards.append(rew) 46 | 47 | # End the rollout if the rollout ended 48 | # Note that the rollout can end due to done, or due to max_path_length 49 | rollout_done = done or steps >= max_path_length 50 | terminals.append(rollout_done) 51 | 52 | if rollout_done: 53 | break 54 | 55 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 56 | 57 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 58 | 59 | timesteps_left = min_timesteps_per_batch 60 | timesteps_this_batch = 0 61 | paths = [] 62 | 63 | while timesteps_this_batch < min_timesteps_per_batch: 64 | paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) 65 | timesteps_this_batch += get_pathlength(paths[-1]) 66 | 67 | return paths, timesteps_this_batch 68 | 69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 70 | paths = [] 71 | for n in range(ntraj): 72 | paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) 73 | 74 | return paths 75 | 76 | 
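# Editor's note: a minimal, commented-out usage sketch of the sampling helpers
# above. 'CartPole-v1' and RandomPolicy are illustrative assumptions, not part
# of the assignment; any object exposing get_action(ob) can serve as the policy.
#
#     import gym
#     env = gym.make('CartPole-v1')
#
#     class RandomPolicy:
#         def get_action(self, ob):
#             return env.action_space.sample()
#
#     paths, n_steps = sample_trajectories(
#         env, RandomPolicy(), min_timesteps_per_batch=1000, max_path_length=200)
#     print(len(paths), 'rollouts,', n_steps, 'env steps in this batch')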
############################################ 77 | ############################################ 78 | 79 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 80 | """ 81 | Take info (separate arrays) from a single rollout 82 | and return it in a single dictionary 83 | """ 84 | if image_obs != []: 85 | image_obs = np.stack(image_obs, axis=0) 86 | return {"observation" : np.array(obs, dtype=np.float32), 87 | "image_obs" : np.array(image_obs, dtype=np.uint8), 88 | "reward" : np.array(rewards, dtype=np.float32), 89 | "action" : np.array(acs, dtype=np.float32), 90 | "next_observation": np.array(next_obs, dtype=np.float32), 91 | "terminal": np.array(terminals, dtype=np.float32)} 92 | 93 | 94 | def convert_listofrollouts(paths): 95 | """ 96 | Take a list of rollout dictionaries 97 | and return separate arrays, 98 | where each array is a concatenation of that array from across the rollouts 99 | """ 100 | observations = np.concatenate([path["observation"] for path in paths]) 101 | actions = np.concatenate([path["action"] for path in paths]) 102 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 103 | terminals = np.concatenate([path["terminal"] for path in paths]) 104 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 105 | unconcatenated_rewards = [path["reward"] for path in paths] 106 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 107 | 108 | ############################################ 109 | ############################################ 110 | 111 | def get_pathlength(path): 112 | return len(path["reward"]) 113 | -------------------------------------------------------------------------------- /hw2/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | class MLP(nn.Module): 6 | def __init__(self, 7 | ac_dim, 8 | ob_dim, 9 | n_layers, 10 | size, 11 | device, 12 | discrete, 13 | activation = nn.Tanh()): 14 | super().__init__() 15 | 16 | self.discrete = discrete 17 | 18 | #TODO -build the network architecture -can be taken from HW1 19 | #HINT -build an nn.Modulelist() using the passed in parameters 20 | 21 | #if continuous define logstd variable 22 | if not self.discrete: 23 | self.logstd = nn.Parameter(torch.zeros(ac_dim)) 24 | 25 | self.to(device) 26 | 27 | def forward(self, x): 28 | for layer in self.mlp: 29 | x = layer(x) 30 | if self.discrete: 31 | return x 32 | else: 33 | return (x, self.logstd.exp()) 34 | 35 | class MLPPolicy: 36 | def __init__(self, 37 | ac_dim, 38 | ob_dim, 39 | n_layers, 40 | size, 41 | device, 42 | learning_rate, 43 | training=True, 44 | discrete=False, 45 | nn_baseline=False, 46 | **kwargs): 47 | super().__init__() 48 | 49 | # init vars 50 | self.device = device 51 | self.discrete = discrete 52 | self.training = training 53 | self.nn_baseline = nn_baseline 54 | 55 | # network architecture 56 | self.policy_mlp = MLP(ac_dim, ob_dim, n_layers, size, device, discrete) 57 | params = list(self.policy_mlp.parameters()) 58 | if self.nn_baseline: 59 | self.baseline_mlp = MLP(1, ob_dim, n_layers, size, device, True) 60 | params += list(self.baseline_mlp.parameters()) 61 | 62 | #optimizer 63 | if self.training: 64 | self.optimizer = torch.optim.Adam(params, lr = learning_rate) 65 | 66 | ################################## 67 | 68 | # update/train this policy 69 | def update(self, observations, actions): 70 | raise NotImplementedError 
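    # Editor's note on the TODO in MLP.__init__ above: the hinted nn.ModuleList
    # construction mirrors the forward() loop already given (which iterates over
    # self.mlp) and matches the MLP provided later in hw3's infrastructure/models.py:
    #
    #     self.mlp = nn.ModuleList()
    #     self.mlp.append(nn.Linear(ob_dim, size))      # first hidden layer
    #     self.mlp.append(activation)
    #     for _ in range(n_layers - 1):                 # additional hidden layers
    #         self.mlp.append(nn.Linear(size, size))
    #         self.mlp.append(activation)
    #     self.mlp.append(nn.Linear(size, ac_dim))      # output layer, no activation
    #
    # This is one way to satisfy the hint, not the only valid architecture.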
71 | 72 | # query the neural net that's our 'policy' function, as defined by the policy_mlp above 73 | # query the policy with observation(s) to get selected action(s) 74 | def get_action(self, obs): 75 | raise NotImplementedError 76 | #implement similar to HW1 77 | 78 | def get_log_prob(self, network_outputs, actions_taken): 79 | actions_taken = torch.Tensor(actions_taken).to(self.device) 80 | if self.discrete: 81 | #log probability under a categorical distribution 82 | network_outputs = nn.functional.log_softmax(network_outputs).exp() 83 | return torch.distributions.Categorical(network_outputs).log_prob(actions_taken) 84 | else: 85 | #log probability under a multivariate gaussian 86 | return torch.distributions.Normal(network_outputs[0], network_outputs[1]).log_prob(actions_taken).sum(-1) 87 | 88 | ##################################################### 89 | ##################################################### 90 | 91 | class MLPPolicyPG(MLPPolicy): 92 | 93 | def update(self, observations, acs_na, adv_n = None, acs_labels_na = None, qvals = None): 94 | policy_output = self.policy_mlp(torch.Tensor(observations).to(self.device)) 95 | logprob_pi = self.get_log_prob(policy_output, acs_na) 96 | 97 | #TODO Don't forget to zero out the gradient 98 | 99 | # TODO: define the loss that should be optimized when training a policy with policy gradient 100 | # HINT1: Recall that the expression that we want to MAXIMIZE 101 | # is the expectation over collected trajectories of: 102 | # sum_{t=0}^{T-1} [grad [log pi(a_t|s_t) * (Q_t - b_t)]] 103 | # HINT2: look at logprob_pi above 104 | # HINT3: don't forget that we need to MINIMIZE this self.loss 105 | # but the equation above is something that should be maximized 106 | #HINT4: Don't forget to propagate the loss backward 107 | 108 | if self.nn_baseline: 109 | baseline_prediction = self.baseline_mlp(torch.Tensor(observations).to(self.device)).view(-1) 110 | baseline_target = torch.Tensor((qvals - qvals.mean()) / (qvals.std() + 1e-8)).to(self.device) 111 | 112 | # TODO: define the loss that should be optimized for training the baseline 113 | # HINT1: use nn.functional.mse_loss, similar to SL loss from hw1 114 | # HINT2: we want predictions (baseline_prediction) to be as close as possible to the labels (baseline_target) 115 | # HINT3: Don't forget to propagate the loss backward 116 | 117 | #step the optimizer 118 | return loss 119 | 120 | ##################################################### 121 | ##################################################### 122 | -------------------------------------------------------------------------------- /hw2/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw2/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw2/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285/policies/__pycache__/__init__.cpython-37.pyc 
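Editor's note: the TODO hints in MLPPolicyPG.update (hw2/cs285/policies/MLP_policy.py above) describe maximizing sum_t [log pi(a_t|s_t) * (Q_t - b_t)], i.e. minimizing its negative. A minimal sketch of that update step, reusing the surrounding starter code's names (logprob_pi, adv_n, baseline_prediction, baseline_target) and assuming adv_n is a numpy array aligned with logprob_pi:

    self.optimizer.zero_grad()
    advantages = torch.Tensor(adv_n).to(self.device)
    loss = -(logprob_pi * advantages).sum()    # negate: the optimizer minimizes
    loss.backward()

    if self.nn_baseline:
        baseline_loss = nn.functional.mse_loss(baseline_prediction, baseline_target)
        baseline_loss.backward()

    self.optimizer.step()
    return loss

This is one possible completion, not the official solution; summing versus averaging over timesteps only rescales the gradient and is absorbed by the learning rate.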
-------------------------------------------------------------------------------- /hw2/cs285/scripts/run_hw2_policy_gradient.py: -------------------------------------------------------------------------------- 1 | #Uncomment next two lines and replace the path if not using anaconda 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import torch 6 | import os 7 | import time 8 | 9 | from cs285.infrastructure.rl_trainer import RL_Trainer 10 | from cs285.agents.pg_agent import PGAgent 11 | 12 | class PG_Trainer(object): 13 | 14 | def __init__(self, params): 15 | 16 | ##################### 17 | ## SET AGENT PARAMS 18 | ##################### 19 | 20 | computation_graph_args = { 21 | 'n_layers': params['n_layers'], 22 | 'size': params['size'], 23 | 'learning_rate': params['learning_rate'], 24 | 'device': params['device'], 25 | } 26 | 27 | estimate_advantage_args = { 28 | 'gamma': params['discount'], 29 | 'standardize_advantages': not(params['dont_standardize_advantages']), 30 | 'reward_to_go': params['reward_to_go'], 31 | 'nn_baseline': params['nn_baseline'], 32 | } 33 | 34 | train_args = { 35 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 36 | } 37 | 38 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} 39 | 40 | self.params = params 41 | self.params['agent_class'] = PGAgent 42 | self.params['agent_params'] = agent_params 43 | self.params['batch_size_initial'] = self.params['batch_size'] 44 | 45 | ################ 46 | ## RL TRAINER 47 | ################ 48 | 49 | self.rl_trainer = RL_Trainer(self.params) 50 | 51 | def run_training_loop(self): 52 | 53 | self.rl_trainer.run_training_loop( 54 | self.params['n_iter'], 55 | collect_policy = self.rl_trainer.agent.actor, 56 | eval_policy = self.rl_trainer.agent.actor, 57 | ) 58 | 59 | 60 | def main(): 61 | 62 | import argparse 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--env_name', type=str) 65 | parser.add_argument('--exp_name', type=str, default='todo') 66 | parser.add_argument('--n_iter', '-n', type=int, default=200) 67 | 68 | parser.add_argument('--reward_to_go', '-rtg', action='store_true') 69 | parser.add_argument('--nn_baseline', action='store_true') 70 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true') 71 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration 72 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 73 | 74 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 75 | parser.add_argument('--discount', type=float, default=1) 76 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) 77 | parser.add_argument('--n_layers', '-l', type=int, default=2) 78 | parser.add_argument('--size', '-s', type=int, default=64) 79 | 80 | parser.add_argument('--ep_len', type=int) #students shouldn't change this away from env's default 81 | parser.add_argument('--seed', type=int, default=1) 82 | parser.add_argument('--use_gpu', '-gpu', default = True) 83 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 84 | parser.add_argument('--video_log_freq', type=int, default=-1) # video log disabled 85 | parser.add_argument('--scalar_log_freq', type=int, default=1) 86 | 87 | parser.add_argument('--save_params', action='store_true') 88 | 89 | args = parser.parse_args() 90 | 91 | # convert to dictionary 92 | params = vars(args) 93 | 94 | if torch.cuda.is_available() and params["use_gpu"]: 95 
| which_gpu = "cuda:" + str(params["which_gpu"]) 96 | params["device"] = torch.device(which_gpu) 97 | print("Pytorch is running on GPU", params["which_gpu"]) 98 | else: 99 | params["device"] = torch.device("cpu") 100 | print("Pytorch is running on the CPU") 101 | 102 | # for this assignment, we train on everything we recently collected 103 | # so making train_batch_size=batch_size 104 | params['train_batch_size']=params['batch_size'] 105 | 106 | ################################## 107 | ### CREATE DIRECTORY FOR LOGGING 108 | ################################## 109 | 110 | logdir_prefix = 'pg_' 111 | 112 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 113 | 114 | if not (os.path.exists(data_path)): 115 | os.makedirs(data_path) 116 | 117 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 118 | logdir = os.path.join(data_path, logdir) 119 | params['logdir'] = logdir 120 | if not(os.path.exists(logdir)): 121 | os.makedirs(logdir) 122 | 123 | ################### 124 | ### RUN TRAINING 125 | ################### 126 | 127 | trainer = PG_Trainer(params) 128 | trainer.run_training_loop() 129 | 130 | 131 | if __name__ == "__main__": 132 | main() 133 | -------------------------------------------------------------------------------- /hw2/cs285_hw2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw2/cs285_hw2.pdf -------------------------------------------------------------------------------- /hw2/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.11 2 | mujoco-py==1.50.1.35 3 | matplotlib==2.2.2 4 | ipython==6.4.0 5 | moviepy==1.0.0 -------------------------------------------------------------------------------- /hw2/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw3/README.txt: -------------------------------------------------------------------------------- 1 | 2 | 1) See hw1 if you'd like to see installation instructions. You do NOT have to redo them. But, you need to install OpenCV for this assignment: 3 | `pip install opencv-python==3.4.0.12` 4 | 5 | You also need to replace `/gym/envs/box2d/lunar_lander.py` with the provided `lunar_lander.py` file. 
To find the file: 6 | $ locate lunar_lander.py 7 | (or if there are multiple options there): 8 | $ source activate cs285_env 9 | $ ipython 10 | $ import gym 11 | $ gym.__file__ 12 | /gym/__init__.py 13 | ############################################## 14 | ############################################## 15 | 16 | 17 | 2) Code: 18 | 19 | ------------------------------------------- 20 | 21 | Files to look at, even though there are no explicit 'TODO' markings: 22 | - scripts/run_hw3_dqn.py 23 | - scripts/run_hw3_actor_critic.py 24 | - infrastructure/models.py 25 | - policies/dqn_utils.py 26 | - policies/MLP_policy.py 27 | 28 | ------------------------------------------- 29 | 30 | Blanks to be filled in now (for this assignment) are marked with 'TODO' 31 | 32 | The following files have these: 33 | - critics/dqn_critic.py 34 | - agents/dqn_agent.py 35 | - policies/argmax_policy.py 36 | - critics/bootstrapped_continuous_critic.py 37 | - agents/ac_agent.py 38 | 39 | ############################################## 40 | ############################################## 41 | 42 | 43 | 3) Run code with the following command: 44 | 45 | $ python cs285/scripts/run_hw3_dqn.py --env_name PongNoFrameskip-v4 --exp_name test_pong 46 | $ python cs285/scripts/run_hw3_actor_critic.py --env_name CartPole-v0 -n 100 -b 1000 --exp_name 100_1 -ntu 100 -ngsptu 1 47 | 48 | Flags of relevance, when running the commands above (see pdf for more info): 49 | -double_q Whether to use double Q learning or not. 50 | 51 | ############################################## 52 | 53 | 54 | 4) Visualize saved tensorboard event file: 55 | 56 | $ cd cs285/data/ 57 | $ tensorboard --logdir . 58 | 59 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) 60 | -------------------------------------------------------------------------------- /hw3/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw3/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/agents/__pycache__/ac_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/agents/__pycache__/ac_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/agents/__pycache__/dqn_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/agents/__pycache__/dqn_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/agents/ac_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from collections import OrderedDict 4 | 5 | from cs285.policies.MLP_policy import MLPPolicyAC 6 | from 
cs285.critics.bootstrapped_continuous_critic import BootstrappedContinuousCritic 7 | from cs285.infrastructure.replay_buffer import ReplayBuffer 8 | from cs285.infrastructure.utils import * 9 | 10 | class ACAgent: 11 | def __init__(self, env, agent_params): 12 | super(ACAgent, self).__init__() 13 | 14 | self.env = env 15 | self.agent_params = agent_params 16 | self.num_critic_updates_per_agent_update = agent_params['num_critic_updates_per_agent_update'] 17 | self.num_actor_updates_per_agent_update = agent_params['num_actor_updates_per_agent_update']3 18 | self.device = agent_params['device'] 19 | 20 | self.gamma = self.agent_params['gamma'] 21 | self.standardize_advantages = self.agent_params['standardize_advantages'] 22 | 23 | self.actor = MLPPolicyAC(self.agent_params['ac_dim'], 24 | self.agent_params['ob_dim'], 25 | self.agent_params['n_layers'], 26 | self.agent_params['size'], 27 | self.agent_params['device'], 28 | discrete=self.agent_params['discrete'], 29 | learning_rate=self.agent_params['learning_rate'], 30 | ) 31 | self.critic = BootstrappedContinuousCritic(self.agent_params) 32 | 33 | self.replay_buffer = ReplayBuffer() 34 | 35 | def estimate_advantage(self, ob_no, next_ob_no, re_n, terminal_n): 36 | ob, next_ob, rew, done = map(lambda x: torch.from_numpy(x).to(self.device), [ob_no, next_ob_no, re_n, terminal_n]) 37 | 38 | # TODO Implement the following pseudocode: 39 | # 1) query the critic with ob_no, to get V(s) 40 | # 2) query the critic with next_ob_no, to get V(s') 41 | # 3) estimate the Q value as Q(s, a) = r(s, a) + gamma*V(s') 42 | # HINT: Remember to cut off the V(s') term (ie set it to 0) at terminal states (ie terminal_n=1) 43 | # 4) calculate advantage (adv_n) as A(s, a) = Q(s, a) - V(s) 44 | 45 | adv_n = TODO 46 | 47 | if self.standardize_advantages: 48 | adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8) 49 | return adv_n 50 | 51 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 52 | 53 | # TODO Implement the following pseudocode: 54 | # for agent_params['num_critic_updates_per_agent_update'] steps, 55 | # update the critic 56 | 57 | # advantage = estimate_advantage(...) 
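        # Editor's note on estimate_advantage above (a hedged sketch, not the official
        # solution); it assumes the critic's value network is reachable as
        # self.critic.value_func and returns a [N, 1] tensor:
        #
        #     v_s  = self.critic.value_func(ob).squeeze()
        #     v_sp = self.critic.value_func(next_ob).squeeze()
        #     q_n  = rew + self.gamma * v_sp * (1 - done)
        #     adv_n = (q_n - v_s).detach().cpu().numpy()
        #
        # Multiplying V(s') by (1 - done) implements the hint about cutting it off
        # at terminal states.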
58 | 59 | # for agent_params['num_actor_updates_per_agent_update'] steps, 60 | # update the actor 61 | 62 | TODO 63 | 64 | loss = OrderedDict() 65 | loss['Critic_Loss'] = TODO # put final critic loss here 66 | loss['Actor_Loss'] = TODO # put final actor loss here 67 | return loss 68 | 69 | def add_to_replay_buffer(self, paths): 70 | self.replay_buffer.add_rollouts(paths) 71 | 72 | def sample(self, batch_size): 73 | return self.replay_buffer.sample_recent_data(batch_size) 74 | -------------------------------------------------------------------------------- /hw3/cs285/agents/dqn_agent.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | from cs285.infrastructure.dqn_utils import MemoryOptimizedReplayBuffer, PiecewiseSchedule 5 | from cs285.policies.argmax_policy import ArgMaxPolicy 6 | from cs285.critics.dqn_critic import DQNCritic 7 | 8 | 9 | class DQNAgent(object): 10 | def __init__(self, env, agent_params): 11 | 12 | print(agent_params['optimizer_spec']) 13 | 14 | self.env = env 15 | self.agent_params = agent_params 16 | self.batch_size = agent_params['batch_size'] 17 | self.device = agent_params['device'] 18 | self.last_obs = self.env.reset() 19 | 20 | self.num_actions = agent_params['ac_dim'] 21 | self.learning_starts = agent_params['learning_starts'] 22 | self.learning_freq = agent_params['learning_freq'] 23 | self.target_update_freq = agent_params['target_update_freq'] 24 | 25 | self.replay_buffer_idx = None 26 | self.exploration = agent_params['exploration_schedule'] 27 | self.optimizer_spec = agent_params['optimizer_spec'] 28 | 29 | self.critic = DQNCritic(agent_params, self.optimizer_spec) 30 | self.actor = ArgMaxPolicy(self.critic, self.device) 31 | 32 | lander = agent_params['env_name'] == 'LunarLander-v2' 33 | self.replay_buffer = MemoryOptimizedReplayBuffer(agent_params['replay_buffer_size'], agent_params['frame_history_len'], lander=lander) 34 | self.t = 0 35 | self.num_param_updates = 0 36 | 37 | def add_to_replay_buffer(self, paths): 38 | pass 39 | 40 | def step_env(self): 41 | 42 | """ 43 | Step the env and store the transition 44 | 45 | At the end of this block of code, the simulator should have been 46 | advanced one step, and the replay buffer should contain one more transition. 47 | 48 | Note that self.last_obs must always point to the new latest observation. 49 | """ 50 | 51 | # TODO store the latest observation into the replay buffer 52 | # HINT: see replay buffer's function store_frame 53 | self.replay_buffer_idx = TODO 54 | 55 | eps = self.exploration.value(self.t) 56 | # TODO use epsilon greedy exploration when selecting action 57 | # HINT: take random action 58 | # with probability eps (see np.random.random()) 59 | # OR if your current step number (see self.t) is less that self.learning_starts 60 | perform_random_action = TODO 61 | 62 | if perform_random_action: 63 | action = TODO 64 | else: 65 | # TODO query the policy to select action 66 | # HINT: you cannot use "self.last_obs" directly as input 67 | # into your network, since it needs to be processed to include context 68 | # from previous frames. 69 | # Check out the replay buffer, which has a function called 70 | # encode_recent_observation that will take the latest observation 71 | # that you pushed into the buffer and compute the corresponding 72 | # input that should be given to a Q network by appending some 73 | # previous frames. 
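            # Editor's note (a hedged sketch, not the official solution): the hint
            # above usually resolves to the replay-buffer call it names, e.g.
            #     enc_last_obs = self.replay_buffer.encode_recent_observation()
            # after which the tensorized observation below is passed to the argmax
            # policy (self.actor) to pick the greedy action. For the epsilon-greedy
            # branch earlier in this method, acting randomly when
            # np.random.random() < eps or self.t < self.learning_starts, via
            # self.env.action_space.sample(), is the standard reading of the hints.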
74 | enc_last_obs = 75 | enc_last_obs = torch.tensor(enc_last_obs[None, :]).to(self.device) 76 | 77 | # TODO query the policy with enc_last_obs to select action 78 | action = TODO 79 | 80 | # TODO take a step in the environment using the action from the policy 81 | # HINT1: remember that self.last_obs must always point to the newest/latest observation 82 | # HINT2: remember the following useful function that you've seen before: 83 | #obs, reward, done, info = env.step(action) 84 | TODO 85 | 86 | # TODO store the result of taking this action into the replay buffer 87 | # HINT1: see replay buffer's store_effect function 88 | # HINT2: one of the arguments you'll need to pass in is self.replay_buffer_idx from above 89 | TODO 90 | 91 | # TODO if taking this step resulted in done, reset the env (and the latest observation), otherwise set last obs to obs 92 | TODO 93 | 94 | def sample(self, batch_size): 95 | if self.replay_buffer.can_sample(self.batch_size): 96 | return self.replay_buffer.sample(batch_size) 97 | else: 98 | return [],[],[],[],[] 99 | 100 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 101 | 102 | """ 103 | Here, you should train the DQN agent. 104 | This consists of training the critic, as well as periodically updating the target network. 105 | """ 106 | loss = 0 107 | if (self.t > self.learning_starts and \ 108 | self.t % self.learning_freq == 0 and \ 109 | self.replay_buffer.can_sample(self.batch_size)): 110 | 111 | # TODO populate the parameters and implement critic.update() 112 | loss = self.critic.update(TODO, TODO, TODO, TODO, TODO) 113 | 114 | # TODO: load newest parameters into the target network 115 | if self.num_param_updates % self.target_update_freq == 0: 116 | TODO 117 | 118 | self.num_param_updates += 1 119 | 120 | self.t += 1 121 | return loss 122 | -------------------------------------------------------------------------------- /hw3/cs285/critics/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw3/cs285/critics/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/critics/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/critics/__pycache__/bootstrapped_continuous_critic.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/critics/__pycache__/bootstrapped_continuous_critic.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/critics/__pycache__/dqn_critic.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/critics/__pycache__/dqn_critic.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/critics/bootstrapped_continuous_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from 
cs285.infrastructure.models import MLP 4 | 5 | class BootstrappedContinuousCritic: 6 | def __init__(self, hparams): 7 | self.ob_dim = hparams['ob_dim'] 8 | self.ac_dim = hparams['ac_dim'] 9 | self.discrete = hparams['discrete'] 10 | self.size = hparams['size'] 11 | self.n_layers = hparams['n_layers'] 12 | self.device = hparams['device'] 13 | self.learning_rate = hparams['learning_rate'] 14 | self.num_target_updates = hparams['num_target_updates'] 15 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] 16 | self.gamma = hparams['gamma'] 17 | 18 | self.value_func = MLP(1, self.ob_dim, self.n_layers, self.size, self.device, self.discrete) 19 | # TODO: use the Adam optimizer to optimize the loss with self.learning_rate 20 | self.optimizer = TODO 21 | 22 | def update(self, ob_no, next_ob_no, re_n, terminal_n): 23 | """ 24 | Update the parameters of the critic. 25 | 26 | let sum_of_path_lengths be the sum of the lengths of the sampled paths 27 | let num_paths be the number of sampled paths 28 | 29 | arguments: 30 | ob_no: shape: (sum_of_path_lengths, ob_dim) 31 | next_ob_no: shape: (sum_of_path_lengths, ob_dim). The observation after taking one step forward 32 | re_n: length: sum_of_path_lengths. Each element in re_n is a scalar containing 33 | the reward for each timestep 34 | terminal_n: length: sum_of_path_lengths. Each element in terminal_n is either 1 if the episode ended 35 | at that timestep of 0 if the episode did not end 36 | 37 | returns: 38 | loss 39 | """ 40 | 41 | # TODO: Implement the pseudocode below: 42 | 43 | # do the following (self.num_grad_steps_per_target_update * self.num_target_updates) times: 44 | # every self.num_grad_steps_per_target_update steps (which includes the first step), 45 | # recompute the target values by 46 | #a) calculating V(s') by querying this critic network (ie calling 'forward') with next_ob_no 47 | #b) and computing the target values as r(s, a) + gamma * V(s') 48 | # HINT: don't forget to use terminal_n to cut off the V(s') (ie set it to 0) when a terminal state is reached 49 | # every time, 50 | # update this critic using the observations and targets 51 | # HINT: use nn.MSE() 52 | 53 | TODO 54 | 55 | return loss 56 | -------------------------------------------------------------------------------- /hw3/cs285/critics/dqn_critic.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.models import * 2 | import torch 3 | from torch import nn 4 | 5 | class DQNCritic: 6 | def __init__(self, hparams, optimizer_spec, **kwargs): 7 | super().__init__(**kwargs) 8 | self.env_name = hparams['env_name'] 9 | self.device = hparams['device'] 10 | self.ob_dim = hparams['ob_dim'] 11 | 12 | if isinstance(self.ob_dim, int): 13 | self.input_shape = self.ob_dim 14 | else: 15 | self.input_shape = hparams['input_shape'] 16 | 17 | self.ac_dim = hparams['ac_dim'] 18 | self.double_q = hparams['double_q'] 19 | self.grad_norm_clipping = hparams['grad_norm_clipping'] 20 | self.gamma = hparams['gamma'] 21 | 22 | self.optimizer_spec = optimizer_spec 23 | 24 | if self.env_name == 'LunarLander-v2': 25 | self.Q_func = LL_DQN(self.ac_dim, self.input_shape, self.device) 26 | self.target_Q_func = LL_DQN(self.ac_dim, self.input_shape, self.device) 27 | 28 | elif self.env_name == 'PongNoFrameskip-v4': 29 | self.Q_func = atari_DQN(self.ac_dim, self.input_shape, self.device) 30 | self.target_Q_func = atari_DQN(self.ac_dim, self.input_shape, self.device) 31 | 32 | else: raise NotImplementedError 33 | 34 
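        # Editor's note on critics/bootstrapped_continuous_critic.py above (a hedged
        # sketch of its TODOs, not the official solution). The optimizer hint maps to
        #     self.optimizer = torch.optim.Adam(self.value_func.parameters(), lr=self.learning_rate)
        # and the update() pseudocode, assuming self.value_func(ob) yields a [N, 1]
        # value tensor and ob/next_ob/rew/done are torch tensors, maps to roughly:
        #
        #     for i in range(self.num_grad_steps_per_target_update * self.num_target_updates):
        #         if i % self.num_grad_steps_per_target_update == 0:
        #             with torch.no_grad():
        #                 target = rew + self.gamma * self.value_func(next_ob).squeeze() * (1 - done)
        #         self.optimizer.zero_grad()
        #         loss = nn.MSELoss()(self.value_func(ob).squeeze(), target)
        #         loss.backward()
        #         self.optimizer.step()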
| self.optimizer = self.optimizer_spec.constructor(self.Q_func.parameters(), lr = 1, **self.optimizer_spec.kwargs) 35 | self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(self.optimizer, self.optimizer_spec.lr_schedule) 36 | 37 | def get_loss(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 38 | ob, ac, rew, next_ob, done = map(lambda x: torch.from_numpy(x).to(self.device), [ob_no, ac_na, re_n, next_ob_no, terminal_n]) 39 | 40 | with torch.no_grad(): 41 | if self.double_q: 42 | # You must fill this part for Q2 of the Q-learning potion of the homework. 43 | # In double Q-learning, the best action is selected using the Q-network that 44 | # is being updated, but the Q-value for this action is obtained from the 45 | # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for more details. 46 | max_ac = TODO 47 | else: 48 | max_ac = TODO 49 | 50 | curr_Q = self.Q_func(ob).gather(-1, ac.long().view(-1, 1)).squeeze() 51 | # TODO calculate the optimal Qs for next_ob using max_ac 52 | # HINT1: similar to how it is done above 53 | best_next_Q = TODO 54 | # TODO calculate the targets for the Bellman error 55 | # HINT1: as you saw in lecture, this would be: 56 | #currentReward + self.gamma * best_next_Q * (1 - self.done_mask_ph) 57 | calc_Q = TODO 58 | 59 | return nn.functional.smooth_l1_loss(curr_Q, calc_Q) #Huber Loss 60 | 61 | 62 | def update(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 63 | self.optimizer.zero_grad() 64 | 65 | loss = self.get_loss(ob_no, ac_na, re_n, next_ob_no, terminal_n) 66 | loss.backward() 67 | 68 | nn.utils.clip_grad_norm_(self.Q_func.parameters(), max_norm = self.grad_norm_clipping) #perform grad clipping 69 | self.optimizer.step() #take step with optimizer 70 | self.lr_scheduler.step() #move forward learning rate 71 | 72 | return loss 73 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/atari_wrappers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/atari_wrappers.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/dqn_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/dqn_utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- 
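Editor's note: the double-Q branch described in DQNCritic.get_loss (hw3/cs285/critics/dqn_critic.py above) can be sketched as follows, reusing the tensors already built there (next_ob, rew, done). It is one possible reading of the hints, not the official solution: the online network picks the argmax action and the target network evaluates it, whereas vanilla DQN takes both from the target network.

    if self.double_q:
        max_ac = self.Q_func(next_ob).argmax(dim=-1)
    else:
        max_ac = self.target_Q_func(next_ob).argmax(dim=-1)
    best_next_Q = self.target_Q_func(next_ob).gather(-1, max_ac.view(-1, 1)).squeeze()
    calc_Q = rew + self.gamma * best_next_Q * (1 - done)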
https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/models.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/models.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.tensorboard import SummaryWriter 4 | import numpy as np 5 | 6 | class Logger: 7 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 8 | self._log_dir = log_dir 9 | print('########################') 10 | print('logging outputs to ', log_dir) 11 | print('########################') 12 | self._n_logged_samples = n_logged_samples 13 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 14 | 15 | def log_scalar(self, scalar, name, step_): 16 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 17 | 18 | def log_scalars(self, scalar_dict, group_name, step, phase): 19 | """Will log all scalars in the same plot.""" 20 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 21 | 22 | def log_image(self, image, name, step): 23 | assert(len(image.shape) == 3) # [C, H, W] 24 | self._summ_writer.add_image('{}'.format(name), image, step) 25 | 26 | def log_video(self, video_frames, name, step, fps=10): 27 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 
28 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 29 | 30 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 31 | 32 | # reshape the rollouts 33 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 34 | 35 | # max rollout length 36 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 37 | max_length = videos[0].shape[0] 38 | for i in range(max_videos_to_save): 39 | if videos[i].shape[0]>max_length: 40 | max_length = videos[i].shape[0] 41 | 42 | # pad rollouts to all be same length 43 | for i in range(max_videos_to_save): 44 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 55 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 56 | 57 | def log_figure(self, figure, name, step, phase): 58 | """figure: matplotlib.pyplot figure handle""" 59 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 60 | 61 | def log_graph(self, array, name, step, phase): 62 | """figure: matplotlib.pyplot figure handle""" 63 | im = plot_graph(array) 64 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 65 | 66 | def dump_scalars(self, log_path=None): 67 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 68 | self._summ_writer.export_scalars_to_json(log_path) 69 | 70 | def flush(self): 71 | self._summ_writer.flush() 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | class MLP(nn.Module): 5 | def __init__(self, 6 | ac_dim, 7 | ob_dim, 8 | n_layers, 9 | size, 10 | device, 11 | discrete, 12 | activation = nn.Tanh()): 13 | super().__init__() 14 | 15 | self.discrete = discrete 16 | 17 | # network architecture 18 | self.mlp = nn.ModuleList() 19 | self.mlp.append(nn.Linear(ob_dim, size)) #first hidden layer 20 | self.mlp.append(activation) 21 | 22 | for h in range(n_layers - 1): #additional hidden layers 23 | self.mlp.append(nn.Linear(size, size)) 24 | self.mlp.append(activation) 25 | 26 | self.mlp.append(nn.Linear(size, ac_dim)) #output layer, no activation function 27 | 28 | #if continuous define logstd variable 29 | if not self.discrete: 30 | self.logstd = nn.Parameter(torch.zeros(ac_dim)) 31 | 32 | self.to(device) 33 | 34 | def forward(self, x): 35 | for layer in self.mlp: 36 | x = layer(x) 37 | if self.discrete: 38 | return x 39 | else: 40 | return (x, self.logstd.exp()) 41 | 42 | def save(self, filepath): 43 | torch.save(self.state_dict(), filepath) 44 | 45 | def restore(self, filepath): 46 | self.load_state_dict(torch.load(filepath)) 47 | 48 | class LL_DQN(MLP): 49 | def __init__(self, ac_dim, ob_dim, device): 50 | super().__init__(ac_dim, ob_dim, 2, 64, device, True, nn.ReLU()) 51 | 52 | class atari_DQN(nn.Module): 53 | def __init__(self, ac_dim, ob_dim, device): 54 | super().__init__() 55 | 56 | self.convnet = nn.Sequential( 57 | nn.Conv2d(ob_dim[2], 32, 8, stride = 4), 58 | nn.ReLU(True), 59 | nn.Conv2d(32, 64, 4, stride = 2), 60 | nn.ReLU(True), 61 | nn.Conv2d(64, 64, 3, stride = 1), 62 | nn.ReLU(True), 63 | ) 64 | self.action_value = nn.Sequential( 65 | nn.Linear(7 * 7 * 64, 512), 66 | nn.ReLU(True), 67 | nn.Linear(512, ac_dim), 68 | ) 69 | self.to(device) 70 | 71 | def forward(self, obs): 72 | out = obs.float() / 255 73 | out = out.permute(0, 3, 1, 2) 
#reshape to [batch size, channels, height, width] 74 | out = self.convnet(out) 75 | out = out.reshape(out.size(0), -1) 76 | out = self.action_value(out) 77 | return out 78 | 79 | def save(self, filepath): 80 | torch.save(self.state_dict(), filepath) 81 | 82 | def restore(self, filepath): 83 | self.load_state_dict(torch.load(filepath)) 84 | -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure.utils import * 4 | 5 | class ReplayBuffer(object): 6 | 7 | def __init__(self, max_size=1000000): 8 | 9 | self.max_size = max_size 10 | self.paths = [] 11 | self.obs = None 12 | self.acs = None 13 | self.concatenated_rews = None 14 | self.unconcatenated_rews = None 15 | self.next_obs = None 16 | self.terminals = None 17 | 18 | def add_rollouts(self, paths): 19 | 20 | # add new rollouts into our list of rollouts 21 | for path in paths: 22 | self.paths.append(path) 23 | 24 | # convert new rollouts into their component arrays, and append them onto our arrays 25 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 26 | 27 | if self.obs is None: 28 | self.obs = observations[-self.max_size:] 29 | self.acs = actions[-self.max_size:] 30 | self.next_obs = next_observations[-self.max_size:] 31 | self.terminals = terminals[-self.max_size:] 32 | self.concatenated_rews = concatenated_rews[-self.max_size:] 33 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 34 | else: 35 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 36 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 37 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 38 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 39 | self.concatenated_rews = np.concatenate([self.concatenated_rews, concatenated_rews])[-self.max_size:] 40 | if isinstance(unconcatenated_rews, list): 41 | self.unconcatenated_rews += unconcatenated_rews 42 | else: 43 | self.unconcatenated_rews.append(unconcatenated_rews) 44 | 45 | ######################################## 46 | ######################################## 47 | 48 | def sample_random_rollouts(self, num_rollouts): 49 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 50 | return self.paths[rand_indices] 51 | 52 | def sample_recent_rollouts(self, num_rollouts=1): 53 | return self.paths[-num_rollouts:] 54 | 55 | ######################################## 56 | ######################################## 57 | 58 | def sample_random_data(self, batch_size): 59 | 60 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 61 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 62 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 63 | 64 | def sample_recent_data(self, batch_size=1, concat_rew=True): 65 | 66 | if concat_rew: 67 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 68 | else: 69 | num_recent_rollouts_to_return = 0 70 | num_datapoints_so_far = 0 71 | index = -1 72 | while num_datapoints_so_far < batch_size: 73 | 
recent_rollout = self.paths[index] 74 | index -=1 75 | num_recent_rollouts_to_return +=1 76 | num_datapoints_so_far += get_pathlength(recent_rollout) 77 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 78 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 79 | return observations, actions, unconcatenated_rews, next_observations, terminals -------------------------------------------------------------------------------- /hw3/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | ############################################ 5 | ############################################ 6 | 7 | def sample_trajectory(env, policy, max_path_length, render=False, render_mode=('rgb_array')): 8 | 9 | if render: 10 | env.render(mode = "human") 11 | 12 | # initialize env for the beginning of a new rollout 13 | ob = env.reset() # TODO: GETTHIS from HW1 14 | 15 | # init vars 16 | obs, acs, rewards, next_obs, terminals, image_obs = [], [], [], [], [], [] 17 | steps = 0 18 | while True: 19 | 20 | # render image of the simulated env 21 | if render: 22 | if 'rgb_array' in render_mode: 23 | if hasattr(env, 'sim'): 24 | if 'track' in env.env.model.camera_names: 25 | image_obs.append(env.sim.render(camera_name='track', height=500, width=500)[::-1]) 26 | else: 27 | image_obs.append(env.sim.render(height=500, width=500)[::-1]) 28 | else: 29 | image_obs.append(env.render(mode=render_mode)) 30 | if 'human' in render_mode: 31 | env.render(mode=render_mode) 32 | time.sleep(env.model.opt.timestep) 33 | 34 | # use the most recent ob to decide what to do 35 | obs.append(ob) 36 | ac = policy.get_action(ob) 37 | acs.append(ac) 38 | 39 | # take that action and record results 40 | ob, rew, done, _ = env.step(ac) 41 | 42 | # record result of taking that action 43 | steps += 1 44 | next_obs.append(ob) 45 | rewards.append(rew) 46 | 47 | # End the rollout if the rollout ended 48 | # Note that the rollout can end due to done, or due to max_path_length 49 | rollout_done = done or steps >= max_path_length 50 | terminals.append(rollout_done) 51 | 52 | if rollout_done: 53 | break 54 | 55 | return Path(obs, image_obs, acs, rewards, next_obs, terminals) 56 | 57 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, render=False, render_mode=('rgb_array')): 58 | 59 | timesteps_left = min_timesteps_per_batch 60 | timesteps_this_batch = 0 61 | paths = [] 62 | 63 | while timesteps_this_batch < min_timesteps_per_batch: 64 | paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) 65 | timesteps_this_batch += get_pathlength(paths[-1]) 66 | 67 | return paths, timesteps_this_batch 68 | 69 | def sample_n_trajectories(env, policy, ntraj, max_path_length, render=False, render_mode=('rgb_array')): 70 | paths = [] 71 | for n in range(ntraj): 72 | paths.append(sample_trajectory(env, policy, max_path_length, render, render_mode)) 73 | 74 | return paths 75 | 76 | 77 | def Path(obs, image_obs, acs, rewards, next_obs, terminals): 78 | """ 79 | Take info (separate arrays) from a single rollout 80 | and return it in a single dictionary 81 | """ 82 | if image_obs != []: 83 | image_obs = np.stack(image_obs, axis=0) 84 | return {"observation" : np.array(obs, dtype=np.float32), 85 | "image_obs" : np.array(image_obs, dtype=np.uint8), 86 | "reward" : np.array(rewards, dtype=np.float32), 87 | "action" : np.array(acs, 
dtype=np.float32), 88 | "next_observation": np.array(next_obs, dtype=np.float32), 89 | "terminal": np.array(terminals, dtype=np.float32)} 90 | 91 | 92 | def convert_listofrollouts(paths): 93 | """ 94 | Take a list of rollout dictionaries 95 | and return separate arrays, 96 | where each array is a concatenation of that array from across the rollouts 97 | """ 98 | observations = np.concatenate([path["observation"] for path in paths]) 99 | actions = np.concatenate([path["action"] for path in paths]) 100 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 101 | terminals = np.concatenate([path["terminal"] for path in paths]) 102 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 103 | unconcatenated_rewards = [path["reward"] for path in paths] 104 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 105 | 106 | ############################################ 107 | ############################################ 108 | 109 | def get_pathlength(path): 110 | return len(path["reward"]) 111 | -------------------------------------------------------------------------------- /hw3/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from cs285.infrastructure.models import MLP 5 | 6 | class MLPPolicy: 7 | def __init__(self, 8 | ac_dim, 9 | ob_dim, 10 | n_layers, 11 | size, 12 | device, 13 | learning_rate, 14 | training=True, 15 | discrete=False, 16 | nn_baseline=False, 17 | **kwargs): 18 | super().__init__() 19 | 20 | # init vars 21 | self.device = device 22 | self.discrete = discrete 23 | self.training = training 24 | self.nn_baseline = nn_baseline 25 | 26 | # network architecture 27 | self.policy_mlp = MLP(ac_dim, ob_dim, n_layers, size, device, discrete) 28 | params = list(self.policy_mlp.parameters()) 29 | if self.nn_baseline: 30 | self.baseline_mlp = MLP(1, ob_dim, n_layers, size, device, True) 31 | params += list(self.baseline_mlp.parameters()) 32 | 33 | #optimizer 34 | if self.training: 35 | self.optimizer = torch.optim.Adam(params, lr = learning_rate) 36 | 37 | ################################## 38 | 39 | # update/train this policy 40 | def update(self, observations, actions): 41 | raise NotImplementedError 42 | 43 | # query the neural net that's our 'policy' function, as defined by an mlp above 44 | # query the policy with observation(s) to get selected action(s) 45 | def get_action(self, obs): 46 | output = self.policy_mlp(torch.Tensor(obs).to(self.device)) 47 | if self.discrete: 48 | action_probs = nn.functional.log_softmax(output).exp() 49 | return torch.multinomial(action_probs, num_samples = 1).cpu().detach().numpy()[0] 50 | else: 51 | return torch.normal(output[0], output[1]).cpu().detach().numpy() 52 | 53 | def get_log_prob(self, network_outputs, actions_taken): 54 | actions_taken = torch.Tensor(actions_taken).to(self.device) 55 | if self.discrete: 56 | network_outputs = nn.functional.log_softmax(network_outputs).exp() 57 | return torch.distributions.Categorical(network_outputs).log_prob(actions_taken) 58 | else: 59 | return torch.distributions.Normal(network_outputs[0], network_outputs[1]).log_prob(actions_taken).sum(-1) 60 | 61 | ##################################################### 62 | ##################################################### 63 | 64 | class MLPPolicyPG(MLPPolicy): 65 | 66 | def update(self, observations, acs_na, adv_n = None, acs_labels_na = 
None, qvals = None): 67 | policy_output = self.policy_mlp(torch.Tensor(observations).to(self.device)) 68 | logprob_pi = self.get_log_prob(policy_output, acs_na) 69 | 70 | self.optimizer.zero_grad() 71 | 72 | loss = torch.sum((-logprob_pi * torch.Tensor(adv_n).to(self.device))) 73 | loss.backward() 74 | 75 | if self.nn_baseline: 76 | baseline_prediction = self.baseline_mlp(torch.Tensor(observations).to(self.device)).view(-1) 77 | baseline_target = torch.Tensor((qvals - qvals.mean()) / (qvals.std() + 1e-8)).to(self.device) 78 | baseline_loss = nn.functional.mse_loss(baseline_prediction, baseline_target) 79 | baseline_loss.backward() 80 | 81 | self.optimizer.step() 82 | 83 | return loss 84 | 85 | ##################################################### 86 | ##################################################### 87 | 88 | class MLPPolicyAC(MLPPolicyPG): 89 | """ MLP policy required for actor-critic. 90 | 91 | Note: Your code for this class could in fact the same as MLPPolicyPG, except the neural net baseline 92 | would not be required (i.e. self.nn_baseline would always be false. It is separated here only 93 | to avoid any unintended errors. 94 | """ 95 | def __init__(self, *args, **kwargs): 96 | if 'nn_baseline' in kwargs.keys(): 97 | assert kwargs['nn_baseline'] == False, "MLPPolicyAC should not use the nn_baseline flag" 98 | super().__init__(*args, **kwargs) 99 | -------------------------------------------------------------------------------- /hw3/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw3/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/policies/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/policies/__pycache__/argmax_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285/policies/__pycache__/argmax_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw3/cs285/policies/argmax_policy.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class ArgMaxPolicy: 4 | 5 | def __init__(self, critic, device): 6 | self.critic = critic 7 | self.device = device 8 | 9 | def get_action(self, obs): 10 | if len(obs.shape) > 1: 11 | observation = torch.tensor(obs).to(self.device) 12 | else: 13 | observation = torch.tensor(obs[None]).to(self.device) 14 | # TODO: pass observation to critic and use argmax of the resulting Q values as the action 15 | return TODO 16 | -------------------------------------------------------------------------------- 
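Note on the `return TODO` at the end of policies/argmax_policy.py above: the blank is meant to return the greedy action, i.e. the index of the largest Q-value the critic assigns to the current observation. The snippet below is only a minimal, self-contained illustration of that argmax pattern, not the official solution; `toy_q_function` is a hypothetical stand-in for the real DQN critic (whose actual interface lives in critics/dqn_critic.py and is not reproduced here).

import torch

# Toy stand-in for the DQN critic: any callable mapping a batch of float
# observations [batch, ob_dim] to Q-values [batch, num_actions] fits this pattern.
toy_q_function = torch.nn.Linear(4, 2)

obs = torch.zeros(1, 4)                     # a single observation, already batched
q_values = toy_q_function(obs)              # shape [1, num_actions]
action = q_values.argmax(dim=1).item()      # greedy action = index of the largest Q-value
print(action)

In the starter file the same idea would be applied to the tensor named `observation`, with the result moved back to the CPU as a NumPy value, mirroring how MLPPolicy.get_action returns actions elsewhere in this homework.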
/hw3/cs285/scripts/run_hw3_actor_critic.py: -------------------------------------------------------------------------------- 1 | #Uncomment next two lines and replace the path if not using anaconda 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import os 6 | import gym 7 | import pdb 8 | import time 9 | import numpy as np 10 | import torch 11 | 12 | from cs285.infrastructure.rl_trainer import RL_Trainer 13 | from cs285.agents.ac_agent import ACAgent 14 | 15 | class AC_Trainer(object): 16 | 17 | def __init__(self, params): 18 | 19 | ##################### 20 | ## SET AGENT PARAMS 21 | ##################### 22 | 23 | computation_graph_args = { 24 | 'n_layers': params['n_layers'], 25 | 'size': params['size'], 26 | 'learning_rate': params['learning_rate'], 27 | 'num_target_updates': params['num_target_updates'], 28 | 'num_grad_steps_per_target_update': params['num_grad_steps_per_target_update'], 29 | 'device': params['device'], 30 | } 31 | 32 | estimate_advantage_args = { 33 | 'gamma': params['discount'], 34 | 'standardize_advantages': not(params['dont_standardize_advantages']), 35 | } 36 | 37 | train_args = { 38 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 39 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 40 | 'num_actor_updates_per_agent_update': params['num_actor_updates_per_agent_update'], 41 | } 42 | 43 | agent_params = {**computation_graph_args, **estimate_advantage_args, **train_args} 44 | 45 | self.params = params 46 | self.params['agent_class'] = ACAgent 47 | self.params['agent_params'] = agent_params 48 | self.params['batch_size_initial'] = self.params['batch_size'] 49 | 50 | ################ 51 | ## RL TRAINER 52 | ################ 53 | 54 | self.rl_trainer = RL_Trainer(self.params) 55 | 56 | def run_training_loop(self): 57 | self.rl_trainer.run_training_loop( 58 | self.params['n_iter'], 59 | collect_policy = self.rl_trainer.agent.actor, 60 | eval_policy = self.rl_trainer.agent.actor, 61 | ) 62 | 63 | 64 | def main(): 65 | import argparse 66 | parser = argparse.ArgumentParser() 67 | parser.add_argument('--env_name', type=str) 68 | parser.add_argument('--ep_len', type=int, default=200) 69 | parser.add_argument('--exp_name', type=str, default='todo') 70 | parser.add_argument('--n_iter', '-n', type=int, default=200) 71 | 72 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 73 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) 74 | parser.add_argument('--num_actor_updates_per_agent_update', type=int, default=1) 75 | 76 | parser.add_argument('--batch_size', '-b', type=int, default=1000) #steps collected per train iteration 77 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 78 | parser.add_argument('--train_batch_size', '-tb', type=int, default=1000) ##steps used per gradient step 79 | 80 | parser.add_argument('--discount', type=float, default=1.0) 81 | parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) 82 | parser.add_argument('--dont_standardize_advantages', '-dsa', action='store_true') 83 | parser.add_argument('--num_target_updates', '-ntu', type=int, default=10) 84 | parser.add_argument('--num_grad_steps_per_target_update', '-ngsptu', type=int, default=10) 85 | parser.add_argument('--n_layers', '-l', type=int, default=2) 86 | parser.add_argument('--size', '-s', type=int, default=64) 87 | 88 | parser.add_argument('--seed', type=int, default=1) 89 | 
parser.add_argument('--use_gpu', '-gpu', default = True) 90 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 91 | parser.add_argument('--video_log_freq', type=int, default=-1) 92 | parser.add_argument('--scalar_log_freq', type=int, default=1) 93 | 94 | parser.add_argument('--save_params', action='store_true') 95 | 96 | args = parser.parse_args() 97 | 98 | # convert to dictionary 99 | params = vars(args) 100 | 101 | if torch.cuda.is_available() and params["use_gpu"]: 102 | which_gpu = "cuda:" + str(params["which_gpu"]) 103 | params["device"] = torch.device(which_gpu) 104 | print("Pytorch is running on GPU", params["which_gpu"]) 105 | else: 106 | params["device"] = torch.device("cpu") 107 | print("Pytorch is running on the CPU") 108 | 109 | # for policy gradient, we made a design decision 110 | # to force batch_size = train_batch_size 111 | # note that, to avoid confusion, you don't even have a train_batch_size argument anymore (above) 112 | params['train_batch_size'] = params['batch_size'] 113 | 114 | ################################## 115 | ### CREATE DIRECTORY FOR LOGGING 116 | ################################## 117 | 118 | logdir_prefix = 'ac_' 119 | 120 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 121 | 122 | if not (os.path.exists(data_path)): 123 | os.makedirs(data_path) 124 | 125 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 126 | logdir = os.path.join(data_path, logdir) 127 | params['logdir'] = logdir 128 | if not(os.path.exists(logdir)): 129 | os.makedirs(logdir) 130 | 131 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 132 | 133 | ################### 134 | ### RUN TRAINING 135 | ################### 136 | 137 | trainer = AC_Trainer(params) 138 | trainer.run_training_loop() 139 | 140 | 141 | if __name__ == "__main__": 142 | main() 143 | -------------------------------------------------------------------------------- /hw3/cs285/scripts/run_hw3_dqn.py: -------------------------------------------------------------------------------- 1 | #Uncomment next two lines and replace the path if not using anaconda 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import os 6 | import time 7 | import torch 8 | 9 | from cs285.infrastructure.rl_trainer import RL_Trainer 10 | from cs285.agents.dqn_agent import DQNAgent 11 | from cs285.infrastructure.dqn_utils import get_env_kwargs 12 | 13 | 14 | class Q_Trainer: 15 | def __init__(self, params): 16 | self.params = params 17 | 18 | train_args = { 19 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 20 | 'num_critic_updates_per_agent_update': params['num_critic_updates_per_agent_update'], 21 | 'train_batch_size': params['batch_size'], 22 | 'double_q': params['double_q'], 23 | 'device': params['device'], 24 | } 25 | 26 | env_args = get_env_kwargs(params['env_name']) 27 | 28 | self.agent_params = {**train_args, **env_args, **params} 29 | 30 | self.params['agent_class'] = DQNAgent 31 | self.params['agent_params'] = self.agent_params 32 | self.params['train_batch_size'] = params['batch_size'] 33 | self.params['env_wrappers'] = self.agent_params['env_wrappers'] 34 | 35 | self.rl_trainer = RL_Trainer(self.params) 36 | 37 | def run_training_loop(self): 38 | self.rl_trainer.run_training_loop( 39 | self.agent_params['num_timesteps'], 40 | collect_policy = self.rl_trainer.agent.actor, 41 | eval_policy = self.rl_trainer.agent.actor, 42 | ) 43 | 44 | def main(): 45 | 46 | import argparse 47 | parser = argparse.ArgumentParser() 48 
| parser.add_argument('--env_name', default='PongNoFrameskip-v4', 49 | choices=('PongNoFrameskip-v4', 50 | 'LunarLander-v2') 51 | ) 52 | 53 | parser.add_argument('--ep_len', type=int, default=200) 54 | parser.add_argument('--exp_name', type=str, default='todo') 55 | 56 | parser.add_argument('--eval_batch_size', type=int, default=1000) 57 | 58 | parser.add_argument('--batch_size', type=int, default=32) 59 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1) 60 | parser.add_argument('--num_critic_updates_per_agent_update', type=int, default=1) 61 | parser.add_argument('--double_q', action='store_true') 62 | 63 | parser.add_argument('--seed', type=int, default=1) 64 | parser.add_argument('--use_gpu', '-gpu', default = True) 65 | parser.add_argument('--which_gpu', '-gpu_id', default=0) 66 | parser.add_argument('--scalar_log_freq', type=int, default=int(1e4)) 67 | 68 | parser.add_argument('--save_params', action='store_true') 69 | 70 | args = parser.parse_args() 71 | 72 | # convert to dictionary 73 | params = vars(args) 74 | params['video_log_freq'] = -1 # This param is not used for DQN 75 | 76 | if torch.cuda.is_available() and params["use_gpu"]: 77 | which_gpu = "cuda:" + str(params["which_gpu"]) 78 | params["device"] = torch.device(which_gpu) 79 | print("Pytorch is running on GPU", params["which_gpu"]) 80 | else: 81 | params["device"] = torch.device("cpu") 82 | print("Pytorch is running on the CPU") 83 | 84 | ################################## 85 | ### CREATE DIRECTORY FOR LOGGING 86 | ################################## 87 | 88 | logdir_prefix = 'dqn_' 89 | if args.double_q: 90 | logdir_prefix += 'double_q_' 91 | 92 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 93 | 94 | if not (os.path.exists(data_path)): 95 | os.makedirs(data_path) 96 | 97 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 98 | logdir = os.path.join(data_path, logdir) 99 | params['logdir'] = logdir 100 | if not(os.path.exists(logdir)): 101 | os.makedirs(logdir) 102 | 103 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 104 | 105 | trainer = Q_Trainer(params) 106 | trainer.run_training_loop() 107 | 108 | 109 | if __name__ == "__main__": 110 | main() 111 | -------------------------------------------------------------------------------- /hw3/cs285_hw3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw3/cs285_hw3.pdf -------------------------------------------------------------------------------- /hw3/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.11 2 | mujoco-py==1.50.1.35 3 | matplotlib==2.2.2 4 | ipython==6.4.0 5 | moviepy==1.0.0 6 | box2d-py -------------------------------------------------------------------------------- /hw3/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw4/README.txt: -------------------------------------------------------------------------------- 1 | 1) See hw1 if you'd like to see installation instructions. You do NOT have to redo them. 
2 | 3 | ############################################## 4 | ############################################## 5 | 6 | 7 | 2) Code: 8 | 9 | ------------------------------------------- 10 | 11 | Files to look at, even though there are no explicit 'TODO' markings: 12 | - scripts/run_hw4_mb.py 13 | - infrastructure/rl_trainer.py 14 | 15 | ------------------------------------------- 16 | 17 | Blanks to be filled in now (for this assignment) are marked with 'TODO' 18 | 19 | The following files have these: 20 | - agents/mb_agent.py 21 | - models/ff_model.py 22 | - policies/MPC_policy.py 23 | - infrastructure/utils.py 24 | 25 | ############################################## 26 | ############################################## 27 | 28 | 29 | 3) Commands: 30 | 31 | Please refer to the PDF for the specific commands needed for different questions. 32 | 33 | ############################################## 34 | 35 | 36 | 4) Visualize saved tensorboard event file: 37 | 38 | $ cd cs285/data/ 39 | $ tensorboard --logdir . 40 | 41 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) -------------------------------------------------------------------------------- /hw4/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw4/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/agents/__pycache__/mb_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/agents/__pycache__/mb_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/agents/mb_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from cs285.models.ff_model import FFModel 3 | from cs285.policies.MPC_policy import MPCPolicy 4 | from cs285.infrastructure.replay_buffer import ReplayBuffer 5 | from cs285.infrastructure.utils import * 6 | 7 | class MBAgent: 8 | def __init__(self, env, agent_params): 9 | super(MBAgent, self).__init__() 10 | 11 | self.env = env.unwrapped 12 | self.agent_params = agent_params 13 | self.ensemble_size = self.agent_params['ensemble_size'] 14 | 15 | self.dyn_models = [] 16 | for i in range(self.ensemble_size): 17 | model = FFModel(self.agent_params['ac_dim'], 18 | self.agent_params['ob_dim'], 19 | self.agent_params['n_layers'], 20 | self.agent_params['size'], 21 | self.agent_params['device'], 22 | self.agent_params['learning_rate']) 23 | self.dyn_models.append(model) 24 | 25 | self.actor = MPCPolicy(self.env, 26 | ac_dim = self.agent_params['ac_dim'], 27 | dyn_models = self.dyn_models, 28 | horizon = self.agent_params['mpc_horizon'], 29 | N = self.agent_params['mpc_num_action_sequences']) 30 | 31 | self.replay_buffer = ReplayBuffer() 32 | 33 | def train(self, ob_no, ac_na, re_n, next_ob_no, terminal_n): 34 | 35 | # training a MB agent refers to updating the predictive 
model using observed state transitions 36 | # NOTE: each model in the ensemble is trained on a different random batch of size batch_size 37 | losses = [] 38 | num_data = ob_no.shape[0] 39 | num_data_per_ens = int(num_data/self.ensemble_size) 40 | 41 | for i in range(self.ensemble_size): 42 | 43 | # select which datapoints to use for this model of the ensemble 44 | # you might find the num_data_per_env variable defined above useful 45 | 46 | observations = # TODO(Q1) 47 | actions = # TODO(Q1) 48 | next_observations = # TODO(Q1) 49 | 50 | # use datapoints to update one of the dyn_models 51 | model = # TODO(Q1) 52 | loss = model.update(observations, actions, next_observations, self.data_statistics) 53 | losses.append(loss) 54 | 55 | avg_loss = np.mean(losses) 56 | return avg_loss 57 | 58 | def add_to_replay_buffer(self, paths, add_sl_noise=False): 59 | 60 | # add data to replay buffer 61 | self.replay_buffer.add_rollouts(paths, noised=add_sl_noise) 62 | 63 | # get updated mean/std of the data in our replay buffer 64 | self.data_statistics = {'obs_mean': np.mean(self.replay_buffer.obs, axis=0), 65 | 'obs_std': np.std(self.replay_buffer.obs, axis=0), 66 | 'acs_mean': np.mean(self.replay_buffer.acs, axis=0), 67 | 'acs_std': np.std(self.replay_buffer.acs, axis=0), 68 | 'delta_mean': np.mean( 69 | self.replay_buffer.next_obs - self.replay_buffer.obs, 70 | axis=0), 71 | 'delta_std': np.std( 72 | self.replay_buffer.next_obs - self.replay_buffer.obs, 73 | axis=0), 74 | } 75 | 76 | # update the actor's data_statistics too, so actor.get_action can be calculated correctly 77 | self.actor.data_statistics = self.data_statistics 78 | 79 | def sample(self, batch_size): 80 | # NOTE: The size of the batch returned here is sampling batch_size * ensemble_size, 81 | # so each model in our ensemble can get trained on batch_size data 82 | return self.replay_buffer.sample_random_data(batch_size * self.ensemble_size) 83 | -------------------------------------------------------------------------------- /hw4/cs285/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from cs285.envs import ant 2 | from cs285.envs import cheetah 3 | from cs285.envs import obstacles 4 | from cs285.envs import reacher -------------------------------------------------------------------------------- /hw4/cs285/envs/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/ant/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='ant-cs285-v0', 5 | entry_point='cs285.envs.ant:AntEnv', 6 | max_episode_steps=1000, 7 | ) 8 | from cs285.envs.ant.ant import AntEnv 9 | -------------------------------------------------------------------------------- /hw4/cs285/envs/ant/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/ant/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- 
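On the TODO(Q1) blanks in agents/mb_agent.py above: since sample() already draws batch_size * ensemble_size random transitions, one plausible reading (a sketch under that assumption, not the official solution) is to give each ensemble member a disjoint slice of size num_data_per_ens from the already-shuffled batch and pair it with self.dyn_models[i]. The toy snippet below only demonstrates that slicing; the variable names mirror the starter code, but the arrays here are fabricated placeholders with arbitrary shapes.

import numpy as np

# Placeholder data standing in for the randomly sampled training batch
# that MBAgent.train receives (ob_no, ac_na, next_ob_no).
ensemble_size = 3
num_data_per_ens = 4
ob_no = np.random.randn(ensemble_size * num_data_per_ens, 5)
ac_na = np.random.randn(ensemble_size * num_data_per_ens, 2)
next_ob_no = np.random.randn(ensemble_size * num_data_per_ens, 5)

for i in range(ensemble_size):
    start, end = i * num_data_per_ens, (i + 1) * num_data_per_ens
    observations = ob_no[start:end]            # disjoint slice for ensemble member i
    actions = ac_na[start:end]
    next_observations = next_ob_no[start:end]
    print(i, observations.shape, actions.shape, next_observations.shape)

Because the upstream batch is sampled uniformly at random, contiguous slices like these already give each dynamics model a different random subset, which is what the NOTE in train() asks for.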
/hw4/cs285/envs/ant/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/ant/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/ant/__pycache__/ant.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/ant/__pycache__/ant.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/ant/__pycache__/ant.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/ant/__pycache__/ant.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='cheetah-cs285-v0', 5 | entry_point='cs285.envs.cheetah:HalfCheetahEnv', 6 | max_episode_steps=1000, 7 | ) 8 | from cs285.envs.cheetah.cheetah import HalfCheetahEnv 9 | -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/cheetah/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/cheetah/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__pycache__/cheetah.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/cheetah/__pycache__/cheetah.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/__pycache__/cheetah.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/cheetah/__pycache__/cheetah.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/cheetah/cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import mujoco_py 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | 8 | def __init__(self): 9 | 10 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 
1) 11 | utils.EzPickle.__init__(self) 12 | 13 | self.skip = self.frame_skip 14 | 15 | self.action_dim = self.ac_dim = self.action_space.shape[0] 16 | self.observation_dim = self.obs_dim = self.observation_space.shape[0] 17 | 18 | def get_reward(self, observations, actions): 19 | 20 | """get reward/s of given (observations, actions) datapoint or datapoints 21 | 22 | Args: 23 | observations: (batchsize, obs_dim) or (obs_dim,) 24 | actions: (batchsize, ac_dim) or (ac_dim,) 25 | 26 | Return: 27 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 28 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 29 | """ 30 | 31 | #initialize and reshape as needed, for batch mode 32 | self.reward_dict = {} 33 | if(len(observations.shape)==1): 34 | observations = np.expand_dims(observations, axis = 0) 35 | actions = np.expand_dims(actions, axis = 0) 36 | batch_mode = False 37 | else: 38 | batch_mode = True 39 | 40 | #get vars 41 | xvel = observations[:, 9].copy() 42 | body_angle = observations[:, 2].copy() 43 | front_leg = observations[:, 6].copy() 44 | front_shin = observations[:, 7].copy() 45 | front_foot = observations[:, 8].copy() 46 | zeros = np.zeros((observations.shape[0],)).copy() 47 | 48 | # ranges 49 | leg_range = 0.2 50 | shin_range = 0 51 | foot_range = 0 52 | penalty_factor = 10 53 | 54 | #calc rew 55 | self.reward_dict['run'] = xvel 56 | 57 | front_leg_rew = zeros.copy() 58 | front_leg_rew[front_leg>leg_range] = -penalty_factor 59 | self.reward_dict['leg'] = front_leg_rew 60 | 61 | front_shin_rew = zeros.copy() 62 | front_shin_rew[front_shin>shin_range] = -penalty_factor 63 | self.reward_dict['shin'] = front_shin_rew 64 | 65 | front_foot_rew = zeros.copy() 66 | front_foot_rew[front_foot>foot_range] = -penalty_factor 67 | self.reward_dict['foot'] = front_foot_rew 68 | 69 | # total reward 70 | self.reward_dict['r_total'] = self.reward_dict['run'] + self.reward_dict['leg'] + self.reward_dict['shin'] + self.reward_dict['foot'] 71 | 72 | #return 73 | dones = zeros.copy() 74 | if(not batch_mode): 75 | return self.reward_dict['r_total'][0], dones[0] 76 | return self.reward_dict['r_total'], dones 77 | 78 | 79 | def get_score(self, obs): 80 | xposafter = obs[0] 81 | return xposafter 82 | 83 | ############################################## 84 | 85 | def step(self, action): 86 | 87 | #step 88 | self.do_simulation(action, self.frame_skip) 89 | 90 | #obs/reward/done/score 91 | ob = self._get_obs() 92 | rew, done = self.get_reward(ob, action) 93 | score = self.get_score(ob) 94 | 95 | #return 96 | env_info = {'obs_dict': self.obs_dict, 97 | 'rewards': self.reward_dict, 98 | 'score': score} 99 | return ob, rew, done, env_info 100 | 101 | def _get_obs(self): 102 | 103 | self.obs_dict = {} 104 | self.obs_dict['joints_pos'] = self.sim.data.qpos.flat.copy() 105 | self.obs_dict['joints_vel'] = self.sim.data.qvel.flat.copy() 106 | self.obs_dict['com_torso'] = self.get_body_com("torso").flat.copy() 107 | 108 | return np.concatenate([ 109 | self.obs_dict['joints_pos'], #9 110 | self.obs_dict['joints_vel'], #9 111 | self.obs_dict['com_torso'], #3 112 | ]) 113 | 114 | ############################################## 115 | 116 | def reset_model(self, seed=None): 117 | 118 | # set reset pose/vel 119 | self.reset_pose = self.init_qpos + self.np_random.uniform( 120 | low=-.1, high=.1, size=self.model.nq) 121 | self.reset_vel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 122 | 123 | #reset the env to that pose/vel 124 | return self.do_reset(self.reset_pose.copy(), 
self.reset_vel.copy()) 125 | 126 | 127 | def do_reset(self, reset_pose, reset_vel, reset_goal=None): 128 | 129 | #reset 130 | self.set_state(reset_pose, reset_vel) 131 | 132 | #return 133 | return self._get_obs() 134 | -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='obstacles-cs285-v0', 5 | entry_point='cs285.envs.obstacles:Obstacles', 6 | max_episode_steps=500, 7 | ) 8 | from cs285.envs.obstacles.obstacles_env import Obstacles 9 | -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/obstacles/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/obstacles/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__pycache__/obstacles_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/obstacles/__pycache__/obstacles_env.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/obstacles/__pycache__/obstacles_env.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/obstacles/__pycache__/obstacles_env.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | register( 4 | id='reacher-cs285-v0', 5 | entry_point='cs285.envs.reacher:Reacher7DOFEnv', 6 | max_episode_steps=500, 7 | ) 8 | from cs285.envs.reacher.reacher_env import Reacher7DOFEnv 9 | -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/reacher/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/reacher/__pycache__/__init__.cpython-37.pyc 
-------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__pycache__/reacher_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/reacher/__pycache__/reacher_env.cpython-35.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/__pycache__/reacher_env.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/envs/reacher/__pycache__/reacher_env.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/envs/reacher/reacher_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | from mujoco_py import MjViewer 5 | import os 6 | 7 | class Reacher7DOFEnv(mujoco_env.MujocoEnv, utils.EzPickle): 8 | def __init__(self): 9 | 10 | # placeholder 11 | self.hand_sid = -2 12 | self.target_sid = -1 13 | 14 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 15 | mujoco_env.MujocoEnv.__init__(self, curr_dir+'/assets/sawyer.xml', 2) 16 | utils.EzPickle.__init__(self) 17 | self.observation_dim = 26 18 | self.action_dim = 7 19 | 20 | self.hand_sid = self.model.site_name2id("finger") 21 | self.target_sid = self.model.site_name2id("target") 22 | self.skip = self.frame_skip 23 | 24 | 25 | def _get_obs(self): 26 | return np.concatenate([ 27 | self.data.qpos.flat, #[7] 28 | self.data.qvel.flatten() / 10., #[7] 29 | self.data.site_xpos[self.hand_sid], #[3] 30 | self.model.site_pos[self.target_sid], #[3] 31 | ]) 32 | 33 | def step(self, a): 34 | 35 | self.do_simulation(a, self.frame_skip) 36 | ob = self._get_obs() 37 | reward, done = self.get_reward(ob, a) 38 | 39 | score = self.get_score(ob) 40 | 41 | # finalize step 42 | env_info = {'ob': ob, 43 | 'rewards': self.reward_dict, 44 | 'score': score} 45 | 46 | return ob, reward, done, env_info 47 | 48 | def get_score(self, obs): 49 | hand_pos = obs[-6:-3] 50 | target_pos = obs[-3:] 51 | score = -1*np.abs(hand_pos-target_pos) 52 | return score 53 | 54 | def get_reward(self, observations, actions): 55 | 56 | """get reward/s of given (observations, actions) datapoint or datapoints 57 | 58 | Args: 59 | observations: (batchsize, obs_dim) or (obs_dim,) 60 | actions: (batchsize, ac_dim) or (ac_dim,) 61 | 62 | Return: 63 | r_total: reward of this (o,a) pair, dimension is (batchsize,1) or (1,) 64 | done: True if env reaches terminal state, dimension is (batchsize,1) or (1,) 65 | """ 66 | 67 | #initialize and reshape as needed, for batch mode 68 | self.reward_dict = {} 69 | if(len(observations.shape)==1): 70 | observations = np.expand_dims(observations, axis = 0) 71 | actions = np.expand_dims(actions, axis = 0) 72 | batch_mode = False 73 | else: 74 | batch_mode = True 75 | 76 | #get vars 77 | hand_pos = observations[:, -6:-3] 78 | target_pos = observations[:, -3:] 79 | 80 | #calc rew 81 | dist = np.linalg.norm(hand_pos - target_pos, axis=1) 82 | self.reward_dict['r_total'] = -10*dist 83 | 84 | #done is always false for this env 85 | dones = np.zeros((observations.shape[0],)) 86 | 87 | #return 88 | if(not batch_mode): 89 | return self.reward_dict['r_total'][0], 
dones[0] 90 | return self.reward_dict['r_total'], dones 91 | 92 | def reset(self): 93 | _ = self.reset_model() 94 | 95 | self.model.site_pos[self.target_sid] = [0.1, 0.1, 0.1] 96 | 97 | observation, _reward, done, _info = self.step(np.zeros(7)) 98 | ob = self._get_obs() 99 | 100 | return ob 101 | 102 | def reset_model(self, seed=None): 103 | if seed is not None: 104 | self.seed(seed) 105 | 106 | self.reset_pose = self.init_qpos.copy() 107 | self.reset_vel = self.init_qvel.copy() 108 | 109 | self.reset_goal = np.zeros(3) 110 | self.reset_goal[0] = self.np_random.uniform(low=-0.3, high=0.3) 111 | self.reset_goal[1] = self.np_random.uniform(low=-0.2, high=0.2) 112 | self.reset_goal[2] = self.np_random.uniform(low=-0.25, high=0.25) 113 | 114 | return self.do_reset(self.reset_pose, self.reset_vel, self.reset_goal) 115 | 116 | def do_reset(self, reset_pose, reset_vel, reset_goal): 117 | 118 | self.set_state(reset_pose, reset_vel) 119 | 120 | #reset target 121 | self.reset_goal = reset_goal.copy() 122 | self.model.site_pos[self.target_sid] = self.reset_goal 123 | self.sim.forward() 124 | 125 | #return 126 | return self._get_obs() -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/logger.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0] 0, "Figure logging requires input shape [batch x figures]!" 
54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw4/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from cs285.infrastructure.utils import * 4 | 5 | class ReplayBuffer(object): 6 | 7 | def __init__(self, max_size=1000000): 8 | 9 | self.max_size = max_size 10 | self.paths = [] 11 | self.obs = None 12 | self.acs = None 13 | self.concatenated_rews = None 14 | self.unconcatenated_rews = None 15 | self.next_obs = None 16 | self.terminals = None 17 | 18 | def add_rollouts(self, paths, noised=False): 19 | 20 | # add new rollouts into our list of rollouts 21 | for path in paths: 22 | self.paths.append(path) 23 | 24 | # convert new rollouts into their component arrays, and append them onto our arrays 25 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 26 | 27 | if noised: 28 | observations = add_noise(observations) 29 | next_observations = add_noise(next_observations) 30 | 31 | if self.obs is None: 32 | self.obs = observations[-self.max_size:] 33 | self.acs = actions[-self.max_size:] 34 | self.next_obs = next_observations[-self.max_size:] 35 | self.terminals = terminals[-self.max_size:] 36 | self.concatenated_rews = concatenated_rews[-self.max_size:] 37 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 38 | else: 39 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 40 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 41 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 42 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 43 | self.concatenated_rews = np.concatenate([self.concatenated_rews, concatenated_rews])[-self.max_size:] 44 | if isinstance(unconcatenated_rews, list): 45 | self.unconcatenated_rews += unconcatenated_rews 46 | else: 47 | self.unconcatenated_rews.append(unconcatenated_rews) 48 | 49 | ######################################## 50 | ######################################## 51 | 52 | def sample_random_rollouts(self, num_rollouts): 53 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 54 | return self.paths[rand_indices] 55 | 56 | def sample_recent_rollouts(self, num_rollouts=1): 57 | return self.paths[-num_rollouts:] 58 | 59 | ######################################## 60 | ######################################## 61 | 62 | def sample_random_data(self, batch_size): 63 | 64 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 65 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 66 | return 
self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 67 | 68 | def sample_recent_data(self, batch_size=1, concat_rew=True): 69 | 70 | if concat_rew: 71 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 72 | else: 73 | num_recent_rollouts_to_return = 0 74 | num_datapoints_so_far = 0 75 | index = -1 76 | while num_datapoints_so_far < batch_size: 77 | recent_rollout = self.paths[index] 78 | index -=1 79 | num_recent_rollouts_to_return +=1 80 | num_datapoints_so_far += get_pathlength(recent_rollout) 81 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 82 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 83 | return observations, actions, unconcatenated_rews, next_observations, terminals -------------------------------------------------------------------------------- /hw4/cs285/models/__pycache__/ff_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/models/__pycache__/ff_model.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/models/ff_model.py: -------------------------------------------------------------------------------- 1 | from cs285.infrastructure.utils import normalize, unnormalize, MLP 2 | import numpy as np 3 | import torch 4 | from torch import nn 5 | 6 | class FFModel: 7 | def __init__(self, ac_dim, ob_dim, n_layers, size, device, learning_rate = 0.001): 8 | # init vars 9 | self.device = device 10 | 11 | #TODO - specify ouput dim and input dim of delta func MLP 12 | self.delta_func = MLP(input_dim = TODO, 13 | output_dim = TODO, 14 | n_layers = n_layers, 15 | size = size, 16 | device = self.device, 17 | discrete = True) 18 | 19 | #TODO - define the delta func optimizer. Adam optimizer will work well. 
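# (Sketch for the TODO on the next line, not part of the starter code: assuming the
#  hw4 MLP helper exposes .parameters() like the hw3 nn.Module version shown earlier
#  in this repo, one reasonable choice would be
#  torch.optim.Adam(self.delta_func.parameters(), lr=learning_rate).)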
20 | self.optimizer = TODO 21 | 22 | ############################# 23 | 24 | def get_prediction(self, obs, acs, data_statistics): 25 | if len(obs.shape) == 1 or len(acs.shape) == 1: 26 | obs = np.squeeze(obs)[None] 27 | acs = np.squeeze(acs)[None] 28 | 29 | # TODO(Q1) normalize the obs and acs above using the normalize function and data_statistics 30 | norm_obs = TODO 31 | norm_acs = TODO 32 | 33 | norm_input = torch.Tensor(np.concatenate((norm_obs, norm_acs), axis = 1)).to(self.device) 34 | norm_delta = self.delta_func(norm_input).cpu().detach().numpy() 35 | 36 | # TODO(Q1) Unnormalize the the norm_delta above using the unnormalize function and data_statistics 37 | delta = TODO 38 | # TODO(Q1) Return the predited next observation (You will use obs and delta) 39 | return TODO 40 | 41 | def update(self, observations, actions, next_observations, data_statistics): 42 | # TODO(Q1) normalize the obs and acs above using the normalize function and data_statistics (same as above) 43 | norm_obs = TODO 44 | norm_acs = TODO 45 | 46 | pred_delta = self.delta_func(torch.Tensor(np.concatenate((norm_obs, norm_acs), axis = 1)).to(self.device)) 47 | # TODO(Q1) Define a normalized true_delta using observations, next_observations and the delta stats from data_statistics 48 | true_delta = TODO 49 | 50 | # TODO(Q1) Define a loss function that takes as input normalized versions of predicted change in state and true change in state 51 | loss = TODO 52 | self.optimizer.zero_grad() 53 | loss.backward() 54 | self.optimizer.step() 55 | 56 | return loss.item() 57 | -------------------------------------------------------------------------------- /hw4/cs285/policies/MPC_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class MPCPolicy: 4 | def __init__(self, 5 | env, 6 | ac_dim, 7 | dyn_models, 8 | horizon, 9 | N, 10 | **kwargs): 11 | super().__init__(**kwargs) 12 | 13 | # init vars 14 | self.env = env 15 | self.dyn_models = dyn_models 16 | self.horizon = horizon 17 | self.N = N 18 | self.data_statistics = None # NOTE must be updated from elsewhere 19 | 20 | self.ob_dim = self.env.observation_space.shape[0] 21 | 22 | # action space 23 | self.ac_space = self.env.action_space 24 | self.ac_dim = ac_dim 25 | self.low = self.ac_space.low 26 | self.high = self.ac_space.high 27 | 28 | def sample_action_sequences(self, num_sequences, horizon): 29 | # TODO(Q1) uniformly sample trajectories and return an array of 30 | # dimensions (num_sequences, horizon, self.ac_dim) 31 | return random_action_sequences 32 | 33 | def get_action(self, obs): 34 | 35 | if self.data_statistics is None: 36 | # print("WARNING: performing random actions.") 37 | return self.sample_action_sequences(num_sequences=1, horizon=1)[0] 38 | 39 | #sample random actions (Nxhorizon) 40 | candidate_action_sequences = self.sample_action_sequences(num_sequences=self.N, horizon=self.horizon) 41 | 42 | # a list you can use for storing the predicted reward for each candidate sequence 43 | predicted_rewards_per_ens = [] 44 | 45 | for model in self.dyn_models: 46 | pass 47 | # TODO(Q2) 48 | 49 | # for each candidate action sequence, predict a sequence of 50 | # states for each dynamics model in your ensemble 51 | 52 | # once you have a sequence of predicted states from each model in your 53 | # ensemble, calculate the reward for each sequence using self.env.get_reward (See files in envs to see how to call this) 54 | 55 | # calculate mean_across_ensembles(predicted rewards). 
56 | # the matrix dimensions should change as follows: [ens,N] --> N 57 | predicted_rewards = None # TODO(Q2) 58 | 59 | # pick the action sequence and return the 1st element of that sequence 60 | best_index = None #TODO(Q2) 61 | best_action_sequence = None #TODO(Q2) 62 | action_to_take = None # TODO(Q2) 63 | return action_to_take[None] # the None is for matching expected dimensions 64 | -------------------------------------------------------------------------------- /hw4/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw4/cs285/policies/__pycache__/MPC_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/policies/__pycache__/MPC_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285/policies/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw4/cs285/scripts/run_hw4_mb.py: -------------------------------------------------------------------------------- 1 | #If not using anaconda use next two lines: 2 | #import sys 3 | #sys.path.append(r'') 4 | 5 | import torch 6 | import os 7 | import time 8 | from cs285.infrastructure.rl_trainer import RL_Trainer 9 | from cs285.agents.mb_agent import MBAgent 10 | 11 | 12 | class MB_Trainer(object): 13 | 14 | def __init__(self, params): 15 | 16 | ##################### 17 | ## SET AGENT PARAMS 18 | ##################### 19 | 20 | computation_graph_args = { 21 | 'ensemble_size': params['ensemble_size'], 22 | 'n_layers': params['n_layers'], 23 | 'size': params['size'], 24 | 'learning_rate': params['learning_rate'], 25 | 'device': params['device'], 26 | } 27 | 28 | train_args = { 29 | 'num_agent_train_steps_per_iter': params['num_agent_train_steps_per_iter'], 30 | } 31 | 32 | controller_args = { 33 | 'mpc_horizon': params['mpc_horizon'], 34 | 'mpc_num_action_sequences': params['mpc_num_action_sequences'], 35 | } 36 | 37 | agent_params = {**computation_graph_args, **train_args, **controller_args} 38 | 39 | self.params = params 40 | self.params['agent_class'] = MBAgent 41 | self.params['agent_params'] = agent_params 42 | 43 | ################ 44 | ## RL TRAINER 45 | ################ 46 | 47 | self.rl_trainer = RL_Trainer(self.params) 48 | 49 | def run_training_loop(self): 50 | 51 | self.rl_trainer.run_training_loop( 52 | self.params['n_iter'], 53 | collect_policy = self.rl_trainer.agent.actor, 54 | eval_policy = self.rl_trainer.agent.actor, 55 | ) 56 | 57 | 58 | def main(): 59 | 60 | import argparse 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('--env_name', type=str) #reacher-cs285-v0, ant-cs285-v0, cheetah-cs285-v0, obstacles-cs285-v0 63 | parser.add_argument('--ep_len', type=int, default=200) 64 | parser.add_argument('--exp_name', type=str, default='todo') 65 | parser.add_argument('--n_iter', '-n', type=int, default=20) 66 | 67 | parser.add_argument('--ensemble_size', '-e', type=int, default=3) 68 | 
parser.add_argument('--mpc_horizon', type=int, default=10) 69 | parser.add_argument('--mpc_num_action_sequences', type=int, default=1000) 70 | 71 | parser.add_argument('--add_sl_noise', '-noise', action='store_true') 72 | parser.add_argument('--num_agent_train_steps_per_iter', type=int, default=1000) 73 | parser.add_argument('--batch_size_initial', type=int, default=20000) #(random) steps collected on 1st iteration (put into replay buffer) 74 | parser.add_argument('--batch_size', '-b', type=int, default=8000) #steps collected per train iteration (put into replay buffer) 75 | parser.add_argument('--train_batch_size', '-tb', type=int, default=512) ##steps used per gradient step (used for training) 76 | parser.add_argument('--eval_batch_size', '-eb', type=int, default=400) #steps collected per eval iteration 77 | 78 | parser.add_argument('--learning_rate', '-lr', type=float, default=0.001) 79 | parser.add_argument('--n_layers', '-l', type=int, default=2) 80 | parser.add_argument('--size', '-s', type=int, default=250) 81 | 82 | parser.add_argument('--seed', type=int, default=1) 83 | parser.add_argument('--use_gpu', '-gpu', default = True) 84 | parser.add_argument('--which_gpu', type=int, default=0) 85 | parser.add_argument('--video_log_freq', type=int, default=1) #-1 to disable 86 | parser.add_argument('--scalar_log_freq', type=int, default=1) #-1 to disable 87 | parser.add_argument('--save_params', action='store_true') 88 | args = parser.parse_args() 89 | 90 | # convert to dictionary 91 | params = vars(args) 92 | 93 | if torch.cuda.is_available() and params["use_gpu"]: 94 | which_gpu = "cuda:" + str(params["which_gpu"]) 95 | params["device"] = torch.device(which_gpu) 96 | print("Pytorch is running on GPU", params["which_gpu"]) 97 | else: 98 | params["device"] = torch.device("cpu") 99 | print("Pytorch is running on the CPU") 100 | 101 | # HARDCODE EPISODE LENGTHS FOR THE ENVS USED IN THIS MB ASSIGNMENT 102 | if params['env_name']=='reacher-cs285-v0': 103 | params['ep_len']=200 104 | if params['env_name']=='cheetah-cs285-v0': 105 | params['ep_len']=500 106 | if params['env_name']=='obstacles-cs285-v0': 107 | params['ep_len']=100 108 | 109 | ################################## 110 | ### CREATE DIRECTORY FOR LOGGING 111 | ################################## 112 | 113 | logdir_prefix = 'mb_' 114 | 115 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '../data') 116 | 117 | if not (os.path.exists(data_path)): 118 | os.makedirs(data_path) 119 | 120 | logdir = logdir_prefix + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") 121 | logdir = os.path.join(data_path, logdir) 122 | params['logdir'] = logdir 123 | if not(os.path.exists(logdir)): 124 | os.makedirs(logdir) 125 | 126 | print("\n\n\nLOGGING TO: ", logdir, "\n\n\n") 127 | 128 | ################### 129 | ### RUN TRAINING 130 | ################### 131 | 132 | trainer = MB_Trainer(params) 133 | trainer.run_training_loop() 134 | 135 | 136 | if __name__ == "__main__": 137 | main() 138 | -------------------------------------------------------------------------------- /hw4/cs285_hw4.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw4/cs285_hw4.pdf -------------------------------------------------------------------------------- /hw4/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from 
setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) -------------------------------------------------------------------------------- /hw5/README.txt: -------------------------------------------------------------------------------- 1 | 1) The code structure for this homework was heavily modified in order to match the structure of the previous three homeworks. 2 | Because of this, the PDF does not always give accurate file locations, but it should still be referred to for questions and guidance. 3 | The logging procedure in particular was changed to match the previous assignments. 4 | 5 | 2) Code: 6 | 7 | Code to look at: 8 | 9 | - scripts/train_ac_exploration_f18.py 10 | - envs/pointmass.py 11 | - infrastructure/rl_trainer.py (Has been changed for this homework) 12 | - infrastructure/utils.py (Has been changed for this homework) 13 | 14 | Code to fill in as part of HW: 15 | 16 | - agents/ac_agent.py (new Exploratory_ACAgent class added) 17 | - exploration/exploration.py 18 | - exploration/density_model.py 19 | 20 | 3) Commands to run each experiment: 21 | 22 | ########################## 23 | ### P1 Hist PointMass ### 24 | ########################## 25 | 26 | python cs285/scripts/train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model none -s 8 --exp_name PM_bc0_s8 27 | python cs285/scripts/train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model hist -bc 0.01 -s 8 --exp_name PM_hist_bc0.01_s8 28 | 29 | ########################## 30 | ### P2 RBF PointMass ### 31 | ########################## 32 | 33 | python cs285/scripts/train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model rbf -bc 0.01 -s 8 -sig 0.2 --exp_name PM_rbf_bc0.01_s8_sig0.2 34 | 35 | ########################## 36 | ### P3 EX2 PointMass ### 37 | ########################## 38 | 39 | python cs285/scripts/train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model ex2 -s 8 -bc 0.05 -kl 0.1 -dlr 0.001 -dh 8 --exp_name PM_ex2_s8_bc0.05_kl0.1_dlr0.001_dh8 40 | 41 | ########################### 42 | ### P4 HalfCheetah ### 43 | ########################### 44 | 45 | python cs285/scripts/train_ac_exploration_f18.py sparse-cheetah-cs285-v1 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model none --exp_name HC_bc0 46 | python cs285/scripts/train_ac_exploration_f18.py sparse-cheetah-cs285-v1 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.001 -kl 0.1 -dlr 0.005 -dti 1000 --exp_name HC_bc0.001_kl0.1_dlr0.005_dti1000 47 | python cs285/scripts/train_ac_exploration_f18.py sparse-cheetah-cs285-v1 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.0001 -kl 0.1 -dlr 0.005 -dti 10000 --exp_name HC_bc0.0001_kl0.1_dlr0.005_dti10000 48 | 49 | 50 | 4) Visualize saved tensorboard event file: 51 | 52 | $ cd cs285/data/ 53 | $ tensorboard --logdir .
54 | 55 | Then, navigate to shown url to see scalar summaries as plots (in 'scalar' tab), as well as videos (in 'images' tab) 56 | -------------------------------------------------------------------------------- /hw5/cs285/agents/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/agents/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/agents/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/agents/__pycache__/ac_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/agents/__pycache__/ac_agent.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/critics/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/critics/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/critics/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/critics/__pycache__/bootstrapped_continuous_critic.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/critics/__pycache__/bootstrapped_continuous_critic.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/critics/bootstrapped_continuous_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from cs285.infrastructure.utils import MLP 4 | 5 | class BootstrappedContinuousCritic: 6 | def __init__(self, hparams): 7 | self.ob_dim = hparams['ob_dim'] 8 | self.ac_dim = hparams['ac_dim'] 9 | self.size = hparams['size'] 10 | self.n_layers = hparams['n_layers'] 11 | self.device = hparams['device'] 12 | self.learning_rate = hparams['learning_rate'] 13 | self.num_target_updates = hparams['num_target_updates'] 14 | self.num_grad_steps_per_target_update = hparams['num_grad_steps_per_target_update'] 15 | self.gamma = hparams['gamma'] 16 | 17 | self.value_func = MLP(self.ob_dim, 1, self.n_layers, self.size, self.device, True) 18 | self.optimizer = torch.optim.Adam(self.value_func.parameters(), lr = self.learning_rate) 19 | 20 | def update(self, ob_no, next_ob_no, re_n, terminal_n): 21 | ''' 22 | ts_ob_no, ts_next_ob_no, ts_re_n, ts_terminal_n = map(lambda x: torch.Tensor(x).to(self.device), 23 | [ob_no, next_ob_no, re_n, terminal_n]) 24 | for _ in range(self.num_target_updates): 25 | with torch.no_grad(): 26 | ts_next_V_n = self.value_func(ts_next_ob_no).view(-1) 27 | ts_target_n = 
ts_re_n + (1 - ts_terminal_n) * self.gamma * ts_next_V_n 28 | for _ in range(self.num_grad_steps_per_target_update): 29 | ts_V_n = self.value_func(ts_ob_no).view(-1) 30 | self.optimizer.zero_grad() 31 | loss = nn.functional.mse_loss(ts_V_n, ts_target_n) 32 | loss.backward() 33 | self.optimizer.step() 34 | ''' 35 | ob, next_ob, rew, done = map(lambda x: torch.Tensor(x).to(self.device), [ob_no, next_ob_no, re_n, terminal_n]) 36 | 37 | for update in range(self.num_grad_steps_per_target_update * self.num_target_updates): 38 | if update % self.num_grad_steps_per_target_update == 0: 39 | next_value = self.value_func(next_ob).squeeze() * (1 - done) 40 | target_value = rew + self.gamma * next_value 41 | 42 | self.optimizer.zero_grad() 43 | loss = nn.functional.mse_loss(self.value_func(ob).squeeze(), target_value) 44 | loss.backward() 45 | self.optimizer.step() 46 | target_value.detach_() 47 | #''' 48 | 49 | return loss 50 | -------------------------------------------------------------------------------- /hw5/cs285/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | register( 3 | id='sparse-cheetah-cs285-v1', 4 | entry_point='cs285.envs.sparse_half_cheetah:HalfCheetahEnv', 5 | max_episode_steps=1000, 6 | ) 7 | from cs285.envs.sparse_half_cheetah import HalfCheetahEnv 8 | -------------------------------------------------------------------------------- /hw5/cs285/envs/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/envs/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/envs/__pycache__/pointmass.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/envs/__pycache__/pointmass.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/envs/__pycache__/sparse_half_cheetah.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/envs/__pycache__/sparse_half_cheetah.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/envs/pointmass.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym.envs.registration import EnvSpec 3 | import imageio 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import os 7 | import seaborn as sns 8 | from tqdm import tqdm 9 | 10 | class Env(object): 11 | def __init__(self): 12 | super(Env, self).__init__() 13 | 14 | def reset(self): 15 | raise NotImplementedError 16 | 17 | def step(self, action): 18 | raise NotImplementedError 19 | 20 | def seed(self, seed): 21 | raise NotImplementedError 22 | 23 | class PointMass(Env): 24 | def __init__(self, max_episode_steps_coeff=1, scale=20, goal_padding=2.0): 25 | super(PointMass, self).__init__() 26 | # define scale such that the each square in the grid is 1 x 1 27 | self.scale = int(scale) 28 | self.grid_size = self.scale * self.scale 29 | self.observation_space = gym.spaces.Box( 
30 | low=np.array([0.0, 0.0]), 31 | high=np.array([1.0, 1.0])) 32 | self.action_space = gym.spaces.Box( 33 | low=np.array([-np.inf, -np.inf]), 34 | high=np.array([np.inf, np.inf])) 35 | self.goal_padding = goal_padding 36 | self.spec = EnvSpec(id='PointMass-v0', max_episode_steps=int(max_episode_steps_coeff*self.scale)) 37 | 38 | def reset(self): 39 | plt.close() 40 | self.state = np.array([self.goal_padding, self.goal_padding]) 41 | state = self.state/self.scale 42 | return state 43 | 44 | def step(self, action): 45 | x, y = action 46 | 47 | # next state 48 | new_x = self.state[0]+x 49 | new_y = self.state[1]+y 50 | if new_x < 0: 51 | new_x = 0 52 | if new_x > self.scale: 53 | new_x = self.scale 54 | if new_y < 0: 55 | new_y = 0 56 | if new_y > self.scale: 57 | new_y = self.scale 58 | self.state = np.array([new_x, new_y]) 59 | state = self.state/self.scale 60 | 61 | # reward 62 | reg_term = -0.01*np.sum(action**2) 63 | 64 | threshold = self.scale - self.goal_padding 65 | if new_x > threshold and new_y > threshold: 66 | reward = 10 + reg_term 67 | else: 68 | reward = 0 + reg_term 69 | 70 | # done 71 | done = False 72 | 73 | return state, reward, done, None 74 | 75 | def preprocess(self, state): 76 | scaled_state = self.scale * state 77 | x_floor, y_floor = np.floor(scaled_state) 78 | assert x_floor <= self.scale 79 | assert y_floor <= self.scale 80 | if x_floor == self.scale: 81 | x_floor -= 1 82 | if y_floor == self.scale: 83 | y_floor -= 1 84 | index = self.scale*x_floor + y_floor 85 | return index 86 | 87 | def unprocess(self, index): 88 | x_floor = index // self.scale 89 | y_floor = index % self.scale 90 | unscaled_state = np.array([x_floor, y_floor])/self.scale 91 | return unscaled_state 92 | 93 | def seed(self, seed): 94 | pass 95 | 96 | def render(self): 97 | # create a grid 98 | states = [self.state/self.scale] 99 | indices = np.array([int(self.preprocess(s)) for s in states]) 100 | a = np.zeros(self.grid_size) 101 | for i in indices: 102 | a[i] += 1 103 | max_freq = np.max(a) 104 | a/=float(max_freq) # normalize 105 | a = np.reshape(a, (self.scale, self.scale)) 106 | ax = sns.heatmap(a) 107 | plt.draw() 108 | plt.pause(0.001) 109 | plt.clf() 110 | 111 | def visualize(self, states, itr, dirname): 112 | if states is None: 113 | states = np.load(os.path.join(dirname, '{}.npy'.format(itr))) 114 | indices = np.array([int(self.preprocess(s)) for s in states]) 115 | a = np.zeros(int(self.grid_size)) 116 | for i in indices: 117 | a[i] += 1 118 | max_freq = np.max(a) 119 | a/=float(max_freq) # normalize 120 | a = np.reshape(a, (self.scale, self.scale)) 121 | ax = sns.heatmap(a) 122 | plt.savefig(os.path.join(dirname, '{}.png'.format(itr))) 123 | plt.close() 124 | 125 | def create_gif(self, dirname, density=False): 126 | images = [] 127 | if density: 128 | filenames = [x for x in os.listdir(dirname) if '_density.png' in x] 129 | sorted_fnames = sorted(filenames, key=lambda x: int(x.split('_density.png')[0])) 130 | else: 131 | filenames = [x for x in os.listdir(dirname) if ('.png' in x and 'density' not in x)] 132 | sorted_fnames = sorted(filenames, key=lambda x: int(x.split('.png')[0])) 133 | for f in sorted_fnames: 134 | images.append(imageio.imread(os.path.join(dirname, f))) 135 | imageio.mimsave(os.path.join(dirname, 'hist_exploration.gif'), images) 136 | 137 | def create_visualization(self, dirname, density=False): 138 | for s in os.listdir(dirname): 139 | for i in tqdm(range(100)): 140 | self.visualize(None, i, os.path.join(dirname, str(s))) 141 | self.create_gif(os.path.join(dirname, 
str(s))) 142 | 143 | 144 | if __name__ == "__main__": 145 | logdir = 'pm_debug' 146 | os.mkdir(logdir) 147 | num_episodes = 10 148 | num_steps_per_epoch = 20 149 | 150 | env = PointMass() 151 | for epoch in range(num_episodes): 152 | states = [] 153 | state = env.reset() 154 | for i in range(num_steps_per_epoch): 155 | action = np.random.rand(2) 156 | state, reward, done, _ = env.step(action) 157 | states.append(state) 158 | env.visualize(np.array(states), epoch, logdir) 159 | env.create_gif(logdir) 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /hw5/cs285/envs/sparse_half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 8 | utils.EzPickle.__init__(self) 9 | 10 | def step(self, action): 11 | ################################################# 12 | ctrl = False 13 | relu = False 14 | threshold = 10.0 15 | ################################################# 16 | xposbefore = self.sim.data.qpos[0] 17 | self.do_simulation(action, self.frame_skip) 18 | xposafter = self.sim.data.qpos[0] 19 | ob = self._get_obs() 20 | # reward_ctrl = - 0.1 * np.square(action).sum() 21 | # reward_run = (xposafter - xposbefore)/self.dt 22 | ################################################# 23 | if ctrl: 24 | reward_ctrl = - 0.1 * np.square(action).sum() 25 | else: 26 | reward_ctrl = 0 27 | if abs(xposafter) <= threshold: 28 | reward_run = 0.0 29 | else: 30 | if relu: 31 | reward_run = np.sign(xposafter)*(xposafter - xposbefore)/self.dt 32 | else: 33 | reward_run = 1.0 34 | ################################################# 35 | reward = reward_ctrl + reward_run 36 | done = False 37 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 38 | 39 | def _get_obs(self): 40 | return np.concatenate([ 41 | self.sim.data.qpos.flat[1:], 42 | self.sim.data.qvel.flat, 43 | ]) 44 | 45 | def reset_model(self): 46 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 47 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 48 | self.set_state(qpos, qvel) 49 | return self._get_obs() 50 | 51 | def viewer_setup(self): 52 | self.viewer.cam.distance = self.model.stat.extent * 0.5 53 | -------------------------------------------------------------------------------- /hw5/cs285/exploration/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/exploration/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/exploration/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/exploration/__pycache__/density_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/exploration/__pycache__/density_model.cpython-37.pyc 
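A note on the exploration side of this assignment: PointMass.preprocess above already discretizes a normalized state into one of scale*scale grid cells, which is the kind of binning a simple count-based ("hist") bonus is built on. The sketch below is illustrative only -- the graded versions belong in exploration/density_model.py and exploration/exploration.py, the helper name hist_bonus is made up here, and it assumes the plotting dependencies imported by pointmass.py (matplotlib, seaborn, imageio, tqdm) are installed.

import numpy as np
from cs285.envs.pointmass import PointMass

env = PointMass(scale=20)
counts = np.zeros(env.grid_size)              # one visitation counter per grid cell

def hist_bonus(state, bonus_coeff=0.01):
    idx = int(env.preprocess(state))          # bin the normalized state into a cell index
    counts[idx] += 1
    p_hat = counts[idx] / counts.sum()        # empirical visitation probability of that cell
    return -bonus_coeff * np.log(p_hat)       # rarely visited cells receive a larger bonus

state = env.reset()
for _ in range(100):
    state, reward, done, _ = env.step(np.random.rand(2))
    reward += hist_bonus(state)               # exploration-augmented reward the agent would train on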
-------------------------------------------------------------------------------- /hw5/cs285/exploration/__pycache__/exploration.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/exploration/__pycache__/exploration.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/logger.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/logger.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/replay.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/replay.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/replay_buffer.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/rl_trainer.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/infrastructure/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/logger.py: -------------------------------------------------------------------------------- 1 | import os 2 | from tensorboardX import SummaryWriter 3 | import numpy as np 4 | 5 | class Logger: 6 | def __init__(self, log_dir, n_logged_samples=10, summary_writer=None): 7 | self._log_dir = log_dir 8 | print('########################') 9 | print('logging outputs to ', log_dir) 10 | 
print('########################') 11 | self._n_logged_samples = n_logged_samples 12 | self._summ_writer = SummaryWriter(log_dir, flush_secs=1, max_queue=1) 13 | 14 | def log_scalar(self, scalar, name, step_): 15 | self._summ_writer.add_scalar('{}'.format(name), scalar, step_) 16 | 17 | def log_scalars(self, scalar_dict, group_name, step, phase): 18 | """Will log all scalars in the same plot.""" 19 | self._summ_writer.add_scalars('{}_{}'.format(group_name, phase), scalar_dict, step) 20 | 21 | def log_image(self, image, name, step): 22 | assert(len(image.shape) == 3) # [C, H, W] 23 | self._summ_writer.add_image('{}'.format(name), image, step) 24 | 25 | def log_video(self, video_frames, name, step, fps=10): 26 | assert len(video_frames.shape) == 5, "Need [N, T, C, H, W] input tensor for video logging!" 27 | self._summ_writer.add_video('{}'.format(name), video_frames, step, fps=fps) 28 | 29 | def log_paths_as_videos(self, paths, step, max_videos_to_save=2, fps=10, video_title='video'): 30 | 31 | # reshape the rollouts 32 | videos = [np.transpose(p['image_obs'], [0, 3, 1, 2]) for p in paths] 33 | 34 | # max rollout length 35 | max_videos_to_save = np.min([max_videos_to_save, len(videos)]) 36 | max_length = videos[0].shape[0] 37 | for i in range(max_videos_to_save): 38 | if videos[i].shape[0]>max_length: 39 | max_length = videos[i].shape[0] 40 | 41 | # pad rollouts to all be same length 42 | for i in range(max_videos_to_save): 43 | if videos[i].shape[0]<max_length: 44 | padding = np.tile([videos[i][-1]], (max_length-videos[i].shape[0],1,1,1)) 45 | videos[i] = np.concatenate([videos[i], padding], 0) 46 | 47 | # log videos to tensorboard event file 48 | videos = np.stack(videos[:max_videos_to_save], 0) 49 | self.log_video(videos, video_title, step, fps=fps) 50 | 51 | def log_figures(self, figure, name, step, phase): 52 | """figure: matplotlib.pyplot figure handle""" 53 | assert figure.shape[0] > 0, "Figure logging requires input shape [batch x figures]!" 54 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 55 | 56 | def log_figure(self, figure, name, step, phase): 57 | """figure: matplotlib.pyplot figure handle""" 58 | self._summ_writer.add_figure('{}_{}'.format(name, phase), figure, step) 59 | 60 | def log_graph(self, array, name, step, phase): 61 | """figure: matplotlib.pyplot figure handle""" 62 | im = plot_graph(array) 63 | self._summ_writer.add_image('{}_{}'.format(name, phase), im, step) 64 | 65 | def dump_scalars(self, log_path=None): 66 | log_path = os.path.join(self._log_dir, "scalar_data.json") if log_path is None else log_path 67 | self._summ_writer.export_scalars_to_json(log_path) 68 | 69 | def flush(self): 70 | self._summ_writer.flush() 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import copy 3 | from cs285.infrastructure.utils import * 4 | 5 | class ReplayBuffer(object): 6 | 7 | def __init__(self, max_size=1000000): 8 | 9 | self.max_size = max_size 10 | self.paths = [] 11 | self.obs = None 12 | self.acs = None 13 | self.concatenated_rews = None 14 | self.unconcatenated_rews = None 15 | self.next_obs = None 16 | self.terminals = None 17 | 18 | def add_rollouts(self, paths, noised = False): 19 | 20 | # add new rollouts into our list of rollouts 21 | for path in paths: 22 | self.paths.append(path) 23 | 24 | # convert new rollouts into their component arrays, and append them onto our arrays 25 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(paths) 26 | 27 | if noised: 28 | observations = add_noise(observations) 29 | next_observations = add_noise(next_observations) 30 | 31 | if self.obs is None: 32 | self.obs = observations[-self.max_size:] 33 | self.acs = actions[-self.max_size:] 34 | self.next_obs =
next_observations[-self.max_size:] 35 | self.terminals = terminals[-self.max_size:] 36 | self.concatenated_rews = concatenated_rews[-self.max_size:] 37 | self.unconcatenated_rews = unconcatenated_rews[-self.max_size:] 38 | else: 39 | self.obs = np.concatenate([self.obs, observations])[-self.max_size:] 40 | self.acs = np.concatenate([self.acs, actions])[-self.max_size:] 41 | self.next_obs = np.concatenate([self.next_obs, next_observations])[-self.max_size:] 42 | self.terminals = np.concatenate([self.terminals, terminals])[-self.max_size:] 43 | self.concatenated_rews = np.concatenate([self.concatenated_rews, concatenated_rews])[-self.max_size:] 44 | if isinstance(unconcatenated_rews, list): 45 | self.unconcatenated_rews += unconcatenated_rews 46 | else: 47 | self.unconcatenated_rews.append(unconcatenated_rews) 48 | 49 | ######################################## 50 | ######################################## 51 | 52 | def sample_random_rollouts(self, num_rollouts): 53 | rand_indices = np.random.permutation(len(self.paths))[:num_rollouts] 54 | return self.paths[rand_indices] 55 | 56 | def sample_recent_rollouts(self, num_rollouts=1): 57 | return self.paths[-num_rollouts:] 58 | 59 | def get_all_obs(self): 60 | return copy.deepcopy(self.obs) 61 | 62 | ######################################## 63 | ######################################## 64 | 65 | def sample_random_data(self, batch_size): 66 | 67 | assert self.obs.shape[0] == self.acs.shape[0] == self.concatenated_rews.shape[0] == self.next_obs.shape[0] == self.terminals.shape[0] 68 | rand_indices = np.random.permutation(self.obs.shape[0])[:batch_size] 69 | return self.obs[rand_indices], self.acs[rand_indices], self.concatenated_rews[rand_indices], self.next_obs[rand_indices], self.terminals[rand_indices] 70 | 71 | def sample_recent_data(self, batch_size=1, concat_rew=True): 72 | 73 | if concat_rew: 74 | return self.obs[-batch_size:], self.acs[-batch_size:], self.concatenated_rews[-batch_size:], self.next_obs[-batch_size:], self.terminals[-batch_size:] 75 | else: 76 | num_recent_rollouts_to_return = 0 77 | num_datapoints_so_far = 0 78 | index = -1 79 | while num_datapoints_so_far < batch_size: 80 | recent_rollout = self.paths[index] 81 | index -=1 82 | num_recent_rollouts_to_return +=1 83 | num_datapoints_so_far += get_pathlength(recent_rollout) 84 | rollouts_to_return = self.paths[-num_recent_rollouts_to_return:] 85 | observations, actions, next_observations, terminals, concatenated_rews, unconcatenated_rews = convert_listofrollouts(rollouts_to_return) 86 | return observations, actions, unconcatenated_rews, next_observations, terminals 87 | 88 | def __len__(self): 89 | return len(self.obs) 90 | -------------------------------------------------------------------------------- /hw5/cs285/infrastructure/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import torch 4 | from torch import nn 5 | 6 | class MLP(nn.Module): 7 | def __init__(self, 8 | input_dim, 9 | output_dim, 10 | n_layers, 11 | size, 12 | device, 13 | discrete, 14 | activation = nn.Tanh()): 15 | super().__init__() 16 | 17 | self.discrete = discrete 18 | 19 | # network architecture 20 | self.mlp = nn.ModuleList() 21 | self.mlp.append(nn.Linear(input_dim, size)) #first hidden layer 22 | self.mlp.append(activation) 23 | 24 | for h in range(n_layers - 1): #additional hidden layers 25 | self.mlp.append(nn.Linear(size, size)) 26 | self.mlp.append(activation) 27 | 28 | self.mlp.append(nn.Linear(size, 
output_dim)) #output layer, no activation function 29 | 30 | #if continuous define logstd variable 31 | if not self.discrete: 32 | self.logstd = nn.Parameter(torch.zeros(output_dim)) 33 | 34 | self.to(device) 35 | 36 | def forward(self, x): 37 | for layer in self.mlp: 38 | x = layer(x) 39 | if self.discrete: 40 | return x 41 | else: 42 | return (x, self.logstd.exp()) 43 | 44 | def save(self, filepath): 45 | torch.save(self.state_dict(), filepath) 46 | 47 | def restore(self, filepath): 48 | self.load_state_dict(torch.load(filepath)) 49 | 50 | def sample_trajectories(env, policy, min_timesteps_per_batch, max_path_length, animate, itr): 51 | # Collect paths until we have enough timesteps 52 | timesteps_this_batch = 0 53 | paths = [] 54 | while True: 55 | #animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) 56 | animate_this_episode = (len(paths) == 0 and animate) 57 | path = sample_trajectory(env, policy, max_path_length, animate_this_episode) 58 | paths.append(path) 59 | timesteps_this_batch += get_pathlength(path) 60 | if timesteps_this_batch > min_timesteps_per_batch: 61 | break 62 | return paths, timesteps_this_batch 63 | 64 | def sample_trajectory(env, policy, max_path_length, animate_this_episode): 65 | ob = env.reset() 66 | obs, acs, rewards, next_obs, terminals = [], [], [], [], [] 67 | steps = 0 68 | while True: 69 | if animate_this_episode: 70 | env.render() 71 | time.sleep(0.1) 72 | 73 | obs.append(ob) 74 | ac = policy.get_action(ob) 75 | acs.append(ac) 76 | 77 | ob, rew, done, _ = env.step(ac) 78 | 79 | steps += 1 80 | next_obs.append(ob) 81 | rewards.append(rew) 82 | 83 | if done or steps > max_path_length: 84 | terminals.append(1) 85 | break 86 | else: 87 | terminals.append(0) 88 | 89 | path = {"observation" : np.array(obs, dtype=np.float32), 90 | "reward" : np.array(rewards, dtype=np.float32), 91 | "action" : np.array(acs, dtype=np.float32), 92 | "next_observation": np.array(next_obs, dtype=np.float32), 93 | "terminal": np.array(terminals, dtype=np.float32)} 94 | 95 | return path 96 | 97 | def convert_listofrollouts(paths): 98 | """ 99 | Take a list of rollout dictionaries 100 | and return separate arrays, 101 | where each array is a concatenation of that array from across the rollouts 102 | """ 103 | observations = np.concatenate([path["observation"] for path in paths]) 104 | actions = np.concatenate([path["action"] for path in paths]) 105 | next_observations = np.concatenate([path["next_observation"] for path in paths]) 106 | terminals = np.concatenate([path["terminal"] for path in paths]) 107 | concatenated_rewards = np.concatenate([path["reward"] for path in paths]) 108 | unconcatenated_rewards = [path["reward"] for path in paths] 109 | return observations, actions, next_observations, terminals, concatenated_rewards, unconcatenated_rewards 110 | 111 | def get_pathlength(path): 112 | return len(path["reward"]) 113 | -------------------------------------------------------------------------------- /hw5/cs285/policies/MLP_policy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | from cs285.infrastructure.utils import MLP 5 | 6 | class MLPPolicy: 7 | def __init__(self, 8 | ac_dim, 9 | ob_dim, 10 | n_layers, 11 | size, 12 | device, 13 | learning_rate, 14 | training=True, 15 | discrete=False, 16 | nn_baseline=False, 17 | **kwargs): 18 | super().__init__() 19 | 20 | # init vars 21 | self.device = device 22 | self.discrete = discrete 23 | self.training = 
training 24 | self.nn_baseline = nn_baseline 25 | 26 | # network architecture 27 | self.policy_mlp = MLP(ob_dim, ac_dim, n_layers, size, device, discrete) 28 | params = list(self.policy_mlp.parameters()) 29 | if self.nn_baseline: 30 | self.baseline_mlp = MLP(ob_dim, 1, n_layers, size, device, True) 31 | params += list(self.baseline_mlp.parameters()) 32 | 33 | #optimizer 34 | if self.training: 35 | self.optimizer = torch.optim.Adam(params, lr = learning_rate) 36 | 37 | ################################## 38 | 39 | # update/train this policy 40 | def update(self, observations, actions): 41 | raise NotImplementedError 42 | 43 | # query the neural net that's our 'policy' function, as defined by an mlp above 44 | # query the policy with observation(s) to get selected action(s) 45 | def get_action(self, obs): 46 | output = self.policy_mlp(torch.Tensor(obs).to(self.device)) 47 | if self.discrete: 48 | action_probs = nn.functional.log_softmax(output).exp() 49 | return torch.multinomial(action_probs, num_samples = 1).cpu().detach().numpy()[0] 50 | else: 51 | return torch.normal(output[0], output[1]).cpu().detach().numpy() 52 | 53 | def get_log_prob(self, network_outputs, actions_taken): 54 | actions_taken = torch.Tensor(actions_taken).to(self.device) 55 | if self.discrete: 56 | network_outputs = nn.functional.log_softmax(network_outputs).exp() 57 | return torch.distributions.Categorical(network_outputs).log_prob(actions_taken) 58 | else: 59 | return torch.distributions.Normal(network_outputs[0], network_outputs[1]).log_prob(actions_taken).sum(-1) 60 | 61 | ##################################################### 62 | ##################################################### 63 | 64 | class MLPPolicyPG(MLPPolicy): 65 | 66 | def update(self, observations, acs_na, adv_n = None, acs_labels_na = None, qvals = None): 67 | policy_output = self.policy_mlp(torch.Tensor(observations).to(self.device)) 68 | logprob_pi = self.get_log_prob(policy_output, acs_na) 69 | 70 | self.optimizer.zero_grad() 71 | 72 | loss = torch.sum((-logprob_pi * torch.Tensor(adv_n).to(self.device))) 73 | loss.backward() 74 | 75 | if self.nn_baseline: 76 | baseline_prediction = self.baseline_mlp(torch.Tensor(observations).to(self.device)).view(-1) 77 | baseline_target = torch.Tensor((qvals - qvals.mean()) / (qvals.std() + 1e-8)).to(self.device) 78 | baseline_loss = nn.functional.mse_loss(baseline_prediction, baseline_target) 79 | baseline_loss.backward() 80 | 81 | self.optimizer.step() 82 | 83 | return loss 84 | 85 | ##################################################### 86 | ##################################################### 87 | 88 | class MLPPolicyAC(MLPPolicyPG): 89 | """ MLP policy required for actor-critic. 90 | 91 | Note: Your code for this class could in fact be the same as MLPPolicyPG, except that the neural net baseline 92 | would not be required (i.e. self.nn_baseline would always be False). It is separated here only 93 | to avoid any unintended errors.
94 | """ 95 | def __init__(self, *args, **kwargs): 96 | if 'nn_baseline' in kwargs.keys(): 97 | assert kwargs['nn_baseline'] == False, "MLPPolicyAC should not use the nn_baseline flag" 98 | super().__init__(*args, **kwargs) 99 | -------------------------------------------------------------------------------- /hw5/cs285/policies/__init__.py: -------------------------------------------------------------------------------- 1 | #init for making the folder a package 2 | -------------------------------------------------------------------------------- /hw5/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/policies/__pycache__/MLP_policy.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285/policies/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285/policies/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /hw5/cs285_hw5.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdeib/berkeley-deep-RL-pytorch-starter/8c5f7095b4f3717ee286e0f5dc9795a6cac55a66/hw5/cs285_hw5.pdf -------------------------------------------------------------------------------- /hw5/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | mujoco-py==1.50.1.56 3 | numpy 4 | seaborn 5 | tqdm -------------------------------------------------------------------------------- /hw5/setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='cs285', 6 | version='0.1.0', 7 | packages=['cs285'], 8 | ) --------------------------------------------------------------------------------