├── .gitignore ├── README.md ├── hw1 ├── DAgger.py ├── Readme.md ├── behavior_cloning.py ├── experts │ ├── Ant-v2.pkl │ ├── HalfCheetah-v2.pkl │ ├── Hopper-v2.pkl │ ├── Humanoid-v2.pkl │ ├── Reacher-v2.pkl │ └── Walker2d-v2.pkl ├── hw1.bash ├── load_policy.py ├── plot.py ├── run_expert.py └── tf_util.py ├── hw2 ├── README.md ├── data_HalfCheetah_8 │ ├── hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_InvertedPendulum │ └── hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_large │ ├── lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── lib_rtg_na_CartPole-v0_18-09-2018_00-58-19 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_lunar │ └── ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_small │ ├── sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── sb_rtg_na_CartPole-v0_18-09-2018_00-36-45 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── hw2.bash ├── logz.py ├── 
lunar_lander.py ├── plot.py └── train_pg_f18.py ├── hw3 ├── DDQN_Pong.pkl ├── DQNAtari_Ponglr_multi0.1.pkl ├── DQNAtari_Ponglr_multi10.0.pkl ├── DQNAtari_Ponglr_multi5.0.pkl ├── DQN_Pong.pkl ├── Deep_RL_Assignment_3__Q_Learning_and_Actor_Critic.pdf ├── README.md ├── atari_wrappers.py ├── data:pkl │ ├── 1a898ddf-2704-4168-b92f-beca2086c5ffAtari_DDQN.pkl.pkl │ ├── 3804ab6d-065b-4f94-aa54-ba957272c6b9Lander.pkl │ ├── 43109373-50a0-47a8-b483-17921386ed82Lander.pkl │ ├── 518f88f0-7ffa-47ae-b705-365b31717729Lander.pkl │ ├── 63926721-2624-40a7-b029-cee54d11097aLander.pkl │ ├── 8425b8e8-19c8-418e-91c2-8131d6e72849Lander_vanilla.pkl │ ├── 9e01eaef-6082-423a-9ff2-66798a5d1942Lander.pkl │ ├── Atari_DDQN.pkl │ ├── DDQN-Lunar-test1.pkl │ ├── DDQNFalseLander.pkl │ ├── DDQNFalseLander_1e4.pkl │ ├── DDQNFalseLander_lr2e3.pkl │ ├── DDQNFalseLander_lr3e3.pkl │ ├── DDQNTrueLander.pkl │ ├── DQN-Atari-Pong.pkl │ ├── DQN-Lunar-2 │ ├── DQN-Pong.pkl │ ├── b7445890-58aa-4fea-9628-bc1f08fdde62Lander.pkl │ └── ba946a9b-c079-4ab6-b343-d1bccfc75be6Lander_DQN.pkl ├── data_CartPole │ ├── .DS_Store │ ├── ac_100_1_CartPole-v0_02-10-2018_17-05-47 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── ac_10_10_CartPole-v0_02-10-2018_17-09-03 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── ac_1_100_CartPole-v0_02-10-2018_17-07-35 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── ac_1_1_CartPole-v0_02-10-2018_09-37-30 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_HalfCheetah │ ├── .DS_Store │ └── ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_InvertedPendulum │ └── ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── dqn.py ├── dqn_utils.py ├── figures │ ├── p1q1.png │ ├── p1q2.png │ ├── p1q3.png │ ├── p2q1.png │ ├── p2q2_1.png │ └── p2q2_2.png ├── hw3.pdf ├── logz.py ├── lunar_lander.py ├── p1q1.py ├── p1q2.py ├── p1q3.py ├── plot.py ├── plot_q_learning.ipynb ├── requirements.txt ├── run_dqn_atari.py ├── run_dqn_lander.py ├── run_dqn_ram.py └── train_ac_f18.py ├── hw4 ├── Deep_RL_Assignment_4__Model_Based_RL.pdf ├── Readme.md ├── half_cheetah_env.py ├── logger.py ├── main.py ├── model_based_policy.py ├── model_based_rl.py ├── plot.py ├── requirements.txt ├── run_all.sh ├── tabulate.py ├── timer.py └── utils.py └── hw5 ├── exp ├── README.md ├── density_model.py ├── ex_utils.py ├── exploration.py ├── hw5a.pdf ├── logz.py ├── plot.py ├── pointmass.py ├── replay.py ├── requirements.txt ├── run_all.sh ├── sparse_half_cheetah.py └── train_ac_exploration_f18.py ├── meta ├── 
Deep_RL_Assignment_5__Meta_Reinforcement_Learning.pdf ├── README.md ├── data │ ├── mlp_1_pm_13-11-2018_20-57-59 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── mlp_30_pm_13-11-2018_20-48-55 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── mlp_50_pm_14-11-2018_20-05-53 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── mlp_60_pm_13-11-2018_23-01-39 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro1_pm-obs_13-11-2018_01-08-37 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro3_rnn_60_g_1_pm_15-11-2018_01-30-55 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro3_rnn_60_g_2_pm_14-11-2018_16-22-59 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro3_rnn_60_g_4_pm_15-11-2018_01-34-18 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── rnn_1_pm_13-11-2018_21-05-16 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── rnn_30_pm_13-11-2018_19-34-21 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── rnn_50_pm_14-11-2018_10-34-08 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── rnn_60_pm_13-11-2018_17-27-20 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro1 │ ├── pro1_pm-obs_13-11-2018_01-08-37 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── prob1.png ├── data_pro2_1 │ ├── mlp_1_pm_13-11-2018_20-57-59 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro2_1.png │ └── rnn_1_pm_13-11-2018_21-05-16 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro2_30 │ ├── mlp_30_pm_13-11-2018_20-48-55 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── prob_30.png │ └── rnn_30_pm_13-11-2018_19-34-21 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro2_50 │ ├── mlp_50_pm_14-11-2018_20-05-53 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro2_50.png │ └── rnn_50_pm_14-11-2018_10-34-08 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro2_60 │ ├── mlp_60_pm_13-11-2018_23-01-39 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro2_60.png │ └── rnn_60_pm_13-11-2018_17-27-20 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro3 │ ├── pro3_rnn_60_g_1_pm_15-11-2018_01-30-55 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── g_1.png │ │ ├── g_1_avg.png │ │ └── g_1_val.png │ ├── pro3_rnn_60_g_2_pm_14-11-2018_16-22-59 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── g_2.png │ │ ├── g_2_avg.png │ │ └── g_2_val.png │ └── pro3_rnn_60_g_4_pm_15-11-2018_01-34-18 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── g_4.png │ │ ├── g_4_avg.png │ │ └── g_4_val.png ├── logz.py ├── plot.py ├── point_mass.py ├── point_mass_observed.py ├── replay_buffer.py ├── requirements.txt └── train_policy.py └── sac ├── README.md ├── environment.yml ├── logz.py ├── nn.py ├── plot.py ├── sac.py ├── train_mujoco.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | 4 | */.DS_Store 5 | 6 | __pycache__ 7 | 8 | Thumbs.db 9 | 10 | .ipynb_checkpoints/ 11 | 12 | .gitignore 13 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # CS294-112-Deep-Reinforcement-Learning 2 | 3 | 4 | -- 5 | 6 | **- These are my assignments and projects for the CS294-112 Deep Reinforcement Learning course at UC Berkeley in Fall 2018** 7 | 8 | 9 | **- For assignment details, please step into the specific homework folders.** 10 | 11 | **- The course website is [CS294-112](http://rail.eecs.berkeley.edu/deeprlcourse/)** 12 | -------------------------------------------------------------------------------- /hw1/Readme.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | --- 4 | 5 | ### Run the bash script 6 | ###### `./hw1.bash` 7 | ### to get all results of hw1 8 | 9 | 10 | 11 | 12 | --- 13 | ##### The following steps are detailed guidance for sections 2 and 3 of hw1 14 | In order to run this assignment, first you need to make a folder named **expert_data**, which stores the output data from the expert policy 15 | 16 | `mkdir expert_data` 17 | 18 | 1. Load up the expert policy and generate roll-out data
19 | Run `python run_expert.py experts/task.pkl task --render --num_rollouts [num]` to run the expert policy
20 | E.g. 21 | `python run_expert.py experts/Hopper-v2.pkl Hopper-v2 --render --num_rollouts 20` for the Hopper task 22 | `python run_expert.py experts/Reacher-v2.pkl Reacher-v2 --render --num_rollouts 400` for the Reacher task 23 | 24 | 2. Implement behavior cloning
25 | Run `python behavior_cloning.py experts/task.pkl task --render --num_rollouts [num]` to implement BC
26 | E.g. 27 | `python behavior_cloning.py experts/Hopper-v2.pkl Hopper-v2 --render --num_rollouts 20` for the Hopper task 28 | `python behavior_cloning.py experts/Reacher-v2.pkl Reacher-v2 --render --num_rollouts 400` for the Reacher task
29 | This command generates a `.pkl` file that saves the mean and standard deviation of the reward as the number of training epochs increases 30 | 31 | 3. Implement DAgger
32 | Run `python DAgger.py experts/Hopper-v2.pkl Hopper-v2 --render --num_rollouts 20` for the Hopper task
33 | This command also generates a `.pkl` file that saves the mean and standard deviation of the reward across DAgger iterations. 34 | 35 | 4. Plot
36 | With the `.pkl` files generated from step2 and step3, run 37 | `python plot.py Hopper-v2 --num_rollouts 20` to generate figures for behavior cloning and DAgger 38 | 39 | -------------------------------------------------------------------------------- /hw1/behavior_cloning.py: -------------------------------------------------------------------------------- 1 | # Implement behavior cloning 2 | 3 | 4 | import tensorflow as tf 5 | import pickle 6 | import numpy as np 7 | import tf_util 8 | import argparse 9 | import load_policy 10 | import gym 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.utils import shuffle 13 | 14 | 15 | # Parameters 16 | 17 | learning_rate = 0.001 18 | num_epoch = 100 19 | batch_size = 128 20 | 21 | # Network Parameters 22 | 23 | num_hid_1 = 128 24 | num_hid_2 = 128 25 | 26 | 27 | #Load training data from expert demonstrations generated by run_expert.py 28 | def load_expert_data (filename): 29 | with open (filename, 'rb') as f: 30 | data = pickle.loads(f.read()) 31 | return data 32 | 33 | def data_preprocessing(x, y): 34 | 35 | x, y = shuffle(x, y, random_state=0) 36 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3) 37 | y_train = y_train.reshape(y_train.shape[0], y_train.shape[2]) 38 | y_test = y_test.reshape(y_test.shape[0], y_test.shape[2]) 39 | 40 | return x_train, x_test, y_train, y_test 41 | 42 | def next_batch(batch_size, x, y): 43 | 44 | indices = np.random.randint(low = 0, high = len(x), size = batch_size) 45 | input_batch = x[indices] 46 | label_batch = y[indices] 47 | 48 | return input_batch, label_batch 49 | 50 | def network_model(num_obs, num_act): 51 | 52 | x = tf.placeholder(tf.float32, shape = [None, num_obs], name = 'x') 53 | y = tf.placeholder(tf.float32, shape = [None, num_act], name = 'y') 54 | layer_1 = tf.layers.dense(x, num_hid_1, activation = tf.nn.relu, use_bias=True) 55 | layer_2 = tf.layers.dense(layer_1, num_hid_2, activation = tf.nn.relu, use_bias = True) 56 | output = tf.layers.dense(layer_2, num_act, activation = None, use_bias = True) 57 | 58 | return output, x, y 59 | 60 | def train_network(output, y): 61 | loss = tf.losses.mean_squared_error(output, y) 62 | train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) 63 | 64 | return loss, train_op 65 | 66 | 67 | 68 | def main(): 69 | parser = argparse.ArgumentParser(); 70 | parser.add_argument('expert_policy_file', type=str) 71 | parser.add_argument('envname', type=str) 72 | parser.add_argument('--render', action='store_true') 73 | parser.add_argument("--max_timesteps", type=int) 74 | parser.add_argument('--num_rollouts', type=int, default=20, 75 | help='Number of expert roll outs') 76 | args = parser.parse_args() 77 | 78 | task = args.envname 79 | dataset = 'expert_data/' + args.envname + '_' + str(args.num_rollouts) + '_data.pkl' 80 | 81 | 82 | #Load training data 83 | data = load_expert_data(dataset) 84 | observations = np.array(data['observations']) 85 | actions = np.array(data['actions']) 86 | num_obs = observations.shape[1] 87 | num_act = actions.shape[2] 88 | 89 | obs_train, obs_test, act_train, act_test = data_preprocessing(observations, actions) 90 | 91 | output, x, y = network_model(num_obs, num_act) 92 | 93 | lossfunction, train_op = train_network(output, y) 94 | 95 | tf.add_to_collection('pred_network', output) 96 | 97 | mean_reward = [] 98 | std_reward = [] 99 | 100 | 101 | # Train 102 | init = tf.global_variables_initializer() 103 | #model_path = './bc_policy/' + task + '_' + str(args.num_rollouts) + '_bc' 104 
| #builder = tf.saved_model.builder.SavedModelBuilder(model_path) 105 | with tf.Session() as sess: 106 | sess.run(init) 107 | 108 | for epoch in range(num_epoch + 1): 109 | 110 | num_batch = int(len(obs_train) / batch_size) 111 | 112 | for num in range(num_batch): 113 | 114 | obs_train_batch, act_train_batch = next_batch(batch_size, obs_train, act_train) 115 | 116 | sess.run(train_op, feed_dict = {x: obs_train_batch, y: act_train_batch}) 117 | 118 | if epoch % 10 == 0: 119 | 120 | loss = sess.run(lossfunction, feed_dict = {x: obs_train, y: act_train}) 121 | 122 | print("Number of Epoch: %d, Training Loss = %.08f "%(epoch, loss)) 123 | 124 | test_output = sess.run(output, feed_dict = {x: obs_test}) 125 | 126 | testloss = np.mean((test_output - act_test)**2) 127 | 128 | print ("Testing loss = %.08f" % testloss) 129 | 130 | env = gym.make(args.envname) 131 | max_steps = args.max_timesteps or env.spec.timestep_limit 132 | 133 | returns = [] 134 | observations = [] 135 | actions = [] 136 | for i in range(args.num_rollouts): 137 | print('iter', i) 138 | obs = env.reset() 139 | done = False 140 | totalr = 0. 141 | steps = 0 142 | while not done: 143 | 144 | pre_action = sess.run(output, feed_dict = {x:obs[None,:]}) 145 | observations.append(obs) 146 | actions.append(pre_action) 147 | obs, r, done, _ = env.step(pre_action) 148 | totalr += r 149 | steps += 1 150 | if args.render: 151 | env.render() 152 | #if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 153 | if steps >= max_steps: 154 | break 155 | returns.append(totalr) 156 | 157 | print('returns', returns) 158 | print('mean return', np.mean(returns)) 159 | print('std of return', np.std(returns)) 160 | mean_reward.append(np.mean(returns)) 161 | std_reward.append(np.std(returns)) 162 | 163 | #builder.add_meta_graph_and_variables(sess, ['Training']) 164 | #builder.save 165 | BC_result = {'mean_reward': np.array(mean_reward), 166 | 'std_reward': np.array(std_reward)} 167 | 168 | 169 | outfilename = './' + args.envname + '_' + str(args.num_rollouts) + '_bc_data.pkl' 170 | 171 | with open((outfilename), 'wb') as f: 172 | pickle.dump(BC_result, f, pickle.HIGHEST_PROTOCOL) 173 | 174 | 175 | 176 | 177 | 178 | if __name__ == '__main__': 179 | main() 180 | 181 | -------------------------------------------------------------------------------- /hw1/experts/Ant-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Ant-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/HalfCheetah-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Hopper-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v2.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Humanoid-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Reacher-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Walker2d-v2.pkl -------------------------------------------------------------------------------- /hw1/hw1.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | if [ ! -d "expert_data" ]; then 4 | mkdir expert_data 5 | fi 6 | 7 | python run_expert.py experts/Hopper-v2.pkl Hopper-v2 --num_rollouts 20 8 | python run_expert.py experts/Reacher-v2.pkl Reacher-v2 --num_rollouts 400 9 | python behavior_cloning.py experts/Hopper-v2.pkl Hopper-v2 --num_rollouts 20 10 | python behavior_cloning.py experts/Reacher-v2.pkl Reacher-v2 --num_rollouts 400 11 | python DAgger.py experts/Hopper-v2.pkl Hopper-v2 --num_rollouts 20 12 | python plot.py Hopper-v2 --num_rollouts 20 -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 
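# The Standardizer entry stores the expert's running mean E[x] and mean of squares E[x^2];
# the code below recovers the std as sqrt(max(0, E[x^2] - E[x]^2)) and whitens observations
# as (obs - mean) / (std + 1e-6) before feeding them to the hidden layers.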
33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/plot.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | def load_expert_data (filename): 6 | with open (filename, 'rb') as f: 7 | data = pickle.loads(f.read()) 8 | return data 9 | 10 | 11 | def main(): 12 | 13 | 14 | #Behavior cloning result with the number of epoch 15 | 16 | BC_path = 'Hopper-v2_20_bc_data.pkl' 17 | BC_result = load_expert_data(BC_path) 18 | BC_mean = np.array(BC_result['mean_reward']) 19 | BC_std = np.array(BC_result['std_reward']) 20 | epoch = np.arange(0, 101, 10) 21 | BC_plot = plt.figure(1) 22 | p1, = plt.plot(epoch, BC_mean, color='blue', label='Behavor_cloning' ) 23 | plt.errorbar(epoch, BC_mean, ecolor='r', color='blue', yerr = BC_std, fmt = '-o', elinewidth=2, capsize=4) 24 | plt.suptitle('Behavorial Cloning: Epoches vs. Reward', fontsize=20) 25 | plt.xlabel('Number of Training Epoches') 26 | plt.ylabel('Mean Reward') 27 | plt.legend() 28 | plt.show() 29 | 30 | 31 | 32 | 33 | DAgger_path = './Hopper-v2_20_data.pkl' 34 | DAgger_result = load_expert_data(DAgger_path) 35 | mean = np.array(DAgger_result['mean_reward']) 36 | std = np.array(DAgger_result['std_reward']) 37 | iteration = np.arange(std.shape[0]) 38 | iteration = iteration + 1; 39 | 40 | 41 | DAgger_plot = plt.figure(2) 42 | Dag, = plt.plot(iteration, mean, marker = '*', color='b', label='DAgger Policy') 43 | plt.errorbar(iteration, mean, yerr = std, fmt = '-*',color='b',ecolor='r' , elinewidth=2, capsize=4) 44 | plt.suptitle('DAgger Iterations vs. 
Reward', fontsize=20) 45 | plt.xlabel('DAgger Iteration') 46 | plt.ylabel('Mean Reward') 47 | plt.xlim([0, 6.5]) 48 | plt.ylim([1000, 4000]) 49 | expert = plt.axhline(y=3778.4842779089204, color='k', label='Expert Policy') 50 | bc = plt.axhline(y=2009.9990, color='g', label='Behaviorial Cloning') 51 | plt.legend(loc= 4) 52 | plt.show() 53 | 54 | 55 | 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import os 13 | import pickle 14 | import tensorflow as tf 15 | import numpy as np 16 | import tf_util 17 | import gym 18 | import load_policy 19 | 20 | def main(): 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('expert_policy_file', type=str) 24 | parser.add_argument('envname', type=str) 25 | parser.add_argument('--render', action='store_true') 26 | parser.add_argument("--max_timesteps", type=int) 27 | parser.add_argument('--num_rollouts', type=int, default=20, 28 | help='Number of expert roll outs') 29 | args = parser.parse_args() 30 | 31 | print('loading and building expert policy') 32 | policy_fn = load_policy.load_policy(args.expert_policy_file) 33 | print('loaded and built') 34 | 35 | with tf.Session(): 36 | tf_util.initialize() 37 | 38 | import gym 39 | env = gym.make(args.envname) 40 | max_steps = args.max_timesteps or env.spec.timestep_limit 41 | 42 | returns = [] 43 | observations = [] 44 | actions = [] 45 | for i in range(args.num_rollouts): 46 | print('steps', max_steps) 47 | print('iter', i) 48 | obs = env.reset() 49 | done = False 50 | totalr = 0. 51 | steps = 0 52 | while not done: 53 | action = policy_fn(obs[None,:]) 54 | observations.append(obs) 55 | actions.append(action) 56 | obs, r, done, _ = env.step(action) 57 | totalr += r 58 | steps += 1 59 | if args.render: 60 | env.render() 61 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 62 | if steps >= max_steps: 63 | break 64 | returns.append(totalr) 65 | 66 | print('returns', returns) 67 | print('mean return', np.mean(returns)) 68 | print('std of return', np.std(returns)) 69 | 70 | expert_data = {'observations': np.array(observations), 71 | 'actions': np.array(actions)} 72 | 73 | outfilename = 'expert_data/' + args.envname + '_' + str(args.num_rollouts) + '_data.pkl' 74 | 75 | with open((outfilename), 'wb') as f: 76 | pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL) 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /hw2/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 2: Policy Gradient 2 | 3 | For all command-line expressions that used to run my experiments, they are stored in the `hw2.bash` script with annotations of different problems. 4 | 5 | If you want to run the whole expriment, just run: 6 | `./hw2.bash` in the master folder of `train_pg_f18.py` 7 | 8 | (For this bash script, it will store all data file into the `./data` folder) 9 | 10 | I also provided the data I got with expriments: 11 | 12 | 1. 
For problem 4, if you want to get the graph of small batch, the data is stored in `./data_small`, and run `python plot.py data_small/*` then you can get the graph. if you want to get the graph of large batch, the data is stored in `./data_large`, and run `python plot.py data_large/*` then you can get the graph. 13 | 14 | 2. For problem 5, the data is stored in `./data_InvertedPendulum` folder and run `python plot.py data_InvertedPendulum/*` to get the graph. 15 | 16 | 3. For problem 7, the data is stored in `./data_lunar` folder and run `python plot.py data_lunar/*` to get the graph. 17 | 18 | 4. For problem 8, the folder `./data_HalfCheetah`contains all result with different batch size and learning rate. The folder `./data_HalfCheetah_8` stored the result of optimal for four runs. Run `python plot.py data_HalfCheetah/*` to get the graph. 19 | 20 | 21 | -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_None", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_None", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 11, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | 
"env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_None", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 11, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/params.json: -------------------------------------------------------------------------------- 1 | 
{"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21/params.json: 
-------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl -------------------------------------------------------------------------------- 
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "hc_b400_r0.02", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 400, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "hc_b400_r0.02", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 400, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11/vars.pkl 
-------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "hc_b400_r0.02", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 400, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11/vars.pkl -------------------------------------------------------------------------------- 
/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | 
"exp_name" : "lb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lib_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lib_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lib_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 
| "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "LunarLanderContinuous-v2", 3 | "exp_name" : "ll_b40000_r0.005", 4 | "gamma" : 0.99, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 40000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "LunarLanderContinuous-v2", 3 | "exp_name" : "ll_b40000_r0.005", 4 | "gamma" : 0.99, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 40000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "LunarLanderContinuous-v2", 3 | "exp_name" : "ll_b40000_r0.005", 4 | "gamma" : 0.99, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 40000, 9 
| "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 21, 15 | 
"size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21/vars.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21/vars.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21/vars.pkl -------------------------------------------------------------------------------- /hw2/hw2.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | #Problem 4 5 | 6 | python train_pg_f18.py CartPole-v0 -n 100 -b 1000 -e 3 -dna --exp_name sb_no_rtg_dna 7 | 8 | python train_pg_f18.py CartPole-v0 -n 100 -b 1000 -e 3 -rtg -dna --exp_name sb_rtg_dna 9 | 10 | python train_pg_f18.py CartPole-v0 -n 100 -b 1000 -e 3 -rtg --exp_name sb_rtg_na 11 | 12 | python train_pg_f18.py CartPole-v0 -n 100 -b 5000 -e 3 -dna --exp_name lb_no_rtg_dna 13 | 14 | python train_pg_f18.py CartPole-v0 -n 100 -b 5000 -e 3 -rtg -dna --exp_name lb_rtg_dna 15 | 16 | python train_pg_f18.py CartPole-v0 -n 100 -b 5000 -e 3 -rtg --exp_name lb_rtg_na 17 | 18 | #Problem 5 19 | 20 | python train_pg_f18.py InvertedPendulum-v2 -ep 1000 --discount 0.9 -n 100 -e 3 -l 2 -s 64 -b 400 -lr 0.02 -rtg --exp_name hc_b400_r0.02 21 | 22 | #Problem 7 23 | 24 | python train_pg_f18.py LunarLanderContinuous-v2 -ep 1000 --discount 0.99 -n 100 -e 3 -l 2 -s 64 -b 40000 -lr 0.005 -rtg --nn_baseline --exp_name ll_b40000_r0.005 25 | 26 | #Problem 8 27 | 28 | #Find the best batch size and learning rate 29 | for batch in 10000 30000 50000 30 | do 31 | for lr in 0.005 0.01 0.02 32 | do 33 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b $batch -lr $lr -rtg --nn_baseline --exp_name hc_b${batch}_r${lr} 34 | done 35 | done 36 | 37 | 38 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 --exp_name hc_b50000_r0.02_None 39 | 40 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 -rtg --exp_name hc_b50000_r0.02_rtg 41 | 42 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 --nn_baseline --exp_name hc_b50000_r0.02_bl 43 | 44 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 -rtg --nn_baseline --exp_name hc_b50000_r0.02_rtg_bl 45 | 46 | -------------------------------------------------------------------------------- /hw2/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging.
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw2/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | plot all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir.
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw3/DDQN_Pong.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DDQN_Pong.pkl -------------------------------------------------------------------------------- /hw3/DQNAtari_Ponglr_multi0.1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQNAtari_Ponglr_multi0.1.pkl -------------------------------------------------------------------------------- /hw3/DQNAtari_Ponglr_multi10.0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQNAtari_Ponglr_multi10.0.pkl -------------------------------------------------------------------------------- /hw3/DQNAtari_Ponglr_multi5.0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQNAtari_Ponglr_multi5.0.pkl -------------------------------------------------------------------------------- /hw3/DQN_Pong.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQN_Pong.pkl -------------------------------------------------------------------------------- /hw3/Deep_RL_Assignment_3__Q_Learning_and_Actor_Critic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/Deep_RL_Assignment_3__Q_Learning_and_Actor_Critic.pdf -------------------------------------------------------------------------------- /hw3/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 3: Q-Learning 2 | 3 | 4 | --- 5 | Before doing anything, first replace `gym/envs/box2d/lunar_lander.py` with the provided `lunar_lander.py` file. 6 | 7 | ### Problem 1 8 | 9 | ##### Question 1 10 | 11 | Run `python run_dqn_atari.py` directly to train vanilla Q-learning with a random seed and a learning rate multiplier of 1. 12 | 13 | Plot 14 | `python p1q1.py` (replace the `.pkl` filename) 15 | 16 | ##### Question 2 17 | 18 | Run `python run_dqn_atari.py --double` to train double Q-learning with a random seed. 19 | 20 | Plot 21 | `python p1q2.py` (replace the `.pkl` filename) to plot the vanilla and double Q-learning curves together. 22 | 23 | ##### Question 3 24 | 25 | Run `python run_dqn_atari.py -m <multiplier> --seed <seed> [--double]` with the desired learning rate multiplier and a fixed seed of **5000**; if `--double` is given, double Q-learning is used, otherwise vanilla Q-learning. 26 | 27 | Plot 28 | `python p1q3.py` (replace the `.pkl` filename) to plot the learning curves for the different learning rate multipliers.
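For reference, here is a minimal sketch (not part of the original scripts) of reading one of the dumped learning-curve `.pkl` files back before plotting. The keys `'Timestep'`, `'mean'`, and `'best'` and the `DQN_Pong.pkl` filename follow the usage in `p1q1.py` further down in this dump; substitute the file produced by your own run.

```python
import pickle

# Placeholder filename: replace with the .pkl file written by your own run.
with open('DQN_Pong.pkl', 'rb') as f:
    data = pickle.load(f)

timesteps   = data['Timestep']  # environment timesteps (x-axis of the learning curve)
mean_reward = data['mean']      # mean episode reward at each logged timestep
best_reward = data['best']      # best mean episode reward seen so far
print('final mean reward: %.1f, best mean reward: %.1f' % (mean_reward[-1], best_reward[-1]))
```

The `p1q1.py`–`p1q3.py` scripts wrap this kind of loading in the matplotlib plotting calls shown later in this dump.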
29 | 30 | ### Problem 2 31 | 32 | ##### Question 1 33 | Run 34 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 1_1 -ntu 1 -ngsptu 1` 35 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 1_100 -ntu 1 -ngsptu 100` 36 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 100_1 -ntu 100 -ngsptu 1` 37 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 10_10 -ntu 10 -ngsptu 10` 38 | 39 | Plot 40 | `python plot.py data_CartPole/*` 41 | 42 | ##### Question 2 43 | Run 44 | `python train_ac_f18.py InvertedPendulum-v2 -ep 1000 --discount 0.95 -n 100 -e 3 -l 2 -s 64 -b 5000 -lr 0.01 --exp_name 10_10 -ntu 10 -ngsptu 10` for the InvertedPendulum task 45 | Run 46 | `python train_ac_f18.py HalfCheetah-v2 -ep 150 --discount 0.90 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --exp_name 10_10 -ntu 10 -ngsptu 10` for the HalfCheetah task 47 | 48 | Plot 49 | `python plot.py data_InvertedPendulum/*` 50 | 51 | `python plot.py data_HalfCheetah/*` 52 | 53 | -------------------------------------------------------------------------------- /hw3/data:pkl/1a898ddf-2704-4168-b92f-beca2086c5ffAtari_DDQN.pkl.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/1a898ddf-2704-4168-b92f-beca2086c5ffAtari_DDQN.pkl.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/3804ab6d-065b-4f94-aa54-ba957272c6b9Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/3804ab6d-065b-4f94-aa54-ba957272c6b9Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/43109373-50a0-47a8-b483-17921386ed82Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/43109373-50a0-47a8-b483-17921386ed82Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/518f88f0-7ffa-47ae-b705-365b31717729Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/518f88f0-7ffa-47ae-b705-365b31717729Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/63926721-2624-40a7-b029-cee54d11097aLander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/63926721-2624-40a7-b029-cee54d11097aLander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/8425b8e8-19c8-418e-91c2-8131d6e72849Lander_vanilla.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/8425b8e8-19c8-418e-91c2-8131d6e72849Lander_vanilla.pkl
-------------------------------------------------------------------------------- /hw3/data:pkl/9e01eaef-6082-423a-9ff2-66798a5d1942Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/9e01eaef-6082-423a-9ff2-66798a5d1942Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/Atari_DDQN.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/Atari_DDQN.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQN-Lunar-test1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQN-Lunar-test1.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNFalseLander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNFalseLander_1e4.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander_1e4.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNFalseLander_lr2e3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander_lr2e3.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNFalseLander_lr3e3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander_lr3e3.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNTrueLander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNTrueLander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DQN-Atari-Pong.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DQN-Atari-Pong.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DQN-Lunar-2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DQN-Lunar-2 -------------------------------------------------------------------------------- /hw3/data:pkl/DQN-Pong.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DQN-Pong.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/b7445890-58aa-4fea-9628-bc1f08fdde62Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/b7445890-58aa-4fea-9628-bc1f08fdde62Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/ba946a9b-c079-4ab6-b343-d1bccfc75be6Lander_DQN.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/ba946a9b-c079-4ab6-b343-d1bccfc75be6Lander_DQN.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/.DS_Store -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "100_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 100, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "100_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 100, 14 | "seed" : 11, 15 | "size" : 64} 
-------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "100_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 100, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "10_10", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "10_10", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- 
/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "10_10", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_100", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 100, 13 | "num_target_updates" : 1, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_100", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 100, 13 | "num_target_updates" : 1, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11/vars.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_100", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 100, 13 | "num_target_updates" : 1, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 1, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 1, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11/vars.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 1, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/.DS_Store -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 30000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 30000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 11, 
15 | "size" : 32} -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 30000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.01, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.01, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | 
"seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.01, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21/vars.pkl -------------------------------------------------------------------------------- /hw3/figures/p1q1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p1q1.png -------------------------------------------------------------------------------- /hw3/figures/p1q2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p1q2.png -------------------------------------------------------------------------------- /hw3/figures/p1q3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p1q3.png -------------------------------------------------------------------------------- /hw3/figures/p2q1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p2q1.png -------------------------------------------------------------------------------- /hw3/figures/p2q2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p2q2_1.png -------------------------------------------------------------------------------- /hw3/figures/p2q2_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p2q2_2.png -------------------------------------------------------------------------------- /hw3/hw3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/hw3.pdf -------------------------------------------------------------------------------- /hw3/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw3/p1q1.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | with open('DQN_Pong.pkl', 'rb') as f: 7 | data = pickle.loads(f.read()) 8 | time_step = data['Timestep'] 9 | mean_reward = data['mean'] 10 | best_reward = data['best'] 11 | best_vanilla = best_reward[-1] 12 | plt.figure() 13 | plt.plot(time_step, mean_reward, color='red', linestyle = '-') 14 | plt.plot(time_step, best_reward, color='blue', linestyle = '--') 15 | plt.xlabel('Timesteps') 16 | plt.ylabel('Mean Episode Reward') 17 | plt.legend(['Mean_DQN','Best Mean_DQN']) 18 | plt.title('Vanilla Q-Learning on Pong', fontsize=12) 19 | plt.grid() 20 | ax = plt.gca() 21 | ax.xaxis.get_major_formatter().set_powerlimits((0,0)) 22 | plt.show() -------------------------------------------------------------------------------- /hw3/p1q2.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | with open('DQN_Pong.pkl', 'rb') as f: 7 | data = pickle.loads(f.read()) 8 | time_step = data['Timestep'] 9 | mean_reward = data['mean'] 10 | best_reward = data['best'] 11 | best_vanilla = best_reward[-1] 12 | print(best_vanilla) 13 | 14 | with open('DDQN_Pong.pkl', 'rb') as l: 15 | data_d = pickle.loads(l.read()) 16 | time_step_d = data_d['Timestep'] 17 | mean_reward_d = data_d['mean'] 18 | best_reward_d = data_d['best'] 19 | best_DDQN = best_reward_d[-1] 20 | print(best_DDQN) 21 | 22 | plt.figure() 23 | plt.plot(time_step, mean_reward, color='green', linestyle = '-') 24 | plt.plot(time_step, best_reward, color='green', linestyle = '--') 25 | plt.plot(time_step_d, mean_reward_d, color='red', linestyle = '-') 26 | plt.plot(time_step_d, best_reward_d, color='red', linestyle = '--') 27 | plt.title('Vanilla Q-Learning Vs. 
Double Q-Learning on Pong', fontsize=11) 28 | plt.xlabel('Timesteps') 29 | plt.ylabel('Mean Episode Reward') 30 | plt.legend(['Mean_DQN', 'Best Mean_DQN', 'Mean_DDQN', 'Best Mean_DDQN']) 31 | plt.grid() 32 | ax = plt.gca() 33 | ax.xaxis.get_major_formatter().set_powerlimits((0,0)) 34 | plt.show() -------------------------------------------------------------------------------- /hw3/p1q3.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | with open('DQN_Pong.pkl', 'rb') as a: 7 | data = pickle.loads(a.read()) 8 | time_step_a = data['Timestep'][0:300] 9 | mean_reward_a = data['mean'][0:300] 10 | best_reward_a = data['best'][0:300] 11 | 12 | with open('DQNAtari_Ponglr_multi0.1.pkl', 'rb') as b: 13 | data = pickle.loads(b.read()) 14 | time_step_b = data['Timestep'][0:300] 15 | mean_reward_b = data['mean'][0:300] 16 | best_reward_b = data['best'][0:300] 17 | 18 | with open('DQNAtari_Ponglr_multi5.0.pkl', 'rb') as c: 19 | data = pickle.loads(c.read()) 20 | time_step_c = data['Timestep'][0:300] 21 | mean_reward_c = data['mean'][0:300] 22 | best_reward_c = data['best'][0:300] 23 | 24 | with open('DQNAtari_Ponglr_multi10.0.pkl', 'rb') as d: 25 | data = pickle.loads(d.read()) 26 | time_step_d = data['Timestep'][0:300] 27 | mean_reward_d = data['mean'][0:300] 28 | best_reward_d = data['best'][0:300] 29 | 30 | 31 | plt.figure() 32 | plt.plot(time_step_a, mean_reward_a, color='green', linestyle = '-') 33 | plt.plot(time_step_a, best_reward_a, color='green', linestyle = '--') 34 | 35 | plt.plot(time_step_b, mean_reward_b, color='red', linestyle = '-') 36 | plt.plot(time_step_b, best_reward_b, color='red', linestyle = '--') 37 | 38 | plt.plot(time_step_c, mean_reward_c, color='blue', linestyle = '-') 39 | plt.plot(time_step_c, best_reward_c, color='blue', linestyle = '--') 40 | 41 | plt.plot(time_step_d, mean_reward_d, color='magenta', linestyle = '-') 42 | plt.plot(time_step_d, best_reward_d, color='magenta', linestyle = '--') 43 | 44 | plt.title('Q-learning on Pong with different learning rate', fontsize=11) 45 | plt.xlabel('Timesteps') 46 | plt.ylabel('Mean Episode Reward') 47 | plt.grid() 48 | plt.legend(['Mean_lr_multi = 1', 'Best_lr_multi = 1', 'Mean_lr_multi = 0.1', 'Best_lr_multi = 0.1', 'Mean_lr_multi = 5', 'Best_lr_multi = 5', 'Mean_lr_multi = 10', 'Best_lr_multi = 10']) 49 | ax = plt.gca() 50 | ax.xaxis.get_major_formatter().set_powerlimits((0,0)) 51 | plt.show() -------------------------------------------------------------------------------- /hw3/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. 
You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 
102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw3/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | gym[atari] 3 | box2d 4 | mujoco-py==1.50.1.56 5 | tensorflow 6 | numpy 7 | seaborn 8 | opencv-python 9 | -------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | 25 | out = layers.flatten(out) 26 | with tf.variable_scope("action_value"): 27 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 28 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 29 | 30 | return out 31 | 32 | def atari_learn(env, 33 | session, 34 | num_timesteps, 35 | lr_multiplier, 36 | double_q): 37 | # This is just a rough estimate 38 | num_iterations = float(num_timesteps) / 4.0 39 | 40 | lr_multiplier = lr_multiplier 41 | print("The learning rate multiplier is :", lr_multiplier) 42 | lr_schedule = PiecewiseSchedule([ 43 | (0, 1e-4 * lr_multiplier), 44 | (num_iterations / 10, 1e-4 * lr_multiplier), 45 | (num_iterations / 2, 5e-5 * lr_multiplier), 46 | ], 47 | outside_value=5e-5 * lr_multiplier) 48 | optimizer = dqn.OptimizerSpec( 49 | constructor=tf.train.AdamOptimizer, 50 | kwargs=dict(epsilon=1e-4), 51 | lr_schedule=lr_schedule 52 | ) 53 | 54 | def stopping_criterion(env, t): 55 | # notice that here t is the number of steps of the wrapped env, 56 | # which is different from the number of steps in the underlying env 57 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 58 | 59 | exploration_schedule = PiecewiseSchedule( 60 | [ 61 | (0, 1.0), 62 | (1e6, 0.1), 63 | (num_iterations / 2, 0.01), 64 | ], outside_value=0.01 65 | ) 66 | 67 | dqn.learn( 68 | env=env, 69 | q_func=atari_model, 70 | optimizer_spec=optimizer, 71 | session=session, 72 | exploration=exploration_schedule, 73 | stopping_criterion=stopping_criterion, 74 | replay_buffer_size=1000000, 75 | batch_size=32, 76 | gamma=0.99, 77 | learning_starts=50000, 78 | 
learning_freq=4, 79 | frame_history_len=4, 80 | target_update_freq=10000, 81 | grad_norm_clipping=10, 82 | rew_file = 'Atari_Pong' + 'lr_multi' + str(lr_multiplier), 83 | double_q=double_q 84 | 85 | ) 86 | env.close() 87 | 88 | def get_available_gpus(): 89 | from tensorflow.python.client import device_lib 90 | local_device_protos = device_lib.list_local_devices() 91 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 92 | 93 | def set_global_seeds(i): 94 | try: 95 | import tensorflow as tf 96 | except ImportError: 97 | pass 98 | else: 99 | tf.set_random_seed(i) 100 | np.random.seed(i) 101 | random.seed(i) 102 | 103 | def get_session(): 104 | tf.reset_default_graph() 105 | tf_config = tf.ConfigProto( 106 | inter_op_parallelism_threads=1, 107 | intra_op_parallelism_threads=1) 108 | session = tf.Session(config=tf_config) 109 | print("AVAILABLE GPUS: ", get_available_gpus()) 110 | return session 111 | 112 | def get_env(task, seed): 113 | env = gym.make('PongNoFrameskip-v4') 114 | 115 | set_global_seeds(seed) 116 | env.seed(seed) 117 | 118 | expt_dir = '/tmp/hw3_vid_dir2/' 119 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 120 | env = wrap_deepmind(env) 121 | 122 | return env 123 | 124 | def main(): 125 | import argparse 126 | parser = argparse.ArgumentParser() 127 | parser.add_argument('--multiplier', '-m', type = float, default = 1) 128 | parser.add_argument('--seed', action='store_true') 129 | parser.add_argument('--double', action = 'store_true') 130 | args = parser.parse_args() 131 | 132 | 133 | # Get Atari games. 134 | task = gym.make('PongNoFrameskip-v4') 135 | 136 | if args.seed: 137 | seed = 5000 138 | print('seed = %d' % seed) 139 | # Run training 140 | else: 141 | seed = random.randint(0, 9999) 142 | print('random seed = %d' % seed) 143 | env = get_env(task, seed) 144 | session = get_session() 145 | atari_learn(env, session, num_timesteps=2e8, lr_multiplier = args.multiplier, double_q = args.double) 146 | 147 | if __name__ == "__main__": 148 | main() 149 | -------------------------------------------------------------------------------- /hw3/run_dqn_lander.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | 13 | import argparse 14 | 15 | def lander_model(obs, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = obs 18 | with tf.variable_scope("action_value"): 19 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 20 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 22 | return out 23 | 24 | def lander_optimizer(): 25 | return dqn.OptimizerSpec( 26 | constructor=tf.train.AdamOptimizer, 27 | lr_schedule=ConstantSchedule(1e-4), 28 | kwargs={} 29 | ) 30 | 31 | def lander_stopping_criterion(num_timesteps): 32 | def stopping_criterion(env, t): 33 | # notice that here t is the number of steps of the wrapped env, 34 | # which is different from the number of steps in the underlying env 35 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 36 | return stopping_criterion 37 | 38 | def lander_exploration_schedule(num_timesteps): 39 | return 
PiecewiseSchedule( 40 | [ 41 | (0, 1), 42 | (num_timesteps * 0.1, 0.02), 43 | ], outside_value=0.02 44 | ) 45 | 46 | def lander_kwargs(): 47 | return { 48 | 'optimizer_spec': lander_optimizer(), 49 | 'q_func': lander_model, 50 | 'replay_buffer_size': 50000, 51 | 'batch_size': 32, 52 | 'gamma': 1.00, 53 | 'learning_starts': 1000, 54 | 'learning_freq': 1, 55 | 'frame_history_len': 1, 56 | 'target_update_freq': 3000, 57 | 'grad_norm_clipping': 10, 58 | 'lander': True 59 | } 60 | 61 | def lander_learn(env, 62 | session, 63 | num_timesteps, 64 | seed): 65 | 66 | optimizer = lander_optimizer() 67 | stopping_criterion = lander_stopping_criterion(num_timesteps) 68 | exploration_schedule = lander_exploration_schedule(num_timesteps) 69 | 70 | dqn.learn( 71 | env=env, 72 | session=session, 73 | exploration=lander_exploration_schedule(num_timesteps), 74 | stopping_criterion=lander_stopping_criterion(num_timesteps), 75 | rew_file = 'Lander', 76 | double_q=False, 77 | **lander_kwargs() 78 | ) 79 | env.close() 80 | 81 | def set_global_seeds(i): 82 | tf.set_random_seed(i) 83 | np.random.seed(i) 84 | random.seed(i) 85 | 86 | def get_session(): 87 | tf.reset_default_graph() 88 | tf_config = tf.ConfigProto( 89 | inter_op_parallelism_threads=1, 90 | intra_op_parallelism_threads=1, 91 | device_count={'GPU': 0}) 92 | # GPUs don't significantly speed up deep Q-learning for lunar lander, 93 | # since the observations are low-dimensional 94 | session = tf.Session(config=tf_config) 95 | return session 96 | 97 | def get_env(seed): 98 | env = gym.make('LunarLander-v2') 99 | 100 | set_global_seeds(seed) 101 | env.seed(seed) 102 | 103 | expt_dir = '/tmp/hw3_vid_dir/' 104 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True, video_callable=False) 105 | 106 | return env 107 | 108 | def main(): 109 | # Run training 110 | seed = np.random.randint(9999) # you may want to randomize this 111 | print('random seed = %d' % seed) 112 | env = get_env(seed) 113 | session = get_session() 114 | set_global_seeds(seed) 115 | lander_learn(env, session, num_timesteps=500000, seed=seed) 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | 
(num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw4/Deep_RL_Assignment_4__Model_Based_RL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw4/Deep_RL_Assignment_4__Model_Based_RL.pdf -------------------------------------------------------------------------------- /hw4/Readme.md: -------------------------------------------------------------------------------- 1 | ###CS294-112 Assignment 4: Model-Based RL 2 | 3 | --- 4 | 5 | To run the whole solution to all problems, just run the `run_all.sh` in the terminal. 6 | 7 | 8 | The command is `bash ./run_all.sh` and all result data and figures will be saved in the related folders. 
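Individual problems can also be run with the same commands that `run_all.sh` wraps, e.g. `python main.py q1 --exp_name exp` for Q1, or `python main.py q3 --exp_name horizon15 --mpc_horizon 15` followed by `python plot.py --exps HalfCheetah_q3_horizon15 --save HalfCheetah_q3_horizon15` for a single Q3 setting (commands taken from `run_all.sh`; `plot.py` reads `data/<exp_name>/log.csv` and writes the figure to `plots/`).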
-------------------------------------------------------------------------------- /hw4/half_cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, action): 12 | xposbefore = self.sim.data.qpos[0] 13 | self.do_simulation(action, self.frame_skip) 14 | xposafter = self.sim.data.qpos[0] 15 | ob = self._get_obs() 16 | reward_ctrl = - 0.1 * np.square(action).sum() 17 | reward_run = (xposafter - xposbefore)/self.dt 18 | reward = reward_ctrl + reward_run 19 | done = False 20 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 21 | 22 | def _get_obs(self): 23 | return np.concatenate([ 24 | self.sim.data.qpos.flat[1:], 25 | self.sim.data.qvel.flat, 26 | self.get_body_com("torso").flat, 27 | # self.get_body_comvel("torso").flat, 28 | ]) 29 | 30 | def reset_model(self): 31 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 32 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 33 | self.set_state(qpos, qvel) 34 | return self._get_obs() 35 | 36 | def viewer_setup(self): 37 | self.viewer.cam.distance = self.model.stat.extent * 0.5 38 | 39 | @staticmethod 40 | def cost_fn(states, actions, next_states): 41 | is_tf = tf.contrib.framework.is_tensor(states) 42 | is_single_state = (len(states.get_shape()) == 1) if is_tf else (len(states.shape) == 1) 43 | 44 | if is_single_state: 45 | states = states[None, ...] 46 | actions = actions[None, ...] 47 | next_states = next_states[None, ...] 
48 | 49 | scores = tf.zeros(actions.get_shape()[0].value) if is_tf else np.zeros(actions.shape[0]) 50 | 51 | heading_penalty_factor = 10 52 | 53 | # don't move front shin back so far that you tilt forward 54 | front_leg = states[:, 5] 55 | my_range = 0.2 56 | if is_tf: 57 | scores += tf.cast(front_leg >= my_range, tf.float32) * heading_penalty_factor 58 | else: 59 | scores += (front_leg >= my_range) * heading_penalty_factor 60 | 61 | front_shin = states[:, 6] 62 | my_range = 0 63 | if is_tf: 64 | scores += tf.cast(front_shin >= my_range, tf.float32) * heading_penalty_factor 65 | else: 66 | scores += (front_shin >= my_range) * heading_penalty_factor 67 | 68 | front_foot = states[:, 7] 69 | my_range = 0 70 | if is_tf: 71 | scores += tf.cast(front_foot >= my_range, tf.float32) * heading_penalty_factor 72 | else: 73 | scores += (front_foot >= my_range) * heading_penalty_factor 74 | 75 | scores -= (next_states[:, 17] - states[:, 17]) / 0.01 76 | 77 | if is_single_state: 78 | scores = scores[0] 79 | 80 | return scores 81 | -------------------------------------------------------------------------------- /hw4/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import time 4 | 5 | from half_cheetah_env import HalfCheetahEnv 6 | from logger import logger 7 | from model_based_rl import ModelBasedRL 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('question', type=str, choices=('q1', 'q2', 'q3')) 11 | parser.add_argument('--exp_name', type=str, default=None) 12 | parser.add_argument('--env', type=str, default='HalfCheetah', choices=('HalfCheetah',)) 13 | parser.add_argument('--render', action='store_true') 14 | parser.add_argument('--mpc_horizon', type=int, default=15) 15 | parser.add_argument('--num_random_action_selection', type=int, default=4096) 16 | parser.add_argument('--nn_layers', type=int, default=1) 17 | args = parser.parse_args() 18 | 19 | data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data') 20 | exp_name = '{0}_{1}_{2}'.format(args.env, 21 | args.question, 22 | args.exp_name if args.exp_name else time.strftime("%d-%m-%Y_%H-%M-%S")) 23 | exp_dir = os.path.join(data_dir, exp_name) 24 | assert not os.path.exists(exp_dir),\ 25 | 'Experiment directory {0} already exists.
Either delete the directory, or run the experiment with a different name'.format(exp_dir) 26 | os.makedirs(exp_dir, exist_ok=True) 27 | logger.setup(exp_name, os.path.join(exp_dir, 'log.txt'), 'debug') 28 | 29 | env = { 30 | 'HalfCheetah': HalfCheetahEnv() 31 | }[args.env] 32 | 33 | mbrl = ModelBasedRL(env=env, 34 | render=args.render, 35 | mpc_horizon=args.mpc_horizon, 36 | num_random_action_selection=args.num_random_action_selection, 37 | nn_layers=args.nn_layers) 38 | 39 | run_func = { 40 | 'q1': mbrl.run_q1, 41 | 'q2': mbrl.run_q2, 42 | 'q3': mbrl.run_q3 43 | }[args.question] 44 | run_func() 45 | -------------------------------------------------------------------------------- /hw4/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | import pandas 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--exps', nargs='+', type=str) 11 | parser.add_argument('--save', type=str, default=None) 12 | args = parser.parse_args() 13 | 14 | f, ax = plt.subplots(1, 1) 15 | for i, exp in enumerate(args.exps): 16 | log_fname = os.path.join('data', exp, 'log.csv') 17 | csv = pandas.read_csv(log_fname) 18 | 19 | color = cm.viridis(i / float(len(args.exps))) 20 | ax.plot(csv['Itr'], csv['ReturnAvg'], color=color, label=exp) 21 | ax.fill_between(csv['Itr'], csv['ReturnAvg'] - csv['ReturnStd'], csv['ReturnAvg'] + csv['ReturnStd'], 22 | color=color, alpha=0.2) 23 | 24 | ax.legend() 25 | ax.set_xlabel('Iteration') 26 | ax.set_ylabel('Return') 27 | 28 | if args.save: 29 | os.makedirs('plots', exist_ok=True) 30 | f.savefig(os.path.join('plots', args.save + '.jpg')) 31 | else: 32 | plt.show() 33 | -------------------------------------------------------------------------------- /hw4/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | matplotlib 3 | colorlog -------------------------------------------------------------------------------- /hw4/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ########## 4 | ### Q1 ### 5 | ########## 6 | 7 | python main.py q1 --exp_name exp 8 | 9 | ########## 10 | ### Q2 ### 11 | ########## 12 | 13 | python main.py q2 --exp_name exp 14 | 15 | ########### 16 | ### Q3a ### 17 | ########### 18 | 19 | python main.py q3 --exp_name default 20 | python plot.py --exps HalfCheetah_q3_default --save HalfCheetah_q3_default 21 | 22 | ########### 23 | ### Q3b ### 24 | ########### 25 | 26 | python main.py q3 --exp_name action128 --num_random_action_selection 128 27 | python main.py q3 --exp_name action4096 --num_random_action_selection 4096 28 | python main.py q3 --exp_name action16384 --num_random_action_selection 16384 29 | python plot.py --exps HalfCheetah_q3_action128 HalfCheetah_q3_action4096 HalfCheetah_q3_action16384 --save HalfCheetah_q3_actions 30 | 31 | python main.py q3 --exp_name horizon10 --mpc_horizon 10 32 | python main.py q3 --exp_name horizon15 --mpc_horizon 15 33 | python main.py q3 --exp_name horizon20 --mpc_horizon 20 34 | python plot.py --exps HalfCheetah_q3_horizon10 HalfCheetah_q3_horizon15 HalfCheetah_q3_horizon20 --save HalfCheetah_q3_mpc_horizon 35 | 36 | python main.py q3 --exp_name layers1 --nn_layers 1 37 | python main.py q3 --exp_name layers2 --nn_layers 2 38 | python main.py q3 --exp_name layers3 --nn_layers 3 39 | python plot.py --exps HalfCheetah_q3_layers1 
HalfCheetah_q3_layers2 HalfCheetah_q3_layers3 --save HalfCheetah_q3_nn_layers 40 | -------------------------------------------------------------------------------- /hw4/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import defaultdict 3 | 4 | class TimeIt(object): 5 | def __init__(self, prefix=''): 6 | self.prefix = prefix 7 | self.start_times = dict() 8 | self.elapsed_times = defaultdict(int) 9 | 10 | def start(self, name): 11 | assert(name not in self.start_times) 12 | self.start_times[name] = time.time() 13 | 14 | def stop(self, name): 15 | assert(name in self.start_times) 16 | self.elapsed_times[name] += time.time() - self.start_times[name] 17 | self.start_times.pop(name) 18 | 19 | def elapsed(self, name): 20 | return self.elapsed_times[name] 21 | 22 | def reset(self): 23 | self.start_times = dict() 24 | self.elapsed_times = defaultdict(int) 25 | 26 | def __str__(self): 27 | s = '' 28 | names_elapsed = sorted(self.elapsed_times.items(), key=lambda x: x[1], reverse=True) 29 | for name, elapsed in names_elapsed: 30 | if 'total' not in self.elapsed_times: 31 | s += '{0}: {1: <10} {2:.1f}\n'.format(self.prefix, name, elapsed) 32 | else: 33 | assert(self.elapsed_times['total'] >= max(self.elapsed_times.values())) 34 | pct = 100. * elapsed / self.elapsed_times['total'] 35 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, name, elapsed, pct) 36 | if 'total' in self.elapsed_times: 37 | times_summed = sum([t for k, t in self.elapsed_times.items() if k != 'total']) 38 | other_time = self.elapsed_times['total'] - times_summed 39 | assert(other_time >= 0) 40 | pct = 100. * other_time / self.elapsed_times['total'] 41 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, 'other', other_time, pct) 42 | return s 43 | 44 | timeit = TimeIt() 45 | -------------------------------------------------------------------------------- /hw5/exp/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5a: Exploration 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.5** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * seaborn 9 | * tqdm==**4.26.0** 10 | 11 | Before doing anything, first replace `gym/envs/mujoco/half_cheetah.py` with the provided `sparse_half_cheetah.py` file. It is always a good idea to keep a copy of the original `gym/envs/mujoco/half_cheetah.py` just in case you need it for something else. 12 | 13 | You will implement `density_model.py`, `exploration.py`, and `train_ac_exploration_f18.py`. 14 | 15 | See the hw5a.pdf in this folder for further instructions. 16 | . 
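For the `half_cheetah.py` replacement step above, a minimal sketch (assuming `gym` is installed in the currently active environment; the path lookup is just one convenient way to locate it and is not part of the assignment code):

`GYM_MUJOCO=$(python -c "import gym, os; print(os.path.join(os.path.dirname(gym.__file__), 'envs', 'mujoco'))")`

`cp "$GYM_MUJOCO/half_cheetah.py" "$GYM_MUJOCO/half_cheetah.py.orig"` (keeps a copy of the original, as recommended above)

`cp sparse_half_cheetah.py "$GYM_MUJOCO/half_cheetah.py"`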
17 | -------------------------------------------------------------------------------- /hw5/exp/ex_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=tf.tanh, output_activation=None): 4 | """ 5 | Builds a feedforward neural network 6 | 7 | arguments: 8 | input_placeholder: placeholder variable for the state (batch_size, input_size) 9 | output_size: size of the output layer 10 | scope: variable scope of the network 11 | n_layers: number of hidden layers 12 | size: dimension of the hidden layer 13 | activation: activation of the hidden layers 14 | output_activation: activation of the ouput layers 15 | 16 | returns: 17 | output placeholder of the network (the result of a forward pass) 18 | 19 | Hint: use tf.layers.dense 20 | """ 21 | output_placeholder = input_placeholder 22 | with tf.variable_scope(scope): 23 | for _ in range(n_layers): 24 | output_placeholder = tf.layers.dense(output_placeholder, size, activation=activation) 25 | output_placeholder = tf.layers.dense(output_placeholder, output_size, activation=output_activation) 26 | return output_placeholder -------------------------------------------------------------------------------- /hw5/exp/hw5a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/exp/hw5a.pdf -------------------------------------------------------------------------------- /hw5/exp/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/exp/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. 
Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | # plt.legend(loc='best', bbox_to_anchor=(1, 1), fontsize=8).draggable() 59 | plt.show() 60 | 61 | 62 | def get_datasets(fpath, condition=None): 63 | unit = 0 64 | datasets = [] 65 | for root, dir, files in os.walk(fpath): 66 | if 'log.txt' in files: 67 | param_path = open(os.path.join(root,'params.json')) 68 | params = json.load(param_path) 69 | exp_name = params['exp_name'] 70 | 71 | log_path = os.path.join(root,'log.txt') 72 | experiment_data = pd.read_table(log_path) 73 | 74 | experiment_data.insert( 75 | len(experiment_data.columns), 76 | 'Unit', 77 | unit 78 | ) 79 | experiment_data.insert( 80 | len(experiment_data.columns), 81 | 'Condition', 82 | condition or exp_name 83 | ) 84 | 85 | datasets.append(experiment_data) 86 | unit += 1 87 | 88 | return datasets 89 | 90 | 91 | def main(): 92 | import argparse 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('logdir', nargs='*') 95 | parser.add_argument('--legend', nargs='*') 96 | parser.add_argument('--value', default='AverageReturn', nargs='*') 97 | args = parser.parse_args() 98 | 99 | use_legend = False 100 | if args.legend is not None: 101 | assert len(args.legend) == len(args.logdir), \ 102 | "Must give a legend title for each set of experiments." 
103 | use_legend = True 104 | 105 | data = [] 106 | if use_legend: 107 | for logdir, legend_title in zip(args.logdir, args.legend): 108 | data += get_datasets(logdir, legend_title) 109 | else: 110 | for logdir in args.logdir: 111 | data += get_datasets(logdir) 112 | 113 | if isinstance(args.value, list): 114 | values = args.value 115 | else: 116 | values = [args.value] 117 | for value in values: 118 | plot_data(data, value=value) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw5/exp/replay.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import copy 4 | 5 | class Replay_Buffer(object): 6 | def __init__(self, max_size=np.inf): 7 | self.memory = [] 8 | self.max_size = int(max_size) 9 | 10 | def adjust_size(self): 11 | if len(self.memory) > self.max_size: 12 | diff = int(len(self.memory) - self.max_size) 13 | self.memory = self.memory[:-diff] # FIFO 14 | print('Adjusted replay size') 15 | 16 | def prepend(self, x): 17 | # assume x is a list of states 18 | self.memory = list(x) + self.memory 19 | self.adjust_size() 20 | 21 | def sample(self, batch_size): 22 | random_batch = random.sample(self.memory, batch_size) 23 | return random_batch 24 | 25 | def __len__(self): 26 | return len(self.memory) 27 | 28 | def __getitem__(self, indices): 29 | return copy.deepcopy(np.array([self.memory[i] for i in indices])) 30 | 31 | def get_memory(self): 32 | return copy.deepcopy(self.memory) 33 | 34 | def clear_buffer(self): 35 | del self.memory[:] -------------------------------------------------------------------------------- /hw5/exp/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | mujoco-py==1.50.1.56 3 | tensorflow 4 | numpy 5 | seaborn 6 | tqdm -------------------------------------------------------------------------------- /hw5/exp/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ########################## 4 | ### P1 Hist PointMass ### 5 | ########################## 6 | 7 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model none -s 8 --exp_name PM_bc0_s8 8 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model hist -bc 0.01 -s 8 --exp_name PM_hist_bc0.01_s8 9 | 10 | ########################## 11 | ### P2 RBF PointMass ### 12 | ########################## 13 | 14 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model rbf -bc 0.01 -s 8 -sig 0.2 --exp_name PM_rbf_bc0.01_s8_sig0.2 15 | 16 | ########################## 17 | ### P3 EX2 PointMass ### 18 | ########################## 19 | 20 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model ex2 -s 8 -bc 0.05 -kl 0.1 -dlr 0.001 -dh 8 -dti 1000 --exp_name PM_ex2_s8_bc0.05_kl0.1_dlr0.001_dh8_dti1000 21 | 22 | ########################### 23 | ### P4 HalfCheetah ### 24 | ########################### 25 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model none --exp_name HC_bc0 26 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.001 -kl 0.1 -dlr 0.005 -dti 1000 --exp_name HC_bc0.001_kl0.1_dlr0.005_dti1000 27 | python train_ac_exploration_f18.py HalfCheetah-v2 
-ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.0001 -kl 0.1 -dlr 0.005 -dti 10000 --exp_name HC_bc0.0001_kl0.1_dlr0.005_dti10000 28 | -------------------------------------------------------------------------------- /hw5/exp/sparse_half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 8 | utils.EzPickle.__init__(self) 9 | 10 | def step(self, action): 11 | ################################################# 12 | ctrl = False 13 | relu = False 14 | threshold = 10.0 15 | ################################################# 16 | xposbefore = self.sim.data.qpos[0] 17 | self.do_simulation(action, self.frame_skip) 18 | xposafter = self.sim.data.qpos[0] 19 | ob = self._get_obs() 20 | # reward_ctrl = - 0.1 * np.square(action).sum() 21 | # reward_run = (xposafter - xposbefore)/self.dt 22 | ################################################# 23 | if ctrl: 24 | reward_ctrl = - 0.1 * np.square(action).sum() 25 | else: 26 | reward_ctrl = 0 27 | if abs(xposafter) <= threshold: 28 | reward_run = 0.0 29 | else: 30 | if relu: 31 | reward_run = np.sign(xposafter)*(xposafter - xposbefore)/self.dt 32 | else: 33 | reward_run = 1.0 34 | ################################################# 35 | reward = reward_ctrl + reward_run 36 | done = False 37 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 38 | 39 | def _get_obs(self): 40 | return np.concatenate([ 41 | self.sim.data.qpos.flat[1:], 42 | self.sim.data.qvel.flat, 43 | ]) 44 | 45 | def reset_model(self): 46 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 47 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 48 | self.set_state(qpos, qvel) 49 | return self._get_obs() 50 | 51 | def viewer_setup(self): 52 | self.viewer.cam.distance = self.model.stat.extent * 0.5 53 | -------------------------------------------------------------------------------- /hw5/meta/Deep_RL_Assignment_5__Meta_Reinforcement_Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/Deep_RL_Assignment_5__Meta_Reinforcement_Learning.pdf -------------------------------------------------------------------------------- /hw5/meta/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5c: Meta-Learning 2 | 3 | Dependencies: 4 | 5 | * Python **3.5** 6 | * Numpy version 1.14.5 7 | * TensorFlow version 1.10.5 8 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 9 | * OpenAI Gym version **0.10.5** 10 | * seaborn 11 | * Box2D==2.3.2 12 | 13 | Instructions: [HW5c PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5c.pdf) 14 | 15 | ### 1. Problem1 Context as Task ID 16 | 17 | Run the following command: 18 | 19 | `python train_policy.py 'pm-obs' --exp_name --history 1 -lr 5e-5 -n 200 --num_tasks 4` 20 | 21 | ### 2. 
Problem2 Meta-Learned Context 22 | 23 | Run the following command: 24 | 25 | **With MLP model** 26 | 27 | `python train_policy.py 'pm' --exp_name --history --discount 0.90 -lr 5e-4 -n 60` 28 | 29 | 30 | **With RNN model** 31 | 32 | `python train_policy.py 'pm' --exp_name --history --discount 0.90 -lr 5e-4 -n 60 --recurrent` 33 | 34 | ### 3. Problem3 Generalization 35 | 36 | Run the following command: 37 | 38 | `python train_policy.py 'pm' --exp_name --history --discount 0.90 -lr 5e-4 -n 60 --recurrent --generalized --granularity ` 39 | 40 | if `--generalized`, the training goals and testing goals will be chosen from chessboard space where 1 corresponds to testing goals and 0 corresponds to training goals. The size of pattern in chessboard is defined by `--granularity`. The value can be chosen from the list `[1,2,4,5,10]` to construct a balanced chessboard. 41 | 42 | -------------------------------------------------------------------------------- /hw5/meta/data/mlp_1_pm_13-11-2018_20-57-59/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_1", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_1_pm_13-11-2018_20-57-59/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/mlp_30_pm_13-11-2018_20-48-55/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_30", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 30, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_30_pm_13-11-2018_20-48-55/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/mlp_50_pm_14-11-2018_20-05-53/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | 
"exp_name" : "mlp_50", 4 | "gamma" : 0.9, 5 | "generalized" : false, 6 | "granularity" : 1, 7 | "gru_size" : 32, 8 | "history" : 50, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/mlp_50_pm_14-11-2018_20-05-53/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : false, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/mlp_60_pm_13-11-2018_23-01-39/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_60", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 60, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_60_pm_13-11-2018_23-01-39/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/pro1_pm-obs_13-11-2018_01-08-37/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm-obs", 3 | "exp_name" : "pro1", 4 | "gamma" : 0.99, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 5e-05, 9 | "logdir" : "data/pro1_pm-obs_13-11-2018_01-08-37/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 2500, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 200, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 4, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/params.json: 
-------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_1", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 1, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_2", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 2, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_4", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 4, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/rnn_1_pm_13-11-2018_21-05-16/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_1", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_1_pm_13-11-2018_21-05-16/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/rnn_30_pm_13-11-2018_19-34-21/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_30", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 30, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_30_pm_13-11-2018_19-34-21/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/rnn_50_pm_14-11-2018_10-34-08/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_50", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 50, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_50_pm_14-11-2018_10-34-08/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- 
/hw5/meta/data/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/rnn_60_pm_13-11-2018_17-27-20/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_60", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 60, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_60_pm_13-11-2018_17-27-20/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro1/pro1_pm-obs_13-11-2018_01-08-37/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm-obs", 3 | "exp_name" : "pro1", 4 | "gamma" : 0.99, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 5e-05, 9 | "logdir" : "data/pro1_pm-obs_13-11-2018_01-08-37/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 2500, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 200, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 4, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro1/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro1/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro1/prob1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro1/prob1.png -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/mlp_1_pm_13-11-2018_20-57-59/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_1", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : 
"data/mlp_1_pm_13-11-2018_20-57-59/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_1/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/pro2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_1/pro2_1.png -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/rnn_1_pm_13-11-2018_21-05-16/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_1", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_1_pm_13-11-2018_21-05-16/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_1/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/mlp_30_pm_13-11-2018_20-48-55/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_30", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 30, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_30_pm_13-11-2018_20-48-55/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_30/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/prob_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_30/prob_30.png -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/rnn_30_pm_13-11-2018_19-34-21/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_30", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 30, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_30_pm_13-11-2018_19-34-21/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_30/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_50/mlp_50_pm_14-11-2018_20-05-53/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_50", 4 | "gamma" : 0.9, 5 | "generalized" : false, 6 | "granularity" : 1, 7 | "gru_size" : 32, 8 | "history" : 50, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/mlp_50_pm_14-11-2018_20-05-53/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : false, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_50/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_50/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_50/pro2_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_50/pro2_50.png -------------------------------------------------------------------------------- 
/hw5/meta/data_pro2_50/rnn_50_pm_14-11-2018_10-34-08/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_50", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 50, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_50_pm_14-11-2018_10-34-08/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_50/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_50/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/mlp_60_pm_13-11-2018_23-01-39/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_60", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 60, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_60_pm_13-11-2018_23-01-39/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_60/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/pro2_60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_60/pro2_60.png -------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/rnn_60_pm_13-11-2018_17-27-20/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_60", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 60, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_60_pm_13-11-2018_17-27-20/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} 
-------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_60/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_1", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 1, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_avg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_avg.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_val.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_val.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_2", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 2, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : 
"data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_avg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_avg.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_val.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_val.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_4", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 4, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl -------------------------------------------------------------------------------- 
/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_avg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_avg.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_val.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_val.png -------------------------------------------------------------------------------- /hw5/meta/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/meta/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
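For example: python plot.py data/test1 data/test2 --legend legend1 legend2 plots both runs with one custom legend title per logdir, given in the same order as the logdirs (the plotted statistic defaults to AverageReturn).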
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | 57 | plt.legend(loc='best').draggable() 58 | #plt.savefig('1.png') 59 | plt.show() 60 | 61 | 62 | def get_datasets(fpath, condition=None): 63 | unit = 0 64 | datasets = [] 65 | for root, dir, files in os.walk(fpath): 66 | if 'log.txt' in files: 67 | param_path = open(os.path.join(root,'params.json')) 68 | params = json.load(param_path) 69 | exp_name = params['exp_name'] 70 | 71 | log_path = os.path.join(root,'log.txt') 72 | experiment_data = pd.read_table(log_path) 73 | 74 | experiment_data.insert( 75 | len(experiment_data.columns), 76 | 'Unit', 77 | unit 78 | ) 79 | experiment_data.insert( 80 | len(experiment_data.columns), 81 | 'Condition', 82 | condition or exp_name 83 | ) 84 | 85 | datasets.append(experiment_data) 86 | unit += 1 87 | 88 | return datasets 89 | 90 | 91 | def main(): 92 | import argparse 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('logdir', nargs='*') 95 | parser.add_argument('--legend', nargs='*') 96 | parser.add_argument('--value', default='AverageReturn', nargs='*') 97 | args = parser.parse_args() 98 | 99 | use_legend = False 100 | if args.legend is not None: 101 | assert len(args.legend) == len(args.logdir), \ 102 | "Must give a legend title for each set of experiments." 103 | use_legend = True 104 | 105 | data = [] 106 | if use_legend: 107 | for logdir, legend_title in zip(args.logdir, args.legend): 108 | data += get_datasets(logdir, legend_title) 109 | else: 110 | for logdir in args.logdir: 111 | data += get_datasets(logdir) 112 | 113 | if isinstance(args.value, list): 114 | values = args.value 115 | else: 116 | values = [args.value] 117 | for value in values: 118 | plot_data(data, value=value) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw5/meta/point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from gym import Env 4 | 5 | 6 | class PointEnv(Env): 7 | """ 8 | point mass on a 2-D plane 9 | goals are sampled randomly from a square 10 | """ 11 | 12 | def __init__(self, num_tasks=1): 13 | self.reset_task() 14 | self.reset() 15 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,)) 16 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,)) 17 | 18 | 19 | def reset_task(self, generalized=False, granularity=1, is_evaluation=False): 20 | ''' 21 | sample a new task randomly 22 | 23 | Problem 3: make training and evaluation goals disjoint sets 24 | if `is_evaluation` is true, sample from the evaluation set, 25 | otherwise sample from the training set 26 | ''' 27 | #====================================================================================# 28 | # ----------PROBLEM 3---------- 29 | #====================================================================================# 30 | # YOUR CODE HERE 31 | # Construct the chessboard space with 20 x 20 32 | # The granularity is the size of squares, the value can be chosen from [1, 2, 4, 5, 10] 33 | if generalized: 34 | print("Problem 3...") 35 | print("The size of square is ", granularity) 36 | size = int(20 / granularity) 37 | space = np.zeros((size, size)) 38 | space[1::2,::2] = 1 39 | 
space[::2,1::2] = 1 40 | if is_evaluation: 41 | dataset = np.where(space == 1) 42 | else: 43 | dataset = np.where(space == 0) 44 | 45 | dataset = np.asarray(dataset).T 46 | nums = dataset.shape[0] 47 | idx = np.random.randint(0, nums) 48 | if is_evaluation: 49 | print("Evaluation") 50 | else: 51 | print("training") 52 | 53 | goal = dataset[idx] 54 | goal[0] = goal[0] * granularity 55 | goal[1] = goal[1] * granularity 56 | 57 | x = np.random.uniform(goal[0], goal[0] + granularity) - 10 58 | y = np.random.uniform(goal[1], goal[1] + granularity) - 10 59 | print((x, y)) 60 | else: 61 | #print("Problem 2...") 62 | x = np.random.uniform(-10, 10) 63 | y = np.random.uniform(-10, 10) 64 | 65 | self._goal = np.array([x, y]) 66 | 67 | #x = np.random.uniform(-10, 10) 68 | #y = np.random.uniform(-10, 10) 69 | #self._goal = np.array([x, y]) 70 | 71 | def reset(self): 72 | self._state = np.array([0, 0], dtype=np.float32) 73 | return self._get_obs() 74 | 75 | def _get_obs(self): 76 | return np.copy(self._state) 77 | 78 | def reward_function(self, x, y): 79 | return - (x ** 2 + y ** 2) ** 0.5 80 | 81 | def step(self, action): 82 | x, y = self._state 83 | # compute reward, add penalty for large actions instead of clipping them 84 | x -= self._goal[0] 85 | y -= self._goal[1] 86 | # check if task is complete 87 | done = abs(x) < .01 and abs(y) < .01 88 | reward = self.reward_function(x, y) 89 | # move to next state 90 | self._state = self._state + action 91 | ob = self._get_obs() 92 | return ob, reward, done, dict() 93 | 94 | def viewer_setup(self): 95 | print('no viewer') 96 | pass 97 | 98 | def render(self): 99 | print('current state:', self._state) 100 | 101 | def seed(self, seed): 102 | np.random.seed = seed 103 | -------------------------------------------------------------------------------- /hw5/meta/point_mass_observed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from gym import Env 4 | 5 | 6 | class ObservedPointEnv(Env): 7 | """ 8 | point mass on a 2-D plane 9 | four tasks: move to (-10, -10), (-10, 10), (10, -10), (10, 10) 10 | 11 | Problem 1: augment the observation with a one-hot vector encoding the task ID 12 | - change the dimension of the observation space 13 | - augment the observation with a one-hot vector that encodes the task ID 14 | """ 15 | #====================================================================================# 16 | # ----------PROBLEM 1---------- 17 | #====================================================================================# 18 | # YOUR CODE SOMEWHERE HERE 19 | def __init__(self, num_tasks=1): 20 | self.tasks = [0, 1, 2, 3][:num_tasks] 21 | self.task_idx = -1 22 | #self.num_tasks = num_tasks 23 | self.reset_task() 24 | self.reset() 25 | 26 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2 + num_tasks,)) 27 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,)) 28 | 29 | def reset_task(self, generalized=False, granularity=1, is_evaluation=False): 30 | # for evaluation, cycle deterministically through all tasks 31 | if is_evaluation: 32 | self.task_idx = (self.task_idx + 1) % len(self.tasks) 33 | # during training, sample tasks randomly 34 | else: 35 | self.task_idx = np.random.randint(len(self.tasks)) 36 | self._task = self.tasks[self.task_idx] 37 | goals = [[-1, -1], [-1, 1], [1, -1], [1, 1]] 38 | self._goal = np.array(goals[self.task_idx])*10 39 | 40 | def reset(self): 41 | self._state = np.array([0, 0], dtype=np.float32) 42 | return 
self._get_obs() 43 | 44 | def _get_obs(self): 45 | one_hot = np.zeros(len(self.tasks)) 46 | one_hot[self._task] = 1 47 | 48 | return np.concatenate((np.copy(self._state), one_hot)) 49 | 50 | def step(self, action): 51 | x, y = self._state 52 | # compute reward, add penalty for large actions instead of clipping them 53 | x -= self._goal[0] 54 | y -= self._goal[1] 55 | reward = - (x ** 2 + y ** 2) ** 0.5 56 | # check if task is complete 57 | done = abs(x) < 0.01 and abs(y) < 0.01 58 | # move to next state 59 | self._state = self._state + action 60 | ob = self._get_obs() 61 | 62 | return ob, reward, done, dict() 63 | 64 | def viewer_setup(self): 65 | print('no viewer') 66 | pass 67 | 68 | def render(self): 69 | print('current state:', self._state) 70 | 71 | def seed(self, seed): 72 | np.random.seed = seed 73 | -------------------------------------------------------------------------------- /hw5/meta/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | ''' 5 | minimalistic replay buffer 6 | 7 | a sample consists of 8 | - observation 9 | - action 10 | - reward 11 | - terminal 12 | - hidden state for recurrent policy 13 | 14 | it is memory inefficient to store windowed observations this way 15 | so do not run on tasks with large observations (e.g. from vision) 16 | ''' 17 | 18 | def __init__(self, max_size, ob_dim, ac_dim, hidden_dim, task_dim): 19 | self.max_size = max_size 20 | self.ob_dim = ob_dim 21 | self.ac_dim = ac_dim 22 | self.hidden_dim = hidden_dim 23 | self.task_dim = task_dim 24 | self.flush() 25 | 26 | def flush(self): 27 | ''' 28 | set buffer to empty 29 | ''' 30 | self._observations = np.zeros((self.max_size, *self.ob_dim)) 31 | self._actions = np.zeros((self.max_size, *self.ac_dim)) 32 | self._rewards = np.zeros((self.max_size, 1)) 33 | self._terminals = np.zeros((self.max_size, 1)) 34 | self._hiddens = np.zeros((self.max_size, self.hidden_dim)) 35 | self._tasks = np.zeros((self.max_size, self.task_dim)) 36 | self._top = 0 37 | self._size = 0 38 | 39 | def _advance(self): 40 | ''' 41 | move pointer to top of buffer 42 | if end of buffer is reached, overwrite oldest data 43 | ''' 44 | self._top = (self._top + 1) % self.max_size 45 | if self._size < self.max_size: 46 | self._size += 1 47 | 48 | def add_sample(self, ob, ac, re, te, hi, task): 49 | ''' 50 | add sample to buffer 51 | ''' 52 | self._observations[self._top] = ob 53 | self._actions[self._top] = ac 54 | self._rewards[self._top] = re 55 | self._terminals[self._top] = te 56 | self._hiddens[self._top] = hi 57 | self._tasks[self._top] = task 58 | 59 | self._advance() 60 | 61 | def get_samples(self, indices): 62 | ''' 63 | return buffer data indexed by `indices` 64 | ''' 65 | return dict( 66 | observations=self._observations[indices], 67 | actions=self._actions[indices], 68 | rewards=self._rewards[indices], 69 | terminals=self._terminals[indices], 70 | hiddens=self._hiddens[indices], 71 | tasks=self._tasks[indices], 72 | ) 73 | 74 | def random_batch(self, batch_size): 75 | ''' 76 | return random sample of `batch_size` transitions 77 | ''' 78 | indices = np.random.randint(0, self._size, batch_size) 79 | return self.get_samples(indices) 80 | 81 | def all_batch(self): 82 | ''' 83 | return all data in the buffer 84 | ''' 85 | indices = list(range(self._size)) 86 | return self.get_samples(indices) 87 | 88 | def num_steps_can_sample(self): 89 | return self._size 90 | 91 | 92 | 93 | class PPOReplayBuffer(object): 94 | ''' 95 
| replay buffer for PPO algorithm 96 | store fixed log probs, advantages, and returns for use in multiple updates 97 | 98 | n.b. samples must be added as a batch, and we assume that the 99 | batch is the same size as that of the simple buffer 100 | ''' 101 | 102 | def __init__(self, simple_buffer): 103 | self.simple_buffer = simple_buffer 104 | self.max_size = self.simple_buffer.max_size 105 | self.flush() 106 | 107 | def flush(self): 108 | self.simple_buffer.flush() 109 | self._log_probs = np.zeros((self.max_size, 1)) 110 | self._advantages = np.zeros((self.max_size, 1)) 111 | self._returns = np.zeros((self.max_size, 1)) 112 | 113 | def add_samples(self, lp, adv, ret): 114 | self._log_probs = lp 115 | self._advantages = adv 116 | self._returns = ret 117 | 118 | def get_samples(self, indices): 119 | return dict( 120 | log_probs = self._log_probs[indices], 121 | advantages = self._advantages[indices], 122 | returns = self._returns[indices], 123 | ) 124 | 125 | def random_batch(self, batch_size): 126 | indices = np.random.randint(0, self.simple_buffer._size, batch_size) 127 | simple = self.simple_buffer.get_samples(indices) 128 | ppo = self.get_samples(indices) 129 | return {**simple, **ppo} 130 | -------------------------------------------------------------------------------- /hw5/meta/requirements.txt: -------------------------------------------------------------------------------- 1 | mujoco-py==1.50.1.56 2 | gym==0.10.5 3 | tensorflow==1.10.0 4 | numpy==1.14.5 5 | scipy==1.1.0 6 | tensorflow-probability==0.3.0 7 | seaborn 8 | Box2D==2.3.2 9 | -------------------------------------------------------------------------------- /hw5/sac/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5b: Soft Actor Critic 2 | Original code from Tuomas Haarnoja, Soroush Nasiriany, and Aurick Zhou for CS294-112 Fall 2018 3 | 4 | Dependencies: 5 | * Python **3.4.5** 6 | * Numpy version **1.15.2** 7 | * TensorFlow version **1.10.0** 8 | * tensorflow-probability version **0.4.0** 9 | * OpenAI Gym version **0.10.8** 10 | * MuJoCo version **1.50** and mujoco-py **1.50.1.59** 11 | * seaborn version **0.9.0** 12 | 13 | You will implement `sac.py`, and `nn.py`. 14 | 15 | See the [HW5 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5b.pdf) for further instructions. 16 | -------------------------------------------------------------------------------- /hw5/sac/environment.yml: -------------------------------------------------------------------------------- 1 | name: hw5-sac 2 | dependencies: 3 | - python==3.4.5 4 | - pip: 5 | - gym==0.10.8 6 | - numpy==1.15.2 7 | - tensorflow==1.10.0 8 | - tensorflow-probability==0.4.0 9 | - mujoco-py==1.50.1.59 10 | - seaborn==0.9.0 11 | -------------------------------------------------------------------------------- /hw5/sac/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, indent=2, separators=(',', ': '), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/sac/nn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras import layers 4 | from tensorflow_probability import distributions 5 | from tensorflow.python import keras 6 | from tensorflow.python.keras.engine.network import Network 7 | 8 | 9 | class QFunction(Network): 10 | def __init__(self, hidden_layer_sizes, **kwargs): 11 | super(QFunction, self).__init__(**kwargs) 12 | self._hidden_layer_sizes = hidden_layer_sizes 13 | 14 | def build(self, input_shape): 15 | inputs = [ 16 | layers.Input(batch_shape=input_shape[0], name='observations'), 17 | layers.Input(batch_shape=input_shape[1], name='actions') 18 | ] 19 | 20 | x = layers.Concatenate(axis=1)(inputs) 21 | for hidden_units in self._hidden_layer_sizes: 22 | x = layers.Dense(hidden_units, activation='relu')(x) 23 | q_values = layers.Dense(1, activation=None)(x) 24 | 25 | self._init_graph_network(inputs, q_values) 26 | super(QFunction, self).build(input_shape) 27 | 28 | 29 | class ValueFunction(Network): 30 | def __init__(self, hidden_layer_sizes, **kwargs): 31 | super(ValueFunction, self).__init__(**kwargs) 32 | self._hidden_layer_sizes = hidden_layer_sizes 33 | 34 | def build(self, input_shape): 35 | inputs = layers.Input(batch_shape=input_shape, name='observations') 36 | 37 | x = inputs 38 | for hidden_units in self._hidden_layer_sizes: 39 | x = layers.Dense(hidden_units, activation='relu')(x) 40 | values = layers.Dense(1, activation=None)(x) 41 | 42 | self._init_graph_network(inputs, values) 43 | super(ValueFunction, self).build(input_shape) 44 | 45 | 46 | class GaussianPolicy(Network): 47 | def __init__(self, action_dim, hidden_layer_sizes, reparameterize, **kwargs): 48 | super(GaussianPolicy, self).__init__(**kwargs) 49 | self._action_dim = action_dim 50 | self._f = None 51 | self._hidden_layer_sizes = hidden_layer_sizes 52 | self._reparameterize = reparameterize 53 | 54 | def 
build(self, input_shape): 55 | inputs = layers.Input(batch_shape=input_shape, name='observations') 56 | 57 | x = inputs 58 | for hidden_units in self._hidden_layer_sizes: 59 | x = layers.Dense(hidden_units, activation='relu')(x) 60 | 61 | mean_and_log_std = layers.Dense( 62 | self._action_dim * 2, activation=None)(x) 63 | 64 | def create_distribution_layer(mean_and_log_std): 65 | mean, log_std = tf.split( 66 | mean_and_log_std, num_or_size_splits=2, axis=1) 67 | log_std = tf.clip_by_value(log_std, -20., 2.) 68 | 69 | distribution = distributions.MultivariateNormalDiag( 70 | loc=mean, 71 | scale_diag=tf.exp(log_std)) 72 | 73 | raw_actions = distribution.sample() 74 | if not self._reparameterize: 75 | ### Problem 1.3.A 76 | ### YOUR CODE HERE 77 | raise NotImplementedError 78 | log_probs = distribution.log_prob(raw_actions) 79 | log_probs -= self._squash_correction(raw_actions) 80 | 81 | actions = None 82 | ### Problem 2.A 83 | ### YOUR CODE HERE 84 | raise NotImplementedError 85 | 86 | return actions, log_probs 87 | 88 | samples, log_probs = layers.Lambda(create_distribution_layer)( 89 | mean_and_log_std) 90 | 91 | self._init_graph_network(inputs=inputs, outputs=[samples, log_probs]) 92 | super(GaussianPolicy, self).build(input_shape) 93 | 94 | def _squash_correction(self, raw_actions): 95 | ### Problem 2.B 96 | ### YOUR CODE HERE 97 | raise NotImplementedError 98 | 99 | def eval(self, observation): 100 | assert self.built and observation.ndim == 1 101 | 102 | if self._f is None: 103 | self._f = keras.backend.function(self.inputs, [self.outputs[0]]) 104 | 105 | action, = self._f([observation[None]]) 106 | return action.flatten() 107 | -------------------------------------------------------------------------------- /hw5/sac/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
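For example: python plot.py data/test1 data/test2 --legend legend1 legend2 assigns one legend title per logdir, in order; note that this copy of the plotter defaults --value to LastEpReturn rather than AverageReturn.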
-------------------------------------------------------------------------------- /hw5/sac/plot.py: --------------------------------------------------------------------------------

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import json
import os

"""
Using the plotter:

Call it from the command line, and supply it with logdirs to experiments.
Suppose you ran an experiment with name 'test', and you ran 'test' for 10
random seeds. The runner code stored it in the directory structure

    data
    L test_EnvName_DateTime
        L 0
            L log.txt
            L params.json
        L 1
            L log.txt
            L params.json
        .
        .
        .
        L 9
            L log.txt
            L params.json

To plot learning curves from the experiment, averaged over all random
seeds, call

    python plot.py data/test_EnvName_DateTime --value AverageReturn

and voila. To see a different statistic, change what you put in for
the keyword --value. You can also enter /multiple/ values, and it will
make all of them in order.


Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
a different set of hyperparameters from 'test1', and now you would like
to compare them -- see their learning curves side-by-side. Just call

    python plot.py data/test1 data/test2

and it will plot them both! They will be given titles in the legend according
to their exp_name parameters. If you want to use custom legend titles, use
the --legend flag and then provide a title for each logdir.

"""

def plot_data(data, value="AverageReturn"):
    if isinstance(data, list):
        data = pd.concat(data, ignore_index=True)

    sns.set(style="darkgrid", font_scale=1.5)
    sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
    plt.legend(loc='best').draggable()
    plt.show()


def get_datasets(fpath, condition=None):
    unit = 0
    datasets = []
    for root, dir, files in os.walk(fpath):
        if 'log.txt' in files:
            param_path = open(os.path.join(root, 'params.json'))
            params = json.load(param_path)
            exp_name = params['exp_name']

            log_path = os.path.join(root, 'log.txt')
            experiment_data = pd.read_table(log_path)

            experiment_data.insert(
                len(experiment_data.columns),
                'Unit',
                unit
            )
            experiment_data.insert(
                len(experiment_data.columns),
                'Condition',
                condition or exp_name
            )

            datasets.append(experiment_data)
            unit += 1

    return datasets


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('logdir', nargs='*')
    parser.add_argument('--legend', nargs='*')
    parser.add_argument('--value', default='LastEpReturn', nargs='*')
    args = parser.parse_args()

    use_legend = False
    if args.legend is not None:
        assert len(args.legend) == len(args.logdir), \
            "Must give a legend title for each set of experiments."
        use_legend = True

    data = []
    if use_legend:
        for logdir, legend_title in zip(args.logdir, args.legend):
            data += get_datasets(logdir, legend_title)
    else:
        for logdir in args.logdir:
            data += get_datasets(logdir)

    if isinstance(args.value, list):
        values = args.value
    else:
        values = [args.value]
    for value in values:
        plot_data(data, value=value)

if __name__ == "__main__":
    main()
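Combined with the directory naming built in train_mujoco.py below (data/sac_<env_name>_<exp_name>_<date-time>/<seed>/log.txt), a typical plotting call might look like the following, where the experiment name 'reparam' and the timestamp are placeholders rather than real runs:

python plot.py data/sac_HalfCheetah-v2_reparam_<date-time> --value LastEpReturn

Several logdirs can be passed in one call, with --legend supplying one title per logdir, as described in the docstring above.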
-------------------------------------------------------------------------------- /hw5/sac/train_mujoco.py: --------------------------------------------------------------------------------

import argparse
import gym
import logz
import numpy as np
import os
import tensorflow as tf
import time

import nn
from sac import SAC
import utils

from multiprocessing import Process

def train_SAC(env_name, exp_name, seed, logdir):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': False,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': False,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    with tf.Session(config=tf_config):
        algorithm.build(
            env=env,
            policy=policy,
            q_function=q_function,
            q_function2=q_function2,
            value_function=value_function,
            target_value_function=target_value_function)

        for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
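Note that the assignment's other configurations are switched on by editing algorithm_params inside train_SAC rather than through command-line flags. A hypothetical override, using only keys that already appear in the dict above:

# Hypothetical edits inside train_SAC for the other configurations:
algorithm_params.update({
    'reparameterize': True,  # passed through to nn.GaussianPolicy above
    'two_qf': True,          # makes the branch above build q_function2 as well
})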
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--exp_name', type=str, default=None)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    args = parser.parse_args()

    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')

    if not (os.path.exists(data_path)):
        os.makedirs(data_path)
    logdir = 'sac_' + args.env_name + '_' + args.exp_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join(data_path, logdir)

    processes = []

    for e in range(args.n_experiments):
        seed = args.seed + 10*e
        print('Running experiment with seed %d' % seed)

        def train_func():
            train_SAC(
                env_name=args.env_name,
                exp_name=args.exp_name,
                seed=seed,
                logdir=os.path.join(logdir, '%d' % seed),
            )
        # Awkward hacky process runs, because Tensorflow does not like
        # repeatedly calling train_SAC in the same thread.
        p = Process(target=train_func, args=tuple())
        p.start()
        processes.append(p)
        # if you comment in the line below, then the loop will block
        # until this process finishes
        # p.join()

    for p in processes:
        p.join()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------