├── .gitignore
├── README.md
├── hw1
├── DAgger.py
├── Readme.md
├── behavior_cloning.py
├── experts
│ ├── Ant-v2.pkl
│ ├── HalfCheetah-v2.pkl
│ ├── Hopper-v2.pkl
│ ├── Humanoid-v2.pkl
│ ├── Reacher-v2.pkl
│ └── Walker2d-v2.pkl
├── hw1.bash
├── load_policy.py
├── plot.py
├── run_expert.py
└── tf_util.py
├── hw2
├── README.md
├── data_HalfCheetah_8
│ ├── hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ └── hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14
│ │ ├── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ ├── 11
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ └── 21
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_InvertedPendulum
│ └── hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50
│ │ ├── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ ├── 11
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ └── 21
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_large
│ ├── lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ └── lib_rtg_na_CartPole-v0_18-09-2018_00-58-19
│ │ ├── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ ├── 11
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ └── 21
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_lunar
│ └── ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43
│ │ ├── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ ├── 11
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ └── 21
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_small
│ ├── sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ └── sb_rtg_na_CartPole-v0_18-09-2018_00-36-45
│ │ ├── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ ├── 11
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ └── 21
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── hw2.bash
├── logz.py
├── lunar_lander.py
├── plot.py
└── train_pg_f18.py
├── hw3
├── DDQN_Pong.pkl
├── DQNAtari_Ponglr_multi0.1.pkl
├── DQNAtari_Ponglr_multi10.0.pkl
├── DQNAtari_Ponglr_multi5.0.pkl
├── DQN_Pong.pkl
├── Deep_RL_Assignment_3__Q_Learning_and_Actor_Critic.pdf
├── README.md
├── atari_wrappers.py
├── data:pkl
│ ├── 1a898ddf-2704-4168-b92f-beca2086c5ffAtari_DDQN.pkl.pkl
│ ├── 3804ab6d-065b-4f94-aa54-ba957272c6b9Lander.pkl
│ ├── 43109373-50a0-47a8-b483-17921386ed82Lander.pkl
│ ├── 518f88f0-7ffa-47ae-b705-365b31717729Lander.pkl
│ ├── 63926721-2624-40a7-b029-cee54d11097aLander.pkl
│ ├── 8425b8e8-19c8-418e-91c2-8131d6e72849Lander_vanilla.pkl
│ ├── 9e01eaef-6082-423a-9ff2-66798a5d1942Lander.pkl
│ ├── Atari_DDQN.pkl
│ ├── DDQN-Lunar-test1.pkl
│ ├── DDQNFalseLander.pkl
│ ├── DDQNFalseLander_1e4.pkl
│ ├── DDQNFalseLander_lr2e3.pkl
│ ├── DDQNFalseLander_lr3e3.pkl
│ ├── DDQNTrueLander.pkl
│ ├── DQN-Atari-Pong.pkl
│ ├── DQN-Lunar-2
│ ├── DQN-Pong.pkl
│ ├── b7445890-58aa-4fea-9628-bc1f08fdde62Lander.pkl
│ └── ba946a9b-c079-4ab6-b343-d1bccfc75be6Lander_DQN.pkl
├── data_CartPole
│ ├── .DS_Store
│ ├── ac_100_1_CartPole-v0_02-10-2018_17-05-47
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── ac_10_10_CartPole-v0_02-10-2018_17-09-03
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── ac_1_100_CartPole-v0_02-10-2018_17-07-35
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── 11
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ └── 21
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ └── ac_1_1_CartPole-v0_02-10-2018_09-37-30
│ │ ├── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ ├── 11
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ └── 21
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_HalfCheetah
│ ├── .DS_Store
│ └── ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06
│ │ ├── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ ├── 11
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ └── 21
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_InvertedPendulum
│ └── ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45
│ │ ├── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ ├── 11
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ └── 21
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── dqn.py
├── dqn_utils.py
├── figures
│ ├── p1q1.png
│ ├── p1q2.png
│ ├── p1q3.png
│ ├── p2q1.png
│ ├── p2q2_1.png
│ └── p2q2_2.png
├── hw3.pdf
├── logz.py
├── lunar_lander.py
├── p1q1.py
├── p1q2.py
├── p1q3.py
├── plot.py
├── plot_q_learning.ipynb
├── requirements.txt
├── run_dqn_atari.py
├── run_dqn_lander.py
├── run_dqn_ram.py
└── train_ac_f18.py
├── hw4
├── Deep_RL_Assignment_4__Model_Based_RL.pdf
├── Readme.md
├── half_cheetah_env.py
├── logger.py
├── main.py
├── model_based_policy.py
├── model_based_rl.py
├── plot.py
├── requirements.txt
├── run_all.sh
├── tabulate.py
├── timer.py
└── utils.py
└── hw5
├── exp
├── README.md
├── density_model.py
├── ex_utils.py
├── exploration.py
├── hw5a.pdf
├── logz.py
├── plot.py
├── pointmass.py
├── replay.py
├── requirements.txt
├── run_all.sh
├── sparse_half_cheetah.py
└── train_ac_exploration_f18.py
├── meta
├── Deep_RL_Assignment_5__Meta_Reinforcement_Learning.pdf
├── README.md
├── data
│ ├── mlp_1_pm_13-11-2018_20-57-59
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── mlp_30_pm_13-11-2018_20-48-55
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── mlp_50_pm_14-11-2018_20-05-53
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── mlp_60_pm_13-11-2018_23-01-39
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── pro1_pm-obs_13-11-2018_01-08-37
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── pro3_rnn_60_g_1_pm_15-11-2018_01-30-55
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── pro3_rnn_60_g_2_pm_14-11-2018_16-22-59
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── pro3_rnn_60_g_4_pm_15-11-2018_01-34-18
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── rnn_1_pm_13-11-2018_21-05-16
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── rnn_30_pm_13-11-2018_19-34-21
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── rnn_50_pm_14-11-2018_10-34-08
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ └── rnn_60_pm_13-11-2018_17-27-20
│ │ └── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_pro1
│ ├── pro1_pm-obs_13-11-2018_01-08-37
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ └── prob1.png
├── data_pro2_1
│ ├── mlp_1_pm_13-11-2018_20-57-59
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── pro2_1.png
│ └── rnn_1_pm_13-11-2018_21-05-16
│ │ └── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_pro2_30
│ ├── mlp_30_pm_13-11-2018_20-48-55
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── prob_30.png
│ └── rnn_30_pm_13-11-2018_19-34-21
│ │ └── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_pro2_50
│ ├── mlp_50_pm_14-11-2018_20-05-53
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── pro2_50.png
│ └── rnn_50_pm_14-11-2018_10-34-08
│ │ └── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_pro2_60
│ ├── mlp_60_pm_13-11-2018_23-01-39
│ │ └── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ ├── pro2_60.png
│ └── rnn_60_pm_13-11-2018_17-27-20
│ │ └── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
├── data_pro3
│ ├── pro3_rnn_60_g_1_pm_15-11-2018_01-30-55
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── g_1.png
│ │ ├── g_1_avg.png
│ │ └── g_1_val.png
│ ├── pro3_rnn_60_g_2_pm_14-11-2018_16-22-59
│ │ ├── 1
│ │ │ ├── log.txt
│ │ │ ├── params.json
│ │ │ └── vars.pkl
│ │ ├── g_2.png
│ │ ├── g_2_avg.png
│ │ └── g_2_val.png
│ └── pro3_rnn_60_g_4_pm_15-11-2018_01-34-18
│ │ ├── 1
│ │ ├── log.txt
│ │ ├── params.json
│ │ └── vars.pkl
│ │ ├── g_4.png
│ │ ├── g_4_avg.png
│ │ └── g_4_val.png
├── logz.py
├── plot.py
├── point_mass.py
├── point_mass_observed.py
├── replay_buffer.py
├── requirements.txt
└── train_policy.py
└── sac
├── README.md
├── environment.yml
├── logz.py
├── nn.py
├── plot.py
├── sac.py
├── train_mujoco.py
└── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
3 |
4 | */.DS_Store
5 |
6 | __pycache__
7 |
8 | Thumbs.db
9 |
10 | .ipynb_checkpoints/
11 |
12 | .gitignore
13 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CS294-112-Deep-Reinforcement-Learning
2 |
3 |
4 | ---
5 |
6 | **- These are my assignments and projects for the CS294-112 Deep Reinforcement Learning course at UC Berkeley, Fall 2018.**
7 |
8 |
9 | **- For assignment details, please see the specific homework folders.**
10 |
11 | **- The course website is [CS294-112](http://rail.eecs.berkeley.edu/deeprlcourse/)**
12 |
--------------------------------------------------------------------------------
/hw1/Readme.md:
--------------------------------------------------------------------------------
1 | # CS294-112 HW 1: Imitation Learning
2 |
3 | ---
4 |
5 | ### Run the bash script
6 | ###### `./hw1.bash`
7 | ### to get all results of hw1
8 |
9 |
10 |
11 |
12 | ---
13 | ##### The following steps are detailed guidance for Sections 2 and 3 of hw1
14 | In order to run this assignment, first create a folder named **expert_data**, which stores the output data from the expert policy:
15 |
16 | `mkdir expert_data`
17 |
18 | 1. Load the expert policy and generate rollout data
19 | Run `python run_expert.py experts/task.pkl task --render --num_rollouts [num]` to run the expert policy
20 | E.g.
21 | `python run_expert.py experts/Hopper-v2.pkl Hopper-v2 --render --num_rollouts 20` for the Hopper task
22 | `python run_expert.py experts/Reacher-v2.pkl Reacher-v2 --render --num_rollouts 400` for the Reacher task
23 |
24 | 2. Implement behavior cloning
25 | Run `python behavior_cloning.py experts/task.pkl task --render --num_rollouts [num]` to implement BC
26 | E.g.
27 | `python behavior_cloning.py experts/Hopper-v2.pkl Hopper-v2 --render --num_rollouts 20` for the Hopper task
28 | `python behavior_cloning.py experts/Reacher-v2.pkl Reacher-v2 --render --num_rollouts 400` for the Reacher task
29 | This command generates a `.pkl` file that saves the mean and standard deviation of the reward as the number of training epochs increases
30 |
31 | 3. Implement DAgger
32 | Run `python DAgger.py experts/Hopper-v2.pkl Hopper-v2 --render --num_rollouts 20` for the Hopper task
33 | This command also generates a `.pkl` file that saves the mean and standard deviation of the reward over DAgger iterations.
34 |
35 | 4. Plot
36 | With the `.pkl` files generated in steps 2 and 3, run
37 | `python plot.py Hopper-v2 --num_rollouts 20` to generate figures for behavior cloning and DAgger (the format of these files is sketched below)
38 |
39 |
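For reference, here is a minimal sketch of inspecting the generated files (assuming the default Hopper-v2 run with 20 rollouts; the filenames follow the patterns used by `run_expert.py` and `behavior_cloning.py`):

```python
import pickle

# Expert rollouts written by run_expert.py into expert_data/
with open('expert_data/Hopper-v2_20_data.pkl', 'rb') as f:
    expert = pickle.load(f)
print(expert['observations'].shape)  # (num_steps, obs_dim)
print(expert['actions'].shape)       # (num_steps, 1, act_dim)

# Behavior-cloning results written by behavior_cloning.py
with open('Hopper-v2_20_bc_data.pkl', 'rb') as f:
    bc = pickle.load(f)
print(bc['mean_reward'])  # mean return, recorded every 10 training epochs
print(bc['std_reward'])   # corresponding standard deviations
```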
--------------------------------------------------------------------------------
/hw1/behavior_cloning.py:
--------------------------------------------------------------------------------
1 | # Implement behavior cloning
2 |
3 |
4 | import tensorflow as tf
5 | import pickle
6 | import numpy as np
7 | import tf_util
8 | import argparse
9 | import load_policy
10 | import gym
11 | from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed; model_selection provides the same function
12 | from sklearn.utils import shuffle
13 |
14 |
15 | # Parameters
16 |
17 | learning_rate = 0.001
18 | num_epoch = 100
19 | batch_size = 128
20 |
21 | # Network Parameters
22 |
23 | num_hid_1 = 128
24 | num_hid_2 = 128
25 |
26 |
27 | #Load training data from expert demonstrations generated by run_expert.py
28 | def load_expert_data (filename):
29 | with open (filename, 'rb') as f:
30 | data = pickle.loads(f.read())
31 | return data
32 |
33 | def data_preprocessing(x, y):
34 |
35 | x, y = shuffle(x, y, random_state=0)
36 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)
37 | y_train = y_train.reshape(y_train.shape[0], y_train.shape[2])
38 | y_test = y_test.reshape(y_test.shape[0], y_test.shape[2])
39 |
40 | return x_train, x_test, y_train, y_test
41 |
42 | def next_batch(batch_size, x, y):
43 |
44 | indices = np.random.randint(low = 0, high = len(x), size = batch_size)
45 | input_batch = x[indices]
46 | label_batch = y[indices]
47 |
48 | return input_batch, label_batch
49 |
50 | def network_model(num_obs, num_act):
51 |
52 | x = tf.placeholder(tf.float32, shape = [None, num_obs], name = 'x')
53 | y = tf.placeholder(tf.float32, shape = [None, num_act], name = 'y')
54 | layer_1 = tf.layers.dense(x, num_hid_1, activation = tf.nn.relu, use_bias=True)
55 | layer_2 = tf.layers.dense(layer_1, num_hid_2, activation = tf.nn.relu, use_bias = True)
56 | output = tf.layers.dense(layer_2, num_act, activation = None, use_bias = True)
57 |
58 | return output, x, y
59 |
60 | def train_network(output, y):
61 | loss = tf.losses.mean_squared_error(output, y)
62 | train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
63 |
64 | return loss, train_op
65 |
66 |
67 |
68 | def main():
69 | parser = argparse.ArgumentParser();
70 | parser.add_argument('expert_policy_file', type=str)
71 | parser.add_argument('envname', type=str)
72 | parser.add_argument('--render', action='store_true')
73 | parser.add_argument("--max_timesteps", type=int)
74 | parser.add_argument('--num_rollouts', type=int, default=20,
75 | help='Number of expert roll outs')
76 | args = parser.parse_args()
77 |
78 | task = args.envname
79 | dataset = 'expert_data/' + args.envname + '_' + str(args.num_rollouts) + '_data.pkl'
80 |
81 |
82 | #Load training data
83 | data = load_expert_data(dataset)
84 | observations = np.array(data['observations'])
85 | actions = np.array(data['actions'])
86 | num_obs = observations.shape[1]
87 | num_act = actions.shape[2]
88 |
89 | obs_train, obs_test, act_train, act_test = data_preprocessing(observations, actions)
90 |
91 | output, x, y = network_model(num_obs, num_act)
92 |
93 | lossfunction, train_op = train_network(output, y)
94 |
95 | tf.add_to_collection('pred_network', output)
96 |
97 | mean_reward = []
98 | std_reward = []
99 |
100 |
101 | # Train
102 | init = tf.global_variables_initializer()
103 | #model_path = './bc_policy/' + task + '_' + str(args.num_rollouts) + '_bc'
104 | #builder = tf.saved_model.builder.SavedModelBuilder(model_path)
105 | with tf.Session() as sess:
106 | sess.run(init)
107 |
108 | for epoch in range(num_epoch + 1):
109 |
110 | num_batch = int(len(obs_train) / batch_size)
111 |
112 | for num in range(num_batch):
113 |
114 | obs_train_batch, act_train_batch = next_batch(batch_size, obs_train, act_train)
115 |
116 | sess.run(train_op, feed_dict = {x: obs_train_batch, y: act_train_batch})
117 |
118 | if epoch % 10 == 0:
119 |
120 | loss = sess.run(lossfunction, feed_dict = {x: obs_train, y: act_train})
121 |
122 | print("Number of Epoch: %d, Training Loss = %.08f "%(epoch, loss))
123 |
124 | test_output = sess.run(output, feed_dict = {x: obs_test})
125 |
126 | testloss = np.mean((test_output - act_test)**2)
127 |
128 | print ("Testing loss = %.08f" % testloss)
129 |
130 | env = gym.make(args.envname)
131 | max_steps = args.max_timesteps or env.spec.timestep_limit
132 |
133 | returns = []
134 | observations = []
135 | actions = []
136 | for i in range(args.num_rollouts):
137 | print('iter', i)
138 | obs = env.reset()
139 | done = False
140 | totalr = 0.
141 | steps = 0
142 | while not done:
143 |
144 | pre_action = sess.run(output, feed_dict = {x:obs[None,:]})
145 | observations.append(obs)
146 | actions.append(pre_action)
147 | obs, r, done, _ = env.step(pre_action)
148 | totalr += r
149 | steps += 1
150 | if args.render:
151 | env.render()
152 | #if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
153 | if steps >= max_steps:
154 | break
155 | returns.append(totalr)
156 |
157 | print('returns', returns)
158 | print('mean return', np.mean(returns))
159 | print('std of return', np.std(returns))
160 | mean_reward.append(np.mean(returns))
161 | std_reward.append(np.std(returns))
162 |
163 | #builder.add_meta_graph_and_variables(sess, ['Training'])
164 | #builder.save
165 | BC_result = {'mean_reward': np.array(mean_reward),
166 | 'std_reward': np.array(std_reward)}
167 |
168 |
169 | outfilename = './' + args.envname + '_' + str(args.num_rollouts) + '_bc_data.pkl'
170 |
171 | with open((outfilename), 'wb') as f:
172 | pickle.dump(BC_result, f, pickle.HIGHEST_PROTOCOL)
173 |
174 |
175 |
176 |
177 |
178 | if __name__ == '__main__':
179 | main()
180 |
181 |
--------------------------------------------------------------------------------
/hw1/experts/Ant-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Ant-v2.pkl
--------------------------------------------------------------------------------
/hw1/experts/HalfCheetah-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/HalfCheetah-v2.pkl
--------------------------------------------------------------------------------
/hw1/experts/Hopper-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Hopper-v2.pkl
--------------------------------------------------------------------------------
/hw1/experts/Humanoid-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Humanoid-v2.pkl
--------------------------------------------------------------------------------
/hw1/experts/Reacher-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Reacher-v2.pkl
--------------------------------------------------------------------------------
/hw1/experts/Walker2d-v2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Walker2d-v2.pkl
--------------------------------------------------------------------------------
/hw1/hw1.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -eux
3 | if [ ! -d "expert_data" ]; then
4 | mkdir expert_data
5 | fi
6 |
7 | python run_expert.py experts/Hopper-v2.pkl Hopper-v2 --num_rollouts 20
8 | python run_expert.py experts/Reacher-v2.pkl Reacher-v2 --num_rollouts 400
9 | python behavior_cloning.py experts/Hopper-v2.pkl Hopper-v2 --num_rollouts 20
10 | python behavior_cloning.py experts/Reacher-v2.pkl Reacher-v2 --num_rollouts 400
11 | python DAgger.py experts/Hopper-v2.pkl Hopper-v2 --num_rollouts 20
12 | python plot.py Hopper-v2 --num_rollouts 20
--------------------------------------------------------------------------------
/hw1/load_policy.py:
--------------------------------------------------------------------------------
1 | import pickle, tensorflow as tf, tf_util, numpy as np
2 |
3 | def load_policy(filename):
4 | with open(filename, 'rb') as f:
5 | data = pickle.loads(f.read())
6 |
7 | # assert len(data.keys()) == 2
8 | nonlin_type = data['nonlin_type']
9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0]
10 |
11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type)
12 | policy_params = data[policy_type]
13 |
14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'}
15 |
16 | # Keep track of input and output dims (i.e. observation and action dims) for the user
17 |
18 | def build_policy(obs_bo):
19 | def read_layer(l):
20 | assert list(l.keys()) == ['AffineLayer']
21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b']
22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32)
23 |
24 | def apply_nonlin(x):
25 | if nonlin_type == 'lrelu':
26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233
27 | elif nonlin_type == 'tanh':
28 | return tf.tanh(x)
29 | else:
30 | raise NotImplementedError(nonlin_type)
31 |
32 | # Build the policy. First, observation normalization.
33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer']
34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D']
35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D']
36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean)))
37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape)
38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation
39 |
40 | curr_activations_bd = normedobs_bo
41 |
42 | # Hidden layers next
43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet']
44 | layer_params = policy_params['hidden']['FeedforwardNet']
45 | for layer_name in sorted(layer_params.keys()):
46 | l = layer_params[layer_name]
47 | W, b = read_layer(l)
48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b)
49 |
50 | # Output layer
51 | W, b = read_layer(policy_params['out'])
52 | output_bo = tf.matmul(curr_activations_bd, W) + b
53 | return output_bo
54 |
55 | obs_bo = tf.placeholder(tf.float32, [None, None])
56 | a_ba = build_policy(obs_bo)
57 | policy_fn = tf_util.function([obs_bo], a_ba)
58 | return policy_fn
--------------------------------------------------------------------------------
/hw1/plot.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 | def load_expert_data (filename):
6 | with open (filename, 'rb') as f:
7 | data = pickle.loads(f.read())
8 | return data
9 |
10 |
11 | def main():
12 |
13 |
14 | #Behavior cloning result with the number of epoch
15 |
16 | BC_path = 'Hopper-v2_20_bc_data.pkl'
17 | BC_result = load_expert_data(BC_path)
18 | BC_mean = np.array(BC_result['mean_reward'])
19 | BC_std = np.array(BC_result['std_reward'])
20 | epoch = np.arange(0, 101, 10)
21 | BC_plot = plt.figure(1)
22 | p1, = plt.plot(epoch, BC_mean, color='blue', label='Behavior Cloning')
23 | plt.errorbar(epoch, BC_mean, ecolor='r', color='blue', yerr = BC_std, fmt = '-o', elinewidth=2, capsize=4)
24 | plt.suptitle('Behavioral Cloning: Epochs vs. Reward', fontsize=20)
25 | plt.xlabel('Number of Training Epochs')
26 | plt.ylabel('Mean Reward')
27 | plt.legend()
28 | plt.show()
29 |
30 |
31 |
32 |
33 | DAgger_path = './Hopper-v2_20_data.pkl'
34 | DAgger_result = load_expert_data(DAgger_path)
35 | mean = np.array(DAgger_result['mean_reward'])
36 | std = np.array(DAgger_result['std_reward'])
37 | iteration = np.arange(std.shape[0])
38 | iteration = iteration + 1
39 |
40 |
41 | DAgger_plot = plt.figure(2)
42 | Dag, = plt.plot(iteration, mean, marker = '*', color='b', label='DAgger Policy')
43 | plt.errorbar(iteration, mean, yerr = std, fmt = '-*',color='b',ecolor='r' , elinewidth=2, capsize=4)
44 | plt.suptitle('DAgger Iterations vs. Reward', fontsize=20)
45 | plt.xlabel('DAgger Iteration')
46 | plt.ylabel('Mean Reward')
47 | plt.xlim([0, 6.5])
48 | plt.ylim([1000, 4000])
49 | expert = plt.axhline(y=3778.4842779089204, color='k', label='Expert Policy')  # reference: expert policy mean return
50 | bc = plt.axhline(y=2009.9990, color='g', label='Behavioral Cloning')  # reference: behavior cloning mean return
51 | plt.legend(loc= 4)
52 | plt.show()
53 |
54 |
55 |
56 |
57 |
58 | if __name__ == '__main__':
59 | main()
60 |
--------------------------------------------------------------------------------
/hw1/run_expert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """
4 | Code to load an expert policy and generate roll-out data for behavioral cloning.
5 | Example usage:
6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \
7 | --num_rollouts 20
8 |
9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com)
10 | """
11 |
12 | import os
13 | import pickle
14 | import tensorflow as tf
15 | import numpy as np
16 | import tf_util
17 | import gym
18 | import load_policy
19 |
20 | def main():
21 | import argparse
22 | parser = argparse.ArgumentParser()
23 | parser.add_argument('expert_policy_file', type=str)
24 | parser.add_argument('envname', type=str)
25 | parser.add_argument('--render', action='store_true')
26 | parser.add_argument("--max_timesteps", type=int)
27 | parser.add_argument('--num_rollouts', type=int, default=20,
28 | help='Number of expert roll outs')
29 | args = parser.parse_args()
30 |
31 | print('loading and building expert policy')
32 | policy_fn = load_policy.load_policy(args.expert_policy_file)
33 | print('loaded and built')
34 |
35 | with tf.Session():
36 | tf_util.initialize()
37 |
38 | import gym
39 | env = gym.make(args.envname)
40 | max_steps = args.max_timesteps or env.spec.timestep_limit
41 |
42 | returns = []
43 | observations = []
44 | actions = []
45 | for i in range(args.num_rollouts):
46 | print('steps', max_steps)
47 | print('iter', i)
48 | obs = env.reset()
49 | done = False
50 | totalr = 0.
51 | steps = 0
52 | while not done:
53 | action = policy_fn(obs[None,:])
54 | observations.append(obs)
55 | actions.append(action)
56 | obs, r, done, _ = env.step(action)
57 | totalr += r
58 | steps += 1
59 | if args.render:
60 | env.render()
61 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps))
62 | if steps >= max_steps:
63 | break
64 | returns.append(totalr)
65 |
66 | print('returns', returns)
67 | print('mean return', np.mean(returns))
68 | print('std of return', np.std(returns))
69 |
70 | expert_data = {'observations': np.array(observations),
71 | 'actions': np.array(actions)}
72 |
73 | outfilename = 'expert_data/' + args.envname + '_' + str(args.num_rollouts) + '_data.pkl'
74 |
75 | with open((outfilename), 'wb') as f:
76 | pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL)
77 |
78 | if __name__ == '__main__':
79 | main()
80 |
--------------------------------------------------------------------------------
/hw2/README.md:
--------------------------------------------------------------------------------
1 | # CS294-112 HW 2: Policy Gradient
2 |
3 | All command-line invocations used to run my experiments are stored in the `hw2.bash` script, annotated by problem.
4 |
5 | If you want to run the whole experiment, just run:
6 | `./hw2.bash` in the folder containing `train_pg_f18.py`
7 |
8 | (This bash script stores all data files in the `./data` folder.)
9 |
10 | I have also provided the data from my experiments:
11 |
12 | 1. For Problem 4, the small-batch data is stored in `./data_small`; run `python plot.py data_small/*` to get the graph. The large-batch data is stored in `./data_large`; run `python plot.py data_large/*` to get the graph.
13 |
14 | 2. For Problem 5, the data is stored in the `./data_InvertedPendulum` folder; run `python plot.py data_InvertedPendulum/*` to get the graph.
15 |
16 | 3. For Problem 7, the data is stored in the `./data_lunar` folder; run `python plot.py data_lunar/*` to get the graph.
17 |
18 | 4. For Problem 8, the folder `./data_HalfCheetah` contains all results with different batch sizes and learning rates, and the folder `./data_HalfCheetah_8` stores the results of the optimal setting for the four runs. Run `python plot.py data_HalfCheetah/*` to get the graph. (A short sketch of how these run folders are laid out follows this list.)
19 |
20 |
21 |
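For reference, a minimal sketch of inspecting one of the provided runs (assuming the repository layout above; the keys match the `params.json` files included in the data folders):

```python
import json

# Each experiment folder holds one subfolder per random seed (1, 11, 21),
# and each seed folder contains params.json, log.txt, and vars.pkl.
run = 'data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1'

with open(run + '/params.json') as f:
    params = json.load(f)

# A few of the hyperparameters recorded for this run
print(params['env_name'], params['learning_rate'], params['reward_to_go'], params['nn_baseline'])
```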
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_None",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : false,
14 | "seed" : 1,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_None",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : false,
14 | "seed" : 11,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_None",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : false,
14 | "seed" : 21,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_bl",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : true,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : false,
14 | "seed" : 1,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_bl",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : true,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : false,
14 | "seed" : 11,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_bl",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : true,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : false,
14 | "seed" : 21,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_rtg",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 1,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_rtg",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 11,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_rtg",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 21,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_rtg_bl",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : true,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 1,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_rtg_bl",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : true,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 11,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "hc_b50000_r0.02_rtg_bl",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 50000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : true,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 21,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "InvertedPendulum-v2",
3 | "exp_name" : "hc_b400_r0.02",
4 | "gamma" : 0.9,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1",
7 | "max_path_length" : 1000.0,
8 | "min_timesteps_per_batch" : 400,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "InvertedPendulum-v2",
3 | "exp_name" : "hc_b400_r0.02",
4 | "gamma" : 0.9,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11",
7 | "max_path_length" : 1000.0,
8 | "min_timesteps_per_batch" : 400,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "InvertedPendulum-v2",
3 | "exp_name" : "hc_b400_r0.02",
4 | "gamma" : 0.9,
5 | "learning_rate" : 0.02,
6 | "logdir" : "data/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21",
7 | "max_path_length" : 1000.0,
8 | "min_timesteps_per_batch" : 400,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "lb_no_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : false,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "lb_no_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : false,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "lb_no_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : false,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "lb_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : true,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "lb_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : true,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "lb_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : true,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "lib_rtg_na",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "lib_rtg_na",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "lib_rtg_na",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "LunarLanderContinuous-v2",
3 | "exp_name" : "ll_b40000_r0.005",
4 | "gamma" : 0.99,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1",
7 | "max_path_length" : 1000.0,
8 | "min_timesteps_per_batch" : 40000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : true,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "LunarLanderContinuous-v2",
3 | "exp_name" : "ll_b40000_r0.005",
4 | "gamma" : 0.99,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11",
7 | "max_path_length" : 1000.0,
8 | "min_timesteps_per_batch" : 40000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : true,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "LunarLanderContinuous-v2",
3 | "exp_name" : "ll_b40000_r0.005",
4 | "gamma" : 0.99,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21",
7 | "max_path_length" : 1000.0,
8 | "min_timesteps_per_batch" : 40000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : true,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "sb_no_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : false,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "sb_no_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : false,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "sb_no_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : false,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "sb_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : true,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "sb_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : true,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "sb_rtg_dna",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : false,
13 | "reward_to_go" : true,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "sb_rtg_na",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "sb_rtg_na",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11/vars.pkl
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "sb_rtg_na",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "data/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "nn_baseline" : false,
12 | "normalize_advantages" : true,
13 | "reward_to_go" : true,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21/vars.pkl
--------------------------------------------------------------------------------
/hw2/hw2.bash:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -eux
3 |
4 | #Problem 4
5 |
6 | python train_pg_f18.py CartPole-v0 -n 100 -b 1000 -e 3 -dna --exp_name sb_no_rtg_dna
7 |
8 | python train_pg_f18.py CartPole-v0 -n 100 -b 1000 -e 3 -rtg -dna --exp_name sb_rtg_dna
9 |
10 | python train_pg_f18.py CartPole-v0 -n 100 -b 1000 -e 3 -rtg --exp_name sb_rtg_na
11 |
12 | python train_pg_f18.py CartPole-v0 -n 100 -b 5000 -e 3 -dna --exp_name lb_no_rtg_dna
13 |
14 | python train_pg_f18.py CartPole-v0 -n 100 -b 5000 -e 3 -rtg -dna --exp_name lb_rtg_dna
15 |
16 | python train_pg_f18.py CartPole-v0 -n 100 -b 5000 -e 3 -rtg --exp_name lb_rtg_na
17 |
18 | #Problem 5
19 |
20 | python train_pg_f18.py InvertedPendulum-v2 -ep 1000 --discount 0.9 -n 100 -e 3 -l 2 -s 64 -b 400 -lr 0.02 -rtg --exp_name hc_b400_r0.02
21 |
22 | #Problem 7
23 |
24 | python train_pg_f18.py LunarLanderContinuous-v2 -ep 1000 --discount 0.99 -n 100 -e 3 -l 2 -s 64 -b 40000 -lr 0.005 -rtg --nn_baseline --exp_name ll_b40000_r0.005
25 |
26 | #Problem 8
27 |
28 | #Find the best batch size and learning rate
29 | for batch in 10000 30000 50000
30 | do
31 | for lr in 0.005 0.01 0.02
32 | do
33 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b $batch -lr $lr -rtg --nn_baseline --exp_name hc_b${batch}_r${lr}
34 | done
35 | done
36 |
37 |
38 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 --exp_name hc_b50000_r0.02_None
39 |
40 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 -rtg --exp_name hc_b50000_r0.02_rtg
41 |
42 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 --nn_baseline --exp_name hc_b50000_r0.02_bl
43 |
44 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 -rtg --nn_baseline --exp_name hc_b50000_r0.02_rtg_bl
45 |
46 |
--------------------------------------------------------------------------------
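Each of the hw2.bash runs above writes a `params.json` and a tab-separated `log.txt` into a per-seed subdirectory (the `data_*` dumps earlier in this listing, produced by the `logz.py` module shown next). As a small, hypothetical inspection sketch (not a script from this repo; it assumes `AverageReturn` is one of the logged diagnostics), a single run can be read back like this:

    import json
    import numpy as np

    # any seed subdirectory from the dumps above
    run_dir = "data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1"

    with open(run_dir + "/params.json") as f:
        params = json.load(f)                     # hyperparameters saved by logz.save_params
    print(params["exp_name"], params["min_timesteps_per_batch"], params["seed"])

    # log.txt is tab-separated with a header row, written by logz.dump_tabular
    log = np.genfromtxt(run_dir + "/log.txt", delimiter="\t", dtype=None, names=True, encoding=None)
    print(log["AverageReturn"][-1])               # average return at the final iteration

--------------------------------------------------------------------------------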
/hw2/logz.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | """
4 |
5 | Some simple logging functionality, inspired by rllab's logging.
6 | Assumes that each diagnostic gets logged each iteration
7 |
8 | Call logz.configure_output_dir() to start logging to a
9 | tab-separated-values file (some_folder_name/log.txt)
10 |
11 | To load the learning curves, you can do, for example
12 |
13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True)
14 | A['EpRewMean']
15 |
16 | """
17 |
18 | import os.path as osp, shutil, time, atexit, os, subprocess
19 | import pickle
20 | import tensorflow as tf
21 |
22 | color2num = dict(
23 | gray=30,
24 | red=31,
25 | green=32,
26 | yellow=33,
27 | blue=34,
28 | magenta=35,
29 | cyan=36,
30 | white=37,
31 | crimson=38
32 | )
33 |
34 | def colorize(string, color, bold=False, highlight=False):
35 | attr = []
36 | num = color2num[color]
37 | if highlight: num += 10
38 | attr.append(str(num))
39 | if bold: attr.append('1')
40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
41 |
42 | class G:
43 | output_dir = None
44 | output_file = None
45 | first_row = True
46 | log_headers = []
47 | log_current_row = {}
48 |
49 | def configure_output_dir(d=None):
50 | """
51 | Set output directory to d, or to /tmp/somerandomnumber if d is None
52 | """
53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time())
54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir
55 | os.makedirs(G.output_dir)
56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w')
57 | atexit.register(G.output_file.close)
58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True))
59 |
60 | def log_tabular(key, val):
61 | """
62 | Log a value of some diagnostic
63 | Call this once for each diagnostic quantity, each iteration
64 | """
65 | if G.first_row:
66 | G.log_headers.append(key)
67 | else:
68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key
69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
70 | G.log_current_row[key] = val
71 |
72 | def save_params(params):
73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out:
74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True))
75 |
76 | def pickle_tf_vars():
77 | """
78 | Saves tensorflow variables
79 | Requires them to be initialized first, also a default session must exist
80 | """
81 | _dict = {v.name : v.eval() for v in tf.global_variables()}
82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f:
83 | pickle.dump(_dict, f)
84 |
85 |
86 | def dump_tabular():
87 | """
88 | Write all of the diagnostics from the current iteration
89 | """
90 | vals = []
91 | key_lens = [len(key) for key in G.log_headers]
92 | max_key_len = max(15,max(key_lens))
93 | keystr = '%'+'%d'%max_key_len
94 | fmt = "| " + keystr + "s | %15s |"
95 | n_slashes = 22 + max_key_len
96 | print("-"*n_slashes)
97 | for key in G.log_headers:
98 | val = G.log_current_row.get(key, "")
99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val
100 | else: valstr = val
101 | print(fmt%(key, valstr))
102 | vals.append(val)
103 | print("-"*n_slashes)
104 | if G.output_file is not None:
105 | if G.first_row:
106 | G.output_file.write("\t".join(G.log_headers))
107 | G.output_file.write("\n")
108 | G.output_file.write("\t".join(map(str,vals)))
109 | G.output_file.write("\n")
110 | G.output_file.flush()
111 | G.log_current_row.clear()
112 | G.first_row=False
113 |
--------------------------------------------------------------------------------
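A minimal usage sketch of the `logz` API above (the directory name and diagnostic values here are made up, not taken from the repo):

    import logz

    logz.configure_output_dir("data/demo_experiment/1")    # creates the dir and opens log.txt
    logz.save_params({"exp_name": "demo", "seed": 1})      # writes params.json
    for itr in range(3):
        logz.log_tabular("Iteration", itr)                 # one call per diagnostic, every iteration
        logz.log_tabular("AverageReturn", 100.0 * itr)
        logz.dump_tabular()                                 # appends one tab-separated row to log.txt
    # logz.pickle_tf_vars()  # optionally dump TF variables to vars.pkl
    #                        # (needs initialized variables and a default TensorFlow session)

--------------------------------------------------------------------------------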
/hw2/plot.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import json
5 | import os
6 |
7 | """
8 | Using the plotter:
9 |
10 | Call it from the command line, and supply it with logdirs to experiments.
11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10
12 | random seeds. The runner code stored it in the directory structure
13 |
14 | data
15 | L test_EnvName_DateTime
16 | L 0
17 | L log.txt
18 | L params.json
19 | L 1
20 | L log.txt
21 | L params.json
22 | .
23 | .
24 | .
25 | L 9
26 | L log.txt
27 | L params.json
28 |
29 | To plot learning curves from the experiment, averaged over all random
30 | seeds, call
31 |
32 | python plot.py data/test_EnvName_DateTime --value AverageReturn
33 |
34 | and voila. To see a different statistic, change what you put in for
35 | the keyword --value. You can also enter /multiple/ values, and it will
36 | plot all of them in order.
37 |
38 |
39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
40 | a different set of hyperparameters from 'test1', and now you would like
41 | to compare them -- see their learning curves side-by-side. Just call
42 |
43 | python plot.py data/test1 data/test2
44 |
45 | and it will plot them both! They will be given titles in the legend according
46 | to their exp_name parameters. If you want to use custom legend titles, use
47 | the --legend flag and then provide a title for each logdir.
48 |
49 | """
50 |
51 | def plot_data(data, value="AverageReturn"):
52 | if isinstance(data, list):
53 | data = pd.concat(data, ignore_index=True)
54 |
55 | sns.set(style="darkgrid", font_scale=1.5)
56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
57 | plt.legend(loc='best').draggable()
58 | plt.show()
59 |
60 |
61 | def get_datasets(fpath, condition=None):
62 | unit = 0
63 | datasets = []
64 | for root, dir, files in os.walk(fpath):
65 | if 'log.txt' in files:
66 | param_path = open(os.path.join(root,'params.json'))
67 | params = json.load(param_path)
68 | exp_name = params['exp_name']
69 |
70 | log_path = os.path.join(root,'log.txt')
71 | experiment_data = pd.read_table(log_path)
72 |
73 | experiment_data.insert(
74 | len(experiment_data.columns),
75 | 'Unit',
76 | unit
77 | )
78 | experiment_data.insert(
79 | len(experiment_data.columns),
80 | 'Condition',
81 | condition or exp_name
82 | )
83 |
84 | datasets.append(experiment_data)
85 | unit += 1
86 |
87 | return datasets
88 |
89 |
90 | def main():
91 | import argparse
92 | parser = argparse.ArgumentParser()
93 | parser.add_argument('logdir', nargs='*')
94 | parser.add_argument('--legend', nargs='*')
95 | parser.add_argument('--value', default='AverageReturn', nargs='*')
96 | args = parser.parse_args()
97 |
98 | use_legend = False
99 | if args.legend is not None:
100 | assert len(args.legend) == len(args.logdir), \
101 | "Must give a legend title for each set of experiments."
102 | use_legend = True
103 |
104 | data = []
105 | if use_legend:
106 | for logdir, legend_title in zip(args.logdir, args.legend):
107 | data += get_datasets(logdir, legend_title)
108 | else:
109 | for logdir in args.logdir:
110 | data += get_datasets(logdir)
111 |
112 | if isinstance(args.value, list):
113 | values = args.value
114 | else:
115 | values = [args.value]
116 | for value in values:
117 | plot_data(data, value=value)
118 |
119 | if __name__ == "__main__":
120 | main()
121 |
--------------------------------------------------------------------------------
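Besides the command-line usage described in its docstring, the same helpers can be driven directly from Python. A hedged sketch (not part of the repo) that compares the three small-batch CartPole runs from `data_small` above:

    from plot import get_datasets, plot_data

    data = []
    for logdir in ["data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26",
                   "data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44",
                   "data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45"]:
        data += get_datasets(logdir)             # one DataFrame per seed, tagged with its exp_name
    plot_data(data, value="AverageReturn")       # seaborn averages over seeds within each Condition

The equivalent command-line call would be `python plot.py data_small/* --value AverageReturn`.

--------------------------------------------------------------------------------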
/hw3/DDQN_Pong.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DDQN_Pong.pkl
--------------------------------------------------------------------------------
/hw3/DQNAtari_Ponglr_multi0.1.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQNAtari_Ponglr_multi0.1.pkl
--------------------------------------------------------------------------------
/hw3/DQNAtari_Ponglr_multi10.0.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQNAtari_Ponglr_multi10.0.pkl
--------------------------------------------------------------------------------
/hw3/DQNAtari_Ponglr_multi5.0.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQNAtari_Ponglr_multi5.0.pkl
--------------------------------------------------------------------------------
/hw3/DQN_Pong.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQN_Pong.pkl
--------------------------------------------------------------------------------
/hw3/Deep_RL_Assignment_3__Q_Learning_and_Actor_Critic.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/Deep_RL_Assignment_3__Q_Learning_and_Actor_Critic.pdf
--------------------------------------------------------------------------------
/hw3/README.md:
--------------------------------------------------------------------------------
1 | # CS294-112 HW 3: Q-Learning
2 |
3 |
4 | ---
5 | Before doing anything, first replace `gym/envs/box2d/lunar_lander.py` with the provided `lunar_lander.py` file.
6 |
7 | ### Problem 1
8 |
9 | ##### Question 1
10 | 
11 | Run `python run_dqn_atari.py` directly for vanilla Q-learning with a random seed and learning multiplier 1.
12 |
13 | Plot
14 | `python p1q1.py` (Replace the `.pkl` filename)
15 |
16 | ##### Question 2
17 | 
18 | Run `python run_dqn_atari.py --double` for double Q-learning with a random seed.
19 |
20 | Plot
21 | `python p1q2.py` (Replace the `.pkl` filename) to plot the vanilla Q-learning and double Q-learning curves together.
22 |
23 | ##### Question 3
24 | 
25 | Run `python run_dqn_atari.py -m <multiplier> --seed <seed> [--double]` with the chosen learning multiplier and a fixed seed of **5000**; with `--double` it runs double Q-learning, otherwise vanilla Q-learning.
26 |
27 | Plot
28 | `python p1q3.py` (Replace the `.pkl` filename) to plot the learning curves for the different learning multipliers.
29 |
30 | ### Problem 2
31 | 
32 | ##### Question 1
33 | Run
34 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 1_1 -ntu 1 -ngsptu 1`
35 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 1_100 -ntu 1 -ngsptu 100`
36 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 100_1 -ntu 100 -ngsptu 1`
37 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 10_10 -ntu 10 -ngsptu 10`
38 |
39 | Plot
40 | `python plot.py data_CartPole/*`
41 |
42 | ##### Question 2
43 | Run
44 | `python train_ac_f18.py InvertedPendulum-v2 -ep 1000 --discount 0.95 -n 100 -e 3 -l 2 -s 64 -b 5000 -lr 0.01 --exp_name 10_10 -ntu 10 -ngsptu 10` for the InvertedPendulum task
45 | Run
46 | `python train_ac_f18.py HalfCheetah-v2 -ep 150 --discount 0.90 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --exp_name 10_10 -ntu 10 -ngsptu 10` for the HalfCheetah task
47 |
48 | Plot
49 | `python plot.py data_InvertedPendulum/*`
50 |
51 | `python plot.py data_HalfCheetah/*`
52 |
53 |
--------------------------------------------------------------------------------
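The `-ntu`/`-ngsptu` flags in Problem 2 control how the critic is fit: for each target update the bootstrapped targets are recomputed with the current critic, and then a fixed number of gradient steps regress the critic toward those now-fixed targets. A minimal sketch of that schedule, with placeholder callables rather than code from `train_ac_f18.py`:

    def fit_critic(compute_targets, take_grad_step,
                   num_target_updates, num_grad_steps_per_target_update):
        for _ in range(num_target_updates):
            targets = compute_targets()          # targets bootstrap from the current critic
            for _ in range(num_grad_steps_per_target_update):
                take_grad_step(targets)          # regression step toward the fixed targets

    # e.g. the 10_10 runs below correspond to num_target_updates=10 and
    # num_grad_steps_per_target_update=10

--------------------------------------------------------------------------------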
/hw3/data:pkl/1a898ddf-2704-4168-b92f-beca2086c5ffAtari_DDQN.pkl.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/1a898ddf-2704-4168-b92f-beca2086c5ffAtari_DDQN.pkl.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/3804ab6d-065b-4f94-aa54-ba957272c6b9Lander.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/3804ab6d-065b-4f94-aa54-ba957272c6b9Lander.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/43109373-50a0-47a8-b483-17921386ed82Lander.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/43109373-50a0-47a8-b483-17921386ed82Lander.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/518f88f0-7ffa-47ae-b705-365b31717729Lander.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/518f88f0-7ffa-47ae-b705-365b31717729Lander.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/63926721-2624-40a7-b029-cee54d11097aLander.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/63926721-2624-40a7-b029-cee54d11097aLander.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/8425b8e8-19c8-418e-91c2-8131d6e72849Lander_vanilla.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/8425b8e8-19c8-418e-91c2-8131d6e72849Lander_vanilla.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/9e01eaef-6082-423a-9ff2-66798a5d1942Lander.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/9e01eaef-6082-423a-9ff2-66798a5d1942Lander.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/Atari_DDQN.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/Atari_DDQN.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/DDQN-Lunar-test1.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQN-Lunar-test1.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/DDQNFalseLander.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/DDQNFalseLander_1e4.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander_1e4.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/DDQNFalseLander_lr2e3.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander_lr2e3.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/DDQNFalseLander_lr3e3.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander_lr3e3.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/DDQNTrueLander.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNTrueLander.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/DQN-Atari-Pong.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DQN-Atari-Pong.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/DQN-Lunar-2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DQN-Lunar-2
--------------------------------------------------------------------------------
/hw3/data:pkl/DQN-Pong.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DQN-Pong.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/b7445890-58aa-4fea-9628-bc1f08fdde62Lander.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/b7445890-58aa-4fea-9628-bc1f08fdde62Lander.pkl
--------------------------------------------------------------------------------
/hw3/data:pkl/ba946a9b-c079-4ab6-b343-d1bccfc75be6Lander_DQN.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/ba946a9b-c079-4ab6-b343-d1bccfc75be6Lander_DQN.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/.DS_Store
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "100_1",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 1,
13 | "num_target_updates" : 100,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "100_1",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 1,
13 | "num_target_updates" : 100,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "100_1",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 1,
13 | "num_target_updates" : 100,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "10_10",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 10,
13 | "num_target_updates" : 10,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "10_10",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 10,
13 | "num_target_updates" : 10,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "10_10",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 10,
13 | "num_target_updates" : 10,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "1_100",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 100,
13 | "num_target_updates" : 1,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "1_100",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 100,
13 | "num_target_updates" : 1,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "1_100",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 100,
13 | "num_target_updates" : 1,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "1_1",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 1,
13 | "num_target_updates" : 1,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "1_1",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 1,
13 | "num_target_updates" : 1,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "CartPole-v0",
3 | "exp_name" : "1_1",
4 | "gamma" : 1.0,
5 | "learning_rate" : 0.005,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21",
7 | "max_path_length" : null,
8 | "min_timesteps_per_batch" : 1000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 1,
13 | "num_target_updates" : 1,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_HalfCheetah/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/.DS_Store
--------------------------------------------------------------------------------
/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "10_10",
4 | "gamma" : 0.9,
5 | "learning_rate" : 0.02,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 30000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 10,
13 | "num_target_updates" : 10,
14 | "seed" : 1,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "10_10",
4 | "gamma" : 0.9,
5 | "learning_rate" : 0.02,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 30000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 10,
13 | "num_target_updates" : 10,
14 | "seed" : 11,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "HalfCheetah-v2",
3 | "exp_name" : "10_10",
4 | "gamma" : 0.9,
5 | "learning_rate" : 0.02,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21",
7 | "max_path_length" : 150.0,
8 | "min_timesteps_per_batch" : 30000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 10,
13 | "num_target_updates" : 10,
14 | "seed" : 21,
15 | "size" : 32}
--------------------------------------------------------------------------------
/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "InvertedPendulum-v2",
3 | "exp_name" : "10_10",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.01,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1",
7 | "max_path_length" : 1000.0,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 10,
13 | "num_target_updates" : 10,
14 | "seed" : 1,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "InvertedPendulum-v2",
3 | "exp_name" : "10_10",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.01,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11",
7 | "max_path_length" : 1000.0,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 10,
13 | "num_target_updates" : 10,
14 | "seed" : 11,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11/vars.pkl
--------------------------------------------------------------------------------
/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "InvertedPendulum-v2",
3 | "exp_name" : "10_10",
4 | "gamma" : 0.95,
5 | "learning_rate" : 0.01,
6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21",
7 | "max_path_length" : 1000.0,
8 | "min_timesteps_per_batch" : 5000,
9 | "n_iter" : 100,
10 | "n_layers" : 2,
11 | "normalize_advantages" : true,
12 | "num_grad_steps_per_target_update" : 10,
13 | "num_target_updates" : 10,
14 | "seed" : 21,
15 | "size" : 64}
--------------------------------------------------------------------------------
/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21/vars.pkl
--------------------------------------------------------------------------------
/hw3/figures/p1q1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p1q1.png
--------------------------------------------------------------------------------
/hw3/figures/p1q2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p1q2.png
--------------------------------------------------------------------------------
/hw3/figures/p1q3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p1q3.png
--------------------------------------------------------------------------------
/hw3/figures/p2q1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p2q1.png
--------------------------------------------------------------------------------
/hw3/figures/p2q2_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p2q2_1.png
--------------------------------------------------------------------------------
/hw3/figures/p2q2_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p2q2_2.png
--------------------------------------------------------------------------------
/hw3/hw3.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/hw3.pdf
--------------------------------------------------------------------------------
/hw3/logz.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | """
4 |
5 | Some simple logging functionality, inspired by rllab's logging.
6 | Assumes that each diagnostic gets logged each iteration
7 |
8 | Call logz.configure_output_dir() to start logging to a
9 | tab-separated-values file (some_folder_name/log.txt)
10 |
11 | To load the learning curves, you can do, for example
12 |
13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True)
14 | A['EpRewMean']
15 |
16 | """
17 |
18 | import os.path as osp, shutil, time, atexit, os, subprocess
19 | import pickle
20 | import tensorflow as tf
21 |
22 | color2num = dict(
23 | gray=30,
24 | red=31,
25 | green=32,
26 | yellow=33,
27 | blue=34,
28 | magenta=35,
29 | cyan=36,
30 | white=37,
31 | crimson=38
32 | )
33 |
34 | def colorize(string, color, bold=False, highlight=False):
35 | attr = []
36 | num = color2num[color]
37 | if highlight: num += 10
38 | attr.append(str(num))
39 | if bold: attr.append('1')
40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
41 |
42 | class G:
43 | output_dir = None
44 | output_file = None
45 | first_row = True
46 | log_headers = []
47 | log_current_row = {}
48 |
49 | def configure_output_dir(d=None):
50 | """
51 | Set output directory to d, or to /tmp/somerandomnumber if d is None
52 | """
53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time())
54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir
55 | os.makedirs(G.output_dir)
56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w')
57 | atexit.register(G.output_file.close)
58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True))
59 |
60 | def log_tabular(key, val):
61 | """
62 | Log a value of some diagnostic
63 | Call this once for each diagnostic quantity, each iteration
64 | """
65 | if G.first_row:
66 | G.log_headers.append(key)
67 | else:
68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key
69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
70 | G.log_current_row[key] = val
71 |
72 | def save_params(params):
73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out:
74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True))
75 |
76 | def pickle_tf_vars():
77 | """
78 | Saves tensorflow variables
79 | Requires them to be initialized first, also a default session must exist
80 | """
81 | _dict = {v.name : v.eval() for v in tf.global_variables()}
82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f:
83 | pickle.dump(_dict, f)
84 |
85 |
86 | def dump_tabular():
87 | """
88 | Write all of the diagnostics from the current iteration
89 | """
90 | vals = []
91 | key_lens = [len(key) for key in G.log_headers]
92 | max_key_len = max(15,max(key_lens))
93 | keystr = '%'+'%d'%max_key_len
94 | fmt = "| " + keystr + "s | %15s |"
95 | n_slashes = 22 + max_key_len
96 | print("-"*n_slashes)
97 | for key in G.log_headers:
98 | val = G.log_current_row.get(key, "")
99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val
100 | else: valstr = val
101 | print(fmt%(key, valstr))
102 | vals.append(val)
103 | print("-"*n_slashes)
104 | if G.output_file is not None:
105 | if G.first_row:
106 | G.output_file.write("\t".join(G.log_headers))
107 | G.output_file.write("\n")
108 | G.output_file.write("\t".join(map(str,vals)))
109 | G.output_file.write("\n")
110 | G.output_file.flush()
111 | G.log_current_row.clear()
112 | G.first_row=False
113 |
--------------------------------------------------------------------------------
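
A minimal usage sketch for the logz module above, assuming it is imported from the same directory; the output directory and the diagnostic keys are placeholders:

    import logz

    logz.configure_output_dir('/tmp/experiments/logz_demo')   # creates the dir and opens log.txt
    logz.save_params({'exp_name': 'demo', 'seed': 1})         # writes params.json into the dir

    for itr in range(3):
        logz.log_tabular('Iteration', itr)
        logz.log_tabular('AverageReturn', 100.0 * itr)
        logz.dump_tabular()                                    # prints a table and appends a TSV row to log.txt
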
/hw3/p1q1.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 |
6 | with open('DQN_Pong.pkl', 'rb') as f:
7 | data = pickle.loads(f.read())
8 | time_step = data['Timestep']
9 | mean_reward = data['mean']
10 | best_reward = data['best']
11 | best_vanilla = best_reward[-1]
12 | plt.figure()
13 | plt.plot(time_step, mean_reward, color='red', linestyle = '-')
14 | plt.plot(time_step, best_reward, color='blue', linestyle = '--')
15 | plt.xlabel('Timesteps')
16 | plt.ylabel('Mean Episode Reward')
17 | plt.legend(['Mean_DQN','Best Mean_DQN'])
18 | plt.title('Vanilla Q-Learning on Pong', fontsize=12)
19 | plt.grid()
20 | ax = plt.gca()
21 | ax.xaxis.get_major_formatter().set_powerlimits((0,0))
22 | plt.show()
--------------------------------------------------------------------------------
/hw3/p1q2.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 |
6 | with open('DQN_Pong.pkl', 'rb') as f:
7 | data = pickle.loads(f.read())
8 | time_step = data['Timestep']
9 | mean_reward = data['mean']
10 | best_reward = data['best']
11 | best_vanilla = best_reward[-1]
12 | print(best_vanilla)
13 |
14 | with open('DDQN_Pong.pkl', 'rb') as l:
15 | data_d = pickle.loads(l.read())
16 | time_step_d = data_d['Timestep']
17 | mean_reward_d = data_d['mean']
18 | best_reward_d = data_d['best']
19 | best_DDQN = best_reward_d[-1]
20 | print(best_DDQN)
21 |
22 | plt.figure()
23 | plt.plot(time_step, mean_reward, color='green', linestyle = '-')
24 | plt.plot(time_step, best_reward, color='green', linestyle = '--')
25 | plt.plot(time_step_d, mean_reward_d, color='red', linestyle = '-')
26 | plt.plot(time_step_d, best_reward_d, color='red', linestyle = '--')
27 | plt.title('Vanilla Q-Learning Vs. Double Q-Learning on Pong', fontsize=11)
28 | plt.xlabel('Timesteps')
29 | plt.ylabel('Mean Episode Reward')
30 | plt.legend(['Mean_DQN', 'Best Mean_DQN', 'Mean_DDQN', 'Best Mean_DDQN'])
31 | plt.grid()
32 | ax = plt.gca()
33 | ax.xaxis.get_major_formatter().set_powerlimits((0,0))
34 | plt.show()
--------------------------------------------------------------------------------
/hw3/p1q3.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
5 |
6 | with open('DQN_Pong.pkl', 'rb') as a:
7 | data = pickle.loads(a.read())
8 | time_step_a = data['Timestep'][0:300]
9 | mean_reward_a = data['mean'][0:300]
10 | best_reward_a = data['best'][0:300]
11 |
12 | with open('DQNAtari_Ponglr_multi0.1.pkl', 'rb') as b:
13 | data = pickle.loads(b.read())
14 | time_step_b = data['Timestep'][0:300]
15 | mean_reward_b = data['mean'][0:300]
16 | best_reward_b = data['best'][0:300]
17 |
18 | with open('DQNAtari_Ponglr_multi5.0.pkl', 'rb') as c:
19 | data = pickle.loads(c.read())
20 | time_step_c = data['Timestep'][0:300]
21 | mean_reward_c = data['mean'][0:300]
22 | best_reward_c = data['best'][0:300]
23 |
24 | with open('DQNAtari_Ponglr_multi10.0.pkl', 'rb') as d:
25 | data = pickle.loads(d.read())
26 | time_step_d = data['Timestep'][0:300]
27 | mean_reward_d = data['mean'][0:300]
28 | best_reward_d = data['best'][0:300]
29 |
30 |
31 | plt.figure()
32 | plt.plot(time_step_a, mean_reward_a, color='green', linestyle = '-')
33 | plt.plot(time_step_a, best_reward_a, color='green', linestyle = '--')
34 |
35 | plt.plot(time_step_b, mean_reward_b, color='red', linestyle = '-')
36 | plt.plot(time_step_b, best_reward_b, color='red', linestyle = '--')
37 |
38 | plt.plot(time_step_c, mean_reward_c, color='blue', linestyle = '-')
39 | plt.plot(time_step_c, best_reward_c, color='blue', linestyle = '--')
40 |
41 | plt.plot(time_step_d, mean_reward_d, color='magenta', linestyle = '-')
42 | plt.plot(time_step_d, best_reward_d, color='magenta', linestyle = '--')
43 |
44 | plt.title('Q-learning on Pong with different learning rates', fontsize=11)
45 | plt.xlabel('Timesteps')
46 | plt.ylabel('Mean Episode Reward')
47 | plt.grid()
48 | plt.legend(['Mean_lr_multi = 1', 'Best_lr_multi = 1', 'Mean_lr_multi = 0.1', 'Best_lr_multi = 0.1', 'Mean_lr_multi = 5', 'Best_lr_multi = 5', 'Mean_lr_multi = 10', 'Best_lr_multi = 10'])
49 | ax = plt.gca()
50 | ax.xaxis.get_major_formatter().set_powerlimits((0,0))
51 | plt.show()
--------------------------------------------------------------------------------
/hw3/plot.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import json
5 | import os
6 |
7 | """
8 | Using the plotter:
9 |
10 | Call it from the command line, and supply it with logdirs to experiments.
11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10
12 | random seeds. The runner code stored it in the directory structure
13 |
14 | data
15 | L test_EnvName_DateTime
16 | L 0
17 | L log.txt
18 | L params.json
19 | L 1
20 | L log.txt
21 | L params.json
22 | .
23 | .
24 | .
25 | L 9
26 | L log.txt
27 | L params.json
28 |
29 | To plot learning curves from the experiment, averaged over all random
30 | seeds, call
31 |
32 | python plot.py data/test_EnvName_DateTime --value AverageReturn
33 |
34 | and voila. To see a different statistic, change what you put in for
35 | the keyword --value. You can also enter /multiple/ values, and it will
36 | plot all of them in order.
37 |
38 |
39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
40 | a different set of hyperparameters from 'test1', and now you would like
41 | to compare them -- see their learning curves side-by-side. Just call
42 |
43 | python plot.py data/test1 data/test2
44 |
45 | and it will plot them both! They will be given titles in the legend according
46 | to their exp_name parameters. If you want to use custom legend titles, use
47 | the --legend flag and then provide a title for each logdir.
48 |
49 | """
50 |
51 | def plot_data(data, value="AverageReturn"):
52 | if isinstance(data, list):
53 | data = pd.concat(data, ignore_index=True)
54 |
55 | sns.set(style="darkgrid", font_scale=1.5)
56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
57 | plt.legend(loc='best').draggable()
58 | plt.show()
59 |
60 |
61 | def get_datasets(fpath, condition=None):
62 | unit = 0
63 | datasets = []
64 | for root, dir, files in os.walk(fpath):
65 | if 'log.txt' in files:
66 | param_path = open(os.path.join(root,'params.json'))
67 | params = json.load(param_path)
68 | exp_name = params['exp_name']
69 |
70 | log_path = os.path.join(root,'log.txt')
71 | experiment_data = pd.read_table(log_path)
72 |
73 | experiment_data.insert(
74 | len(experiment_data.columns),
75 | 'Unit',
76 | unit
77 | )
78 | experiment_data.insert(
79 | len(experiment_data.columns),
80 | 'Condition',
81 | condition or exp_name
82 | )
83 |
84 | datasets.append(experiment_data)
85 | unit += 1
86 |
87 | return datasets
88 |
89 |
90 | def main():
91 | import argparse
92 | parser = argparse.ArgumentParser()
93 | parser.add_argument('logdir', nargs='*')
94 | parser.add_argument('--legend', nargs='*')
95 | parser.add_argument('--value', default='AverageReturn', nargs='*')
96 | args = parser.parse_args()
97 |
98 | use_legend = False
99 | if args.legend is not None:
100 | assert len(args.legend) == len(args.logdir), \
101 | "Must give a legend title for each set of experiments."
102 | use_legend = True
103 |
104 | data = []
105 | if use_legend:
106 | for logdir, legend_title in zip(args.logdir, args.legend):
107 | data += get_datasets(logdir, legend_title)
108 | else:
109 | for logdir in args.logdir:
110 | data += get_datasets(logdir)
111 |
112 | if isinstance(args.value, list):
113 | values = args.value
114 | else:
115 | values = [args.value]
116 | for value in values:
117 | plot_data(data, value=value)
118 |
119 | if __name__ == "__main__":
120 | main()
121 |
--------------------------------------------------------------------------------
/hw3/requirements.txt:
--------------------------------------------------------------------------------
1 | gym==0.10.5
2 | gym[atari]
3 | box2d
4 | mujoco-py==1.50.1.56
5 | tensorflow
6 | numpy
7 | seaborn
8 | opencv-python
9 |
--------------------------------------------------------------------------------
/hw3/run_dqn_atari.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | from gym import wrappers
4 | import os.path as osp
5 | import random
6 | import numpy as np
7 | import tensorflow as tf
8 | import tensorflow.contrib.layers as layers
9 |
10 | import dqn
11 | from dqn_utils import *
12 | from atari_wrappers import *
13 |
14 |
15 | def atari_model(img_in, num_actions, scope, reuse=False):
16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf
17 | with tf.variable_scope(scope, reuse=reuse):
18 | out = img_in
19 | with tf.variable_scope("convnet"):
20 | # original architecture
21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu)
22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu)
23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu)
24 |
25 | out = layers.flatten(out)
26 | with tf.variable_scope("action_value"):
27 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu)
28 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
29 |
30 | return out
31 |
32 | def atari_learn(env,
33 | session,
34 | num_timesteps,
35 | lr_multiplier,
36 | double_q):
37 | # This is just a rough estimate
38 | num_iterations = float(num_timesteps) / 4.0
39 |
40 | lr_multiplier = lr_multiplier
41 | print("The learning rate multiplier is :", lr_multiplier)
42 | lr_schedule = PiecewiseSchedule([
43 | (0, 1e-4 * lr_multiplier),
44 | (num_iterations / 10, 1e-4 * lr_multiplier),
45 | (num_iterations / 2, 5e-5 * lr_multiplier),
46 | ],
47 | outside_value=5e-5 * lr_multiplier)
48 | optimizer = dqn.OptimizerSpec(
49 | constructor=tf.train.AdamOptimizer,
50 | kwargs=dict(epsilon=1e-4),
51 | lr_schedule=lr_schedule
52 | )
53 |
54 | def stopping_criterion(env, t):
55 | # notice that here t is the number of steps of the wrapped env,
56 | # which is different from the number of steps in the underlying env
57 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
58 |
59 | exploration_schedule = PiecewiseSchedule(
60 | [
61 | (0, 1.0),
62 | (1e6, 0.1),
63 | (num_iterations / 2, 0.01),
64 | ], outside_value=0.01
65 | )
66 |
67 | dqn.learn(
68 | env=env,
69 | q_func=atari_model,
70 | optimizer_spec=optimizer,
71 | session=session,
72 | exploration=exploration_schedule,
73 | stopping_criterion=stopping_criterion,
74 | replay_buffer_size=1000000,
75 | batch_size=32,
76 | gamma=0.99,
77 | learning_starts=50000,
78 | learning_freq=4,
79 | frame_history_len=4,
80 | target_update_freq=10000,
81 | grad_norm_clipping=10,
82 | rew_file = 'Atari_Pong' + 'lr_multi' + str(lr_multiplier),
83 | double_q=double_q
84 |
85 | )
86 | env.close()
87 |
88 | def get_available_gpus():
89 | from tensorflow.python.client import device_lib
90 | local_device_protos = device_lib.list_local_devices()
91 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU']
92 |
93 | def set_global_seeds(i):
94 | try:
95 | import tensorflow as tf
96 | except ImportError:
97 | pass
98 | else:
99 | tf.set_random_seed(i)
100 | np.random.seed(i)
101 | random.seed(i)
102 |
103 | def get_session():
104 | tf.reset_default_graph()
105 | tf_config = tf.ConfigProto(
106 | inter_op_parallelism_threads=1,
107 | intra_op_parallelism_threads=1)
108 | session = tf.Session(config=tf_config)
109 | print("AVAILABLE GPUS: ", get_available_gpus())
110 | return session
111 |
112 | def get_env(task, seed):
113 | env = gym.make('PongNoFrameskip-v4')
114 |
115 | set_global_seeds(seed)
116 | env.seed(seed)
117 |
118 | expt_dir = '/tmp/hw3_vid_dir2/'
119 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
120 | env = wrap_deepmind(env)
121 |
122 | return env
123 |
124 | def main():
125 | import argparse
126 | parser = argparse.ArgumentParser()
127 | parser.add_argument('--multiplier', '-m', type = float, default = 1)
128 | parser.add_argument('--seed', action='store_true')
129 | parser.add_argument('--double', action = 'store_true')
130 | args = parser.parse_args()
131 |
132 |
133 | # Get Atari games.
134 | task = gym.make('PongNoFrameskip-v4')
135 |
136 | if args.seed:
137 | seed = 5000
138 | print('seed = %d' % seed)
139 | # Run training
140 | else:
141 | seed = random.randint(0, 9999)
142 | print('random seed = %d' % seed)
143 | env = get_env(task, seed)
144 | session = get_session()
145 | atari_learn(env, session, num_timesteps=2e8, lr_multiplier = args.multiplier, double_q = args.double)
146 |
147 | if __name__ == "__main__":
148 | main()
149 |
--------------------------------------------------------------------------------
/hw3/run_dqn_lander.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | from gym import wrappers
4 | import os.path as osp
5 | import random
6 | import numpy as np
7 | import tensorflow as tf
8 | import tensorflow.contrib.layers as layers
9 |
10 | import dqn
11 | from dqn_utils import *
12 |
13 | import argparse
14 |
15 | def lander_model(obs, num_actions, scope, reuse=False):
16 | with tf.variable_scope(scope, reuse=reuse):
17 | out = obs
18 | with tf.variable_scope("action_value"):
19 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu)
20 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu)
21 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
22 | return out
23 |
24 | def lander_optimizer():
25 | return dqn.OptimizerSpec(
26 | constructor=tf.train.AdamOptimizer,
27 | lr_schedule=ConstantSchedule(1e-4),
28 | kwargs={}
29 | )
30 |
31 | def lander_stopping_criterion(num_timesteps):
32 | def stopping_criterion(env, t):
33 | # notice that here t is the number of steps of the wrapped env,
34 | # which is different from the number of steps in the underlying env
35 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
36 | return stopping_criterion
37 |
38 | def lander_exploration_schedule(num_timesteps):
39 | return PiecewiseSchedule(
40 | [
41 | (0, 1),
42 | (num_timesteps * 0.1, 0.02),
43 | ], outside_value=0.02
44 | )
45 |
46 | def lander_kwargs():
47 | return {
48 | 'optimizer_spec': lander_optimizer(),
49 | 'q_func': lander_model,
50 | 'replay_buffer_size': 50000,
51 | 'batch_size': 32,
52 | 'gamma': 1.00,
53 | 'learning_starts': 1000,
54 | 'learning_freq': 1,
55 | 'frame_history_len': 1,
56 | 'target_update_freq': 3000,
57 | 'grad_norm_clipping': 10,
58 | 'lander': True
59 | }
60 |
61 | def lander_learn(env,
62 | session,
63 | num_timesteps,
64 | seed):
65 |
66 | optimizer = lander_optimizer()
67 | stopping_criterion = lander_stopping_criterion(num_timesteps)
68 | exploration_schedule = lander_exploration_schedule(num_timesteps)
69 |
70 | dqn.learn(
71 | env=env,
72 | session=session,
73 | exploration=lander_exploration_schedule(num_timesteps),
74 | stopping_criterion=lander_stopping_criterion(num_timesteps),
75 | rew_file = 'Lander',
76 | double_q=False,
77 | **lander_kwargs()
78 | )
79 | env.close()
80 |
81 | def set_global_seeds(i):
82 | tf.set_random_seed(i)
83 | np.random.seed(i)
84 | random.seed(i)
85 |
86 | def get_session():
87 | tf.reset_default_graph()
88 | tf_config = tf.ConfigProto(
89 | inter_op_parallelism_threads=1,
90 | intra_op_parallelism_threads=1,
91 | device_count={'GPU': 0})
92 | # GPUs don't significantly speed up deep Q-learning for lunar lander,
93 | # since the observations are low-dimensional
94 | session = tf.Session(config=tf_config)
95 | return session
96 |
97 | def get_env(seed):
98 | env = gym.make('LunarLander-v2')
99 |
100 | set_global_seeds(seed)
101 | env.seed(seed)
102 |
103 | expt_dir = '/tmp/hw3_vid_dir/'
104 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True, video_callable=False)
105 |
106 | return env
107 |
108 | def main():
109 | # Run training
110 | seed = np.random.randint(9999) # you may want to randomize this
111 | print('random seed = %d' % seed)
112 | env = get_env(seed)
113 | session = get_session()
114 | set_global_seeds(seed)
115 | lander_learn(env, session, num_timesteps=500000, seed=seed)
116 |
117 | if __name__ == "__main__":
118 | main()
119 |
--------------------------------------------------------------------------------
/hw3/run_dqn_ram.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | from gym import wrappers
4 | import os.path as osp
5 | import random
6 | import numpy as np
7 | import tensorflow as tf
8 | import tensorflow.contrib.layers as layers
9 |
10 | import dqn
11 | from dqn_utils import *
12 | from atari_wrappers import *
13 |
14 |
15 | def atari_model(ram_in, num_actions, scope, reuse=False):
16 | with tf.variable_scope(scope, reuse=reuse):
17 | out = ram_in
18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65]))
19 | with tf.variable_scope("action_value"):
20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu)
21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu)
22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu)
23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
24 |
25 | return out
26 |
27 | def atari_learn(env,
28 | session,
29 | num_timesteps):
30 | # This is just a rough estimate
31 | num_iterations = float(num_timesteps) / 4.0
32 |
33 | lr_multiplier = 1.0
34 | lr_schedule = PiecewiseSchedule([
35 | (0, 1e-4 * lr_multiplier),
36 | (num_iterations / 10, 1e-4 * lr_multiplier),
37 | (num_iterations / 2, 5e-5 * lr_multiplier),
38 | ],
39 | outside_value=5e-5 * lr_multiplier)
40 | optimizer = dqn.OptimizerSpec(
41 | constructor=tf.train.AdamOptimizer,
42 | kwargs=dict(epsilon=1e-4),
43 | lr_schedule=lr_schedule
44 | )
45 |
46 | def stopping_criterion(env, t):
47 | # notice that here t is the number of steps of the wrapped env,
48 | # which is different from the number of steps in the underlying env
49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps
50 |
51 | exploration_schedule = PiecewiseSchedule(
52 | [
53 | (0, 0.2),
54 | (1e6, 0.1),
55 | (num_iterations / 2, 0.01),
56 | ], outside_value=0.01
57 | )
58 |
59 | dqn.learn(
60 | env,
61 | q_func=atari_model,
62 | optimizer_spec=optimizer,
63 | session=session,
64 | exploration=exploration_schedule,
65 | stopping_criterion=stopping_criterion,
66 | replay_buffer_size=1000000,
67 | batch_size=32,
68 | gamma=0.99,
69 | learning_starts=50000,
70 | learning_freq=4,
71 | frame_history_len=1,
72 | target_update_freq=10000,
73 | grad_norm_clipping=10
74 | )
75 | env.close()
76 |
77 | def get_available_gpus():
78 | from tensorflow.python.client import device_lib
79 | local_device_protos = device_lib.list_local_devices()
80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU']
81 |
82 | def set_global_seeds(i):
83 | try:
84 | import tensorflow as tf
85 | except ImportError:
86 | pass
87 | else:
88 | tf.set_random_seed(i)
89 | np.random.seed(i)
90 | random.seed(i)
91 |
92 | def get_session():
93 | tf.reset_default_graph()
94 | tf_config = tf.ConfigProto(
95 | inter_op_parallelism_threads=1,
96 | intra_op_parallelism_threads=1)
97 | session = tf.Session(config=tf_config)
98 | print("AVAILABLE GPUS: ", get_available_gpus())
99 | return session
100 |
101 | def get_env(seed):
102 | env = gym.make('Pong-ram-v0')
103 |
104 | set_global_seeds(seed)
105 | env.seed(seed)
106 |
107 | expt_dir = '/tmp/hw3_vid_dir/'
108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True)
109 | env = wrap_deepmind_ram(env)
110 |
111 | return env
112 |
113 | def main():
114 | # Run training
115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!)
116 | env = get_env(seed)
117 | session = get_session()
118 | atari_learn(env, session, num_timesteps=int(4e7))
119 |
120 | if __name__ == "__main__":
121 | main()
122 |
--------------------------------------------------------------------------------
/hw4/Deep_RL_Assignment_4__Model_Based_RL.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw4/Deep_RL_Assignment_4__Model_Based_RL.pdf
--------------------------------------------------------------------------------
/hw4/Readme.md:
--------------------------------------------------------------------------------
1 | ### CS294-112 Assignment 4: Model-Based RL
2 |
3 | ---
4 |
5 | To run the full solution to all problems, run the `run_all.sh` script from the terminal.
6 |
7 |
8 | The command is `bash ./run_all.sh`; all result data and figures will be saved in the corresponding folders.
--------------------------------------------------------------------------------
/hw4/half_cheetah_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from gym import utils
4 | from gym.envs.mujoco import mujoco_env
5 |
6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
7 | def __init__(self):
8 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1)
9 | utils.EzPickle.__init__(self)
10 |
11 | def step(self, action):
12 | xposbefore = self.sim.data.qpos[0]
13 | self.do_simulation(action, self.frame_skip)
14 | xposafter = self.sim.data.qpos[0]
15 | ob = self._get_obs()
16 | reward_ctrl = - 0.1 * np.square(action).sum()
17 | reward_run = (xposafter - xposbefore)/self.dt
18 | reward = reward_ctrl + reward_run
19 | done = False
20 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl)
21 |
22 | def _get_obs(self):
23 | return np.concatenate([
24 | self.sim.data.qpos.flat[1:],
25 | self.sim.data.qvel.flat,
26 | self.get_body_com("torso").flat,
27 | # self.get_body_comvel("torso").flat,
28 | ])
29 |
30 | def reset_model(self):
31 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
32 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
33 | self.set_state(qpos, qvel)
34 | return self._get_obs()
35 |
36 | def viewer_setup(self):
37 | self.viewer.cam.distance = self.model.stat.extent * 0.5
38 |
39 | @staticmethod
40 | def cost_fn(states, actions, next_states):
41 | is_tf = tf.contrib.framework.is_tensor(states)
42 | is_single_state = (len(states.get_shape()) == 1) if is_tf else (len(states.shape) == 1)
43 |
44 | if is_single_state:
45 | states = states[None, ...]
46 | actions = actions[None, ...]
47 | next_states = next_states[None, ...]
48 |
49 | scores = tf.zeros(actions.get_shape()[0].value) if is_tf else np.zeros(actions.shape[0])
50 |
51 | heading_penalty_factor = 10
52 |
53 | # dont move front shin back so far that you tilt forward
54 | front_leg = states[:, 5]
55 | my_range = 0.2
56 | if is_tf:
57 | scores += tf.cast(front_leg >= my_range, tf.float32) * heading_penalty_factor
58 | else:
59 | scores += (front_leg >= my_range) * heading_penalty_factor
60 |
61 | front_shin = states[:, 6]
62 | my_range = 0
63 | if is_tf:
64 | scores += tf.cast(front_shin >= my_range, tf.float32) * heading_penalty_factor
65 | else:
66 | scores += (front_shin >= my_range) * heading_penalty_factor
67 |
68 | front_foot = states[:, 7]
69 | my_range = 0
70 | if is_tf:
71 | scores += tf.cast(front_foot >= my_range, tf.float32) * heading_penalty_factor
72 | else:
73 | scores += (front_foot >= my_range) * heading_penalty_factor
74 |
75 | scores -= (next_states[:, 17] - states[:, 17]) / 0.01
76 |
77 | if is_single_state:
78 | scores = scores[0]
79 |
80 | return scores
81 |
--------------------------------------------------------------------------------
/hw4/main.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import time
4 |
5 | from half_cheetah_env import HalfCheetahEnv
6 | from logger import logger
7 | from model_based_rl import ModelBasedRL
8 |
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('question', type=str, choices=('q1', 'q2', 'q3'))
11 | parser.add_argument('--exp_name', type=str, default=None)
12 | parser.add_argument('--env', type=str, default='HalfCheetah', choices=('HalfCheetah',))
13 | parser.add_argument('--render', action='store_true')
14 | parser.add_argument('--mpc_horizon', type=int, default=15)
15 | parser.add_argument('--num_random_action_selection', type=int, default=4096)
16 | parser.add_argument('--nn_layers', type=int, default=1)
17 | args = parser.parse_args()
18 |
19 | data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
20 | exp_name = '{0}_{1}_{2}'.format(args.env,
21 | args.question,
22 | args.exp_name if args.exp_name else time.strftime("%d-%m-%Y_%H-%M-%S"))
23 | exp_dir = os.path.join(data_dir, exp_name)
24 | assert not os.path.exists(exp_dir),\
25 | 'Experiment directory {0} already exists. Either delete the directory, or run the experiment with a different name'.format(exp_dir)
26 | os.makedirs(exp_dir, exist_ok=True)
27 | logger.setup(exp_name, os.path.join(exp_dir, 'log.txt'), 'debug')
28 |
29 | env = {
30 | 'HalfCheetah': HalfCheetahEnv()
31 | }[args.env]
32 |
33 | mbrl = ModelBasedRL(env=env,
34 | render=args.render,
35 | mpc_horizon=args.mpc_horizon,
36 | num_random_action_selection=args.num_random_action_selection,
37 | nn_layers=args.nn_layers)
38 |
39 | run_func = {
40 | 'q1': mbrl.run_q1,
41 | 'q2': mbrl.run_q2,
42 | 'q3': mbrl.run_q3
43 | }[args.question]
44 | run_func()
45 |
--------------------------------------------------------------------------------
/hw4/plot.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 |
4 | import matplotlib.pyplot as plt
5 | import matplotlib.cm as cm
6 | import pandas
7 |
8 |
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('--exps', nargs='+', type=str)
11 | parser.add_argument('--save', type=str, default=None)
12 | args = parser.parse_args()
13 |
14 | f, ax = plt.subplots(1, 1)
15 | for i, exp in enumerate(args.exps):
16 | log_fname = os.path.join('data', exp, 'log.csv')
17 | csv = pandas.read_csv(log_fname)
18 |
19 | color = cm.viridis(i / float(len(args.exps)))
20 | ax.plot(csv['Itr'], csv['ReturnAvg'], color=color, label=exp)
21 | ax.fill_between(csv['Itr'], csv['ReturnAvg'] - csv['ReturnStd'], csv['ReturnAvg'] + csv['ReturnStd'],
22 | color=color, alpha=0.2)
23 |
24 | ax.legend()
25 | ax.set_xlabel('Iteration')
26 | ax.set_ylabel('Return')
27 |
28 | if args.save:
29 | os.makedirs('plots', exist_ok=True)
30 | f.savefig(os.path.join('plots', args.save + '.jpg'))
31 | else:
32 | plt.show()
33 |
--------------------------------------------------------------------------------
/hw4/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas
2 | matplotlib
3 | colorlog
--------------------------------------------------------------------------------
/hw4/run_all.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ##########
4 | ### Q1 ###
5 | ##########
6 |
7 | python main.py q1 --exp_name exp
8 |
9 | ##########
10 | ### Q2 ###
11 | ##########
12 |
13 | python main.py q2 --exp_name exp
14 |
15 | ###########
16 | ### Q3a ###
17 | ###########
18 |
19 | python main.py q3 --exp_name default
20 | python plot.py --exps HalfCheetah_q3_default --save HalfCheetah_q3_default
21 |
22 | ###########
23 | ### Q3b ###
24 | ###########
25 |
26 | python main.py q3 --exp_name action128 --num_random_action_selection 128
27 | python main.py q3 --exp_name action4096 --num_random_action_selection 4096
28 | python main.py q3 --exp_name action16384 --num_random_action_selection 16384
29 | python plot.py --exps HalfCheetah_q3_action128 HalfCheetah_q3_action4096 HalfCheetah_q3_action16384 --save HalfCheetah_q3_actions
30 |
31 | python main.py q3 --exp_name horizon10 --mpc_horizon 10
32 | python main.py q3 --exp_name horizon15 --mpc_horizon 15
33 | python main.py q3 --exp_name horizon20 --mpc_horizon 20
34 | python plot.py --exps HalfCheetah_q3_horizon10 HalfCheetah_q3_horizon15 HalfCheetah_q3_horizon20 --save HalfCheetah_q3_mpc_horizon
35 |
36 | python main.py q3 --exp_name layers1 --nn_layers 1
37 | python main.py q3 --exp_name layers2 --nn_layers 2
38 | python main.py q3 --exp_name layers3 --nn_layers 3
39 | python plot.py --exps HalfCheetah_q3_layers1 HalfCheetah_q3_layers2 HalfCheetah_q3_layers3 --save HalfCheetah_q3_nn_layers
40 |
--------------------------------------------------------------------------------
/hw4/timer.py:
--------------------------------------------------------------------------------
1 | import time
2 | from collections import defaultdict
3 |
4 | class TimeIt(object):
5 | def __init__(self, prefix=''):
6 | self.prefix = prefix
7 | self.start_times = dict()
8 | self.elapsed_times = defaultdict(int)
9 |
10 | def start(self, name):
11 | assert(name not in self.start_times)
12 | self.start_times[name] = time.time()
13 |
14 | def stop(self, name):
15 | assert(name in self.start_times)
16 | self.elapsed_times[name] += time.time() - self.start_times[name]
17 | self.start_times.pop(name)
18 |
19 | def elapsed(self, name):
20 | return self.elapsed_times[name]
21 |
22 | def reset(self):
23 | self.start_times = dict()
24 | self.elapsed_times = defaultdict(int)
25 |
26 | def __str__(self):
27 | s = ''
28 | names_elapsed = sorted(self.elapsed_times.items(), key=lambda x: x[1], reverse=True)
29 | for name, elapsed in names_elapsed:
30 | if 'total' not in self.elapsed_times:
31 | s += '{0}: {1: <10} {2:.1f}\n'.format(self.prefix, name, elapsed)
32 | else:
33 | assert(self.elapsed_times['total'] >= max(self.elapsed_times.values()))
34 | pct = 100. * elapsed / self.elapsed_times['total']
35 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, name, elapsed, pct)
36 | if 'total' in self.elapsed_times:
37 | times_summed = sum([t for k, t in self.elapsed_times.items() if k != 'total'])
38 | other_time = self.elapsed_times['total'] - times_summed
39 | assert(other_time >= 0)
40 | pct = 100. * other_time / self.elapsed_times['total']
41 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, 'other', other_time, pct)
42 | return s
43 |
44 | timeit = TimeIt()
45 |
--------------------------------------------------------------------------------
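
A brief usage sketch for the TimeIt helper above; the timed block names ('total', 'rollout') are placeholders, and when a 'total' entry exists the printed summary also reports per-name percentages:

    import time
    from timer import timeit

    timeit.reset()
    timeit.start('total')
    timeit.start('rollout')
    time.sleep(0.1)            # stand-in for the work being timed
    timeit.stop('rollout')
    timeit.stop('total')
    print(timeit)              # per-name elapsed seconds, sorted longest first
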
/hw5/exp/README.md:
--------------------------------------------------------------------------------
1 | # CS294-112 HW 5a: Exploration
2 |
3 | Dependencies:
4 | * Python **3.5**
5 | * Numpy version **1.14.5**
6 | * TensorFlow version **1.10.5**
7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56**
8 | * seaborn
9 | * tqdm==**4.26.0**
10 |
11 | Before doing anything, first replace `gym/envs/mujoco/half_cheetah.py` with the provided `sparse_half_cheetah.py` file. It is always a good idea to keep a copy of the original `gym/envs/mujoco/half_cheetah.py` just in case you need it for something else.
12 |
13 | You will implement `density_model.py`, `exploration.py`, and `train_ac_exploration_f18.py`.
14 |
15 | See the hw5a.pdf in this folder for further instructions.
16 |
17 |
--------------------------------------------------------------------------------
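
A sketch of the file-replacement step described in the README above, keeping a backup of the original as advised; it assumes gym (with mujoco-py) is importable and that `sparse_half_cheetah.py` sits in the current directory:

    import os
    import shutil
    import gym.envs.mujoco as mujoco_envs

    target = os.path.join(os.path.dirname(mujoco_envs.__file__), 'half_cheetah.py')
    backup = target + '.orig'

    if not os.path.exists(backup):
        shutil.copy2(target, backup)                   # keep a copy of the stock environment
    shutil.copy2('sparse_half_cheetah.py', target)     # install the sparse-reward version
    print('Replaced %s (backup at %s)' % (target, backup))
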
/hw5/exp/ex_utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=tf.tanh, output_activation=None):
4 | """
5 | Builds a feedforward neural network
6 |
7 | arguments:
8 | input_placeholder: placeholder variable for the state (batch_size, input_size)
9 | output_size: size of the output layer
10 | scope: variable scope of the network
11 | n_layers: number of hidden layers
12 | size: dimension of the hidden layer
13 | activation: activation of the hidden layers
14 | output_activation: activation of the output layer
15 |
16 | returns:
17 | output placeholder of the network (the result of a forward pass)
18 |
19 | Hint: use tf.layers.dense
20 | """
21 | output_placeholder = input_placeholder
22 | with tf.variable_scope(scope):
23 | for _ in range(n_layers):
24 | output_placeholder = tf.layers.dense(output_placeholder, size, activation=activation)
25 | output_placeholder = tf.layers.dense(output_placeholder, output_size, activation=output_activation)
26 | return output_placeholder
--------------------------------------------------------------------------------
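
A hypothetical call to build_mlp above, in TF1-style graph code; the observation dimension, number of actions, and scope name are illustrative:

    import tensorflow as tf
    from ex_utils import build_mlp

    obs_ph = tf.placeholder(tf.float32, shape=[None, 4], name='ob')
    logits = build_mlp(obs_ph, output_size=2, scope='policy', n_layers=2, size=64)
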
/hw5/exp/hw5a.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/exp/hw5a.pdf
--------------------------------------------------------------------------------
/hw5/exp/logz.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | """
4 |
5 | Some simple logging functionality, inspired by rllab's logging.
6 | Assumes that each diagnostic gets logged each iteration
7 |
8 | Call logz.configure_output_dir() to start logging to a
9 | tab-separated-values file (some_folder_name/log.txt)
10 |
11 | To load the learning curves, you can do, for example
12 |
13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True)
14 | A['EpRewMean']
15 |
16 | """
17 |
18 | import os.path as osp, shutil, time, atexit, os, subprocess
19 | import pickle
20 | import tensorflow as tf
21 |
22 | color2num = dict(
23 | gray=30,
24 | red=31,
25 | green=32,
26 | yellow=33,
27 | blue=34,
28 | magenta=35,
29 | cyan=36,
30 | white=37,
31 | crimson=38
32 | )
33 |
34 | def colorize(string, color, bold=False, highlight=False):
35 | attr = []
36 | num = color2num[color]
37 | if highlight: num += 10
38 | attr.append(str(num))
39 | if bold: attr.append('1')
40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
41 |
42 | class G:
43 | output_dir = None
44 | output_file = None
45 | first_row = True
46 | log_headers = []
47 | log_current_row = {}
48 |
49 | def configure_output_dir(d=None):
50 | """
51 | Set output directory to d, or to /tmp/somerandomnumber if d is None
52 | """
53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time())
54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir
55 | os.makedirs(G.output_dir)
56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w')
57 | atexit.register(G.output_file.close)
58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True))
59 |
60 | def log_tabular(key, val):
61 | """
62 | Log a value of some diagnostic
63 | Call this once for each diagnostic quantity, each iteration
64 | """
65 | if G.first_row:
66 | G.log_headers.append(key)
67 | else:
68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key
69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
70 | G.log_current_row[key] = val
71 |
72 | def save_params(params):
73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out:
74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True))
75 |
76 | def pickle_tf_vars():
77 | """
78 | Saves tensorflow variables
79 | Requires them to be initialized first, also a default session must exist
80 | """
81 | _dict = {v.name : v.eval() for v in tf.global_variables()}
82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f:
83 | pickle.dump(_dict, f)
84 |
85 |
86 | def dump_tabular():
87 | """
88 | Write all of the diagnostics from the current iteration
89 | """
90 | vals = []
91 | key_lens = [len(key) for key in G.log_headers]
92 | max_key_len = max(15,max(key_lens))
93 | keystr = '%'+'%d'%max_key_len
94 | fmt = "| " + keystr + "s | %15s |"
95 | n_slashes = 22 + max_key_len
96 | print("-"*n_slashes)
97 | for key in G.log_headers:
98 | val = G.log_current_row.get(key, "")
99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val
100 | else: valstr = val
101 | print(fmt%(key, valstr))
102 | vals.append(val)
103 | print("-"*n_slashes)
104 | if G.output_file is not None:
105 | if G.first_row:
106 | G.output_file.write("\t".join(G.log_headers))
107 | G.output_file.write("\n")
108 | G.output_file.write("\t".join(map(str,vals)))
109 | G.output_file.write("\n")
110 | G.output_file.flush()
111 | G.log_current_row.clear()
112 | G.first_row=False
113 |
--------------------------------------------------------------------------------
/hw5/exp/plot.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import json
5 | import os
6 |
7 | """
8 | Using the plotter:
9 |
10 | Call it from the command line, and supply it with logdirs to experiments.
11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10
12 | random seeds. The runner code stored it in the directory structure
13 |
14 | data
15 | L test_EnvName_DateTime
16 | L 0
17 | L log.txt
18 | L params.json
19 | L 1
20 | L log.txt
21 | L params.json
22 | .
23 | .
24 | .
25 | L 9
26 | L log.txt
27 | L params.json
28 |
29 | To plot learning curves from the experiment, averaged over all random
30 | seeds, call
31 |
32 | python plot.py data/test_EnvName_DateTime --value AverageReturn
33 |
34 | and voila. To see a different statistic, change what you put in for
35 | the keyword --value. You can also enter /multiple/ values, and it will
36 | plot all of them in order.
37 |
38 |
39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
40 | a different set of hyperparameters from 'test1', and now you would like
41 | to compare them -- see their learning curves side-by-side. Just call
42 |
43 | python plot.py data/test1 data/test2
44 |
45 | and it will plot them both! They will be given titles in the legend according
46 | to their exp_name parameters. If you want to use custom legend titles, use
47 | the --legend flag and then provide a title for each logdir.
48 |
49 | """
50 |
51 | def plot_data(data, value="AverageReturn"):
52 | if isinstance(data, list):
53 | data = pd.concat(data, ignore_index=True)
54 |
55 | sns.set(style="darkgrid", font_scale=1.5)
56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
57 | plt.legend(loc='best').draggable()
58 | # plt.legend(loc='best', bbox_to_anchor=(1, 1), fontsize=8).draggable()
59 | plt.show()
60 |
61 |
62 | def get_datasets(fpath, condition=None):
63 | unit = 0
64 | datasets = []
65 | for root, dir, files in os.walk(fpath):
66 | if 'log.txt' in files:
67 | param_path = open(os.path.join(root,'params.json'))
68 | params = json.load(param_path)
69 | exp_name = params['exp_name']
70 |
71 | log_path = os.path.join(root,'log.txt')
72 | experiment_data = pd.read_table(log_path)
73 |
74 | experiment_data.insert(
75 | len(experiment_data.columns),
76 | 'Unit',
77 | unit
78 | )
79 | experiment_data.insert(
80 | len(experiment_data.columns),
81 | 'Condition',
82 | condition or exp_name
83 | )
84 |
85 | datasets.append(experiment_data)
86 | unit += 1
87 |
88 | return datasets
89 |
90 |
91 | def main():
92 | import argparse
93 | parser = argparse.ArgumentParser()
94 | parser.add_argument('logdir', nargs='*')
95 | parser.add_argument('--legend', nargs='*')
96 | parser.add_argument('--value', default='AverageReturn', nargs='*')
97 | args = parser.parse_args()
98 |
99 | use_legend = False
100 | if args.legend is not None:
101 | assert len(args.legend) == len(args.logdir), \
102 | "Must give a legend title for each set of experiments."
103 | use_legend = True
104 |
105 | data = []
106 | if use_legend:
107 | for logdir, legend_title in zip(args.logdir, args.legend):
108 | data += get_datasets(logdir, legend_title)
109 | else:
110 | for logdir in args.logdir:
111 | data += get_datasets(logdir)
112 |
113 | if isinstance(args.value, list):
114 | values = args.value
115 | else:
116 | values = [args.value]
117 | for value in values:
118 | plot_data(data, value=value)
119 |
120 | if __name__ == "__main__":
121 | main()
122 |
--------------------------------------------------------------------------------
/hw5/exp/replay.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import copy
4 |
5 | class Replay_Buffer(object):
6 | def __init__(self, max_size=np.inf):
7 | self.memory = []
8 | self.max_size = int(max_size)
9 |
10 | def adjust_size(self):
11 | if len(self.memory) > self.max_size:
12 | diff = int(len(self.memory) - self.max_size)
13 | self.memory = self.memory[:-diff] # FIFO
14 | print('Adjusted replay size')
15 |
16 | def prepend(self, x):
17 | # assume x is a list of states
18 | self.memory = list(x) + self.memory
19 | self.adjust_size()
20 |
21 | def sample(self, batch_size):
22 | random_batch = random.sample(self.memory, batch_size)
23 | return random_batch
24 |
25 | def __len__(self):
26 | return len(self.memory)
27 |
28 | def __getitem__(self, indices):
29 | return copy.deepcopy(np.array([self.memory[i] for i in indices]))
30 |
31 | def get_memory(self):
32 | return copy.deepcopy(self.memory)
33 |
34 | def clear_buffer(self):
35 | del self.memory[:]
--------------------------------------------------------------------------------
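
A short usage sketch for Replay_Buffer above, assuming it is imported from the same directory; the state vectors are placeholders:

    import numpy as np
    from replay import Replay_Buffer

    buf = Replay_Buffer(max_size=1000)
    buf.prepend([np.zeros(2), np.ones(2), 2 * np.ones(2)])   # newest states go to the front
    batch = buf.sample(batch_size=2)                          # uniform sample without replacement
    selected = buf[[0, 1]]                                    # deep-copied array of the chosen states
    print(len(buf), selected.shape)
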
/hw5/exp/requirements.txt:
--------------------------------------------------------------------------------
1 | gym==0.10.5
2 | mujoco-py==1.50.1.56
3 | tensorflow
4 | numpy
5 | seaborn
6 | tqdm
--------------------------------------------------------------------------------
/hw5/exp/run_all.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | ##########################
4 | ### P1 Hist PointMass ###
5 | ##########################
6 |
7 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model none -s 8 --exp_name PM_bc0_s8
8 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model hist -bc 0.01 -s 8 --exp_name PM_hist_bc0.01_s8
9 |
10 | ##########################
11 | ### P2 RBF PointMass ###
12 | ##########################
13 |
14 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model rbf -bc 0.01 -s 8 -sig 0.2 --exp_name PM_rbf_bc0.01_s8_sig0.2
15 |
16 | ##########################
17 | ### P3 EX2 PointMass ###
18 | ##########################
19 |
20 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model ex2 -s 8 -bc 0.05 -kl 0.1 -dlr 0.001 -dh 8 -dti 1000 --exp_name PM_ex2_s8_bc0.05_kl0.1_dlr0.001_dh8_dti1000
21 |
22 | ###########################
23 | ### P4 HalfCheetah ###
24 | ###########################
25 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model none --exp_name HC_bc0
26 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.001 -kl 0.1 -dlr 0.005 -dti 1000 --exp_name HC_bc0.001_kl0.1_dlr0.005_dti1000
27 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.0001 -kl 0.1 -dlr 0.005 -dti 10000 --exp_name HC_bc0.0001_kl0.1_dlr0.005_dti10000
28 |
--------------------------------------------------------------------------------
/hw5/exp/sparse_half_cheetah.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import utils
3 | from gym.envs.mujoco import mujoco_env
4 |
5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle):
6 | def __init__(self):
7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5)
8 | utils.EzPickle.__init__(self)
9 |
10 | def step(self, action):
11 | #################################################
12 | ctrl = False
13 | relu = False
14 | threshold = 10.0
15 | #################################################
16 | xposbefore = self.sim.data.qpos[0]
17 | self.do_simulation(action, self.frame_skip)
18 | xposafter = self.sim.data.qpos[0]
19 | ob = self._get_obs()
20 | # reward_ctrl = - 0.1 * np.square(action).sum()
21 | # reward_run = (xposafter - xposbefore)/self.dt
22 | #################################################
23 | if ctrl:
24 | reward_ctrl = - 0.1 * np.square(action).sum()
25 | else:
26 | reward_ctrl = 0
27 | if abs(xposafter) <= threshold:
28 | reward_run = 0.0
29 | else:
30 | if relu:
31 | reward_run = np.sign(xposafter)*(xposafter - xposbefore)/self.dt
32 | else:
33 | reward_run = 1.0
34 | #################################################
35 | reward = reward_ctrl + reward_run
36 | done = False
37 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl)
38 |
39 | def _get_obs(self):
40 | return np.concatenate([
41 | self.sim.data.qpos.flat[1:],
42 | self.sim.data.qvel.flat,
43 | ])
44 |
45 | def reset_model(self):
46 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq)
47 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1
48 | self.set_state(qpos, qvel)
49 | return self._get_obs()
50 |
51 | def viewer_setup(self):
52 | self.viewer.cam.distance = self.model.stat.extent * 0.5
53 |
--------------------------------------------------------------------------------
/hw5/meta/Deep_RL_Assignment_5__Meta_Reinforcement_Learning.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/Deep_RL_Assignment_5__Meta_Reinforcement_Learning.pdf
--------------------------------------------------------------------------------
/hw5/meta/README.md:
--------------------------------------------------------------------------------
1 | # CS294-112 HW 5c: Meta-Learning
2 |
3 | Dependencies:
4 |
5 | * Python **3.5**
6 | * Numpy version 1.14.5
7 | * TensorFlow version 1.10.5
8 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56**
9 | * OpenAI Gym version **0.10.5**
10 | * seaborn
11 | * Box2D==2.3.2
12 |
13 | Instructions: [HW5c PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5c.pdf)
14 |
15 | ### 1. Problem 1: Context as Task ID
16 |
17 | Run the following command:
18 |
19 | `python train_policy.py 'pm-obs' --exp_name --history 1 -lr 5e-5 -n 200 --num_tasks 4`
20 |
21 | ### 2. Problem 2: Meta-Learned Context
22 |
23 | Run the following command:
24 |
25 | **With MLP model**
26 |
27 | `python train_policy.py 'pm' --exp_name --history --discount 0.90 -lr 5e-4 -n 60`
28 |
29 |
30 | **With RNN model**
31 |
32 | `python train_policy.py 'pm' --exp_name --history --discount 0.90 -lr 5e-4 -n 60 --recurrent`
33 |
34 | ### 3. Problem 3: Generalization
35 |
36 | Run the following command:
37 |
38 | `python train_policy.py 'pm' --exp_name --history --discount 0.90 -lr 5e-4 -n 60 --recurrent --generalized --granularity `
39 |
40 | If `--generalized` is set, the training and testing goals are drawn from a chessboard pattern over the goal space, where cells marked 1 correspond to testing goals and cells marked 0 to training goals. The size of each cell in the chessboard is set by `--granularity`; its value can be chosen from the list `[1,2,4,5,10]` to construct a balanced chessboard.
41 |
42 |
--------------------------------------------------------------------------------
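
An illustrative sketch of the chessboard train/test split described in the README above; the 10x10 goal grid, the function name, and the exact mapping from cells to goals are assumptions, with 1 marking testing goals and 0 marking training goals:

    import numpy as np

    def chessboard_mask(grid_size=10, granularity=2):
        # blocks of side `granularity` alternate between 0 (training) and 1 (testing)
        block = np.arange(grid_size) // granularity
        return (block[:, None] + block[None, :]) % 2

    mask = chessboard_mask(grid_size=10, granularity=2)
    test_cells = np.argwhere(mask == 1)    # candidate testing-goal cells
    train_cells = np.argwhere(mask == 0)   # candidate training-goal cells
    print(mask)
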
/hw5/meta/data/mlp_1_pm_13-11-2018_20-57-59/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "mlp_1",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 1,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/mlp_1_pm_13-11-2018_20-57-59/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : false,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/mlp_30_pm_13-11-2018_20-48-55/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "mlp_30",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 30,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/mlp_30_pm_13-11-2018_20-48-55/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : false,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/mlp_50_pm_14-11-2018_20-05-53/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "mlp_50",
4 | "gamma" : 0.9,
5 | "generalized" : false,
6 | "granularity" : 1,
7 | "gru_size" : 32,
8 | "history" : 50,
9 | "l2reg" : false,
10 | "learning_rate" : 0.0005,
11 | "logdir" : "data/mlp_50_pm_14-11-2018_20-05-53/1",
12 | "max_path_length" : 20,
13 | "min_timesteps_per_batch" : 10000,
14 | "mini_batch_size" : 64,
15 | "n_iter" : 60,
16 | "n_layers" : 1,
17 | "nn_critic" : false,
18 | "normalize_advantages" : true,
19 | "num_ppo_updates" : 780,
20 | "num_tasks" : 1,
21 | "num_value_iters" : 1,
22 | "recurrent" : false,
23 | "seed" : 1,
24 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/mlp_60_pm_13-11-2018_23-01-39/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "mlp_60",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 60,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/mlp_60_pm_13-11-2018_23-01-39/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : false,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/pro1_pm-obs_13-11-2018_01-08-37/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm-obs",
3 | "exp_name" : "pro1",
4 | "gamma" : 0.99,
5 | "gru_size" : 32,
6 | "history" : 1,
7 | "l2reg" : false,
8 | "learning_rate" : 5e-05,
9 | "logdir" : "data/pro1_pm-obs_13-11-2018_01-08-37/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 2500,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 200,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 4,
19 | "num_value_iters" : 1,
20 | "recurrent" : false,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "pro3_rnn_60_g_1",
4 | "gamma" : 0.9,
5 | "generalized" : true,
6 | "granularity" : 1,
7 | "gru_size" : 32,
8 | "history" : 60,
9 | "l2reg" : false,
10 | "learning_rate" : 0.0005,
11 | "logdir" : "data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1",
12 | "max_path_length" : 20,
13 | "min_timesteps_per_batch" : 10000,
14 | "mini_batch_size" : 64,
15 | "n_iter" : 60,
16 | "n_layers" : 1,
17 | "nn_critic" : false,
18 | "normalize_advantages" : true,
19 | "num_ppo_updates" : 780,
20 | "num_tasks" : 1,
21 | "num_value_iters" : 1,
22 | "recurrent" : true,
23 | "seed" : 1,
24 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "pro3_rnn_60_g_2",
4 | "gamma" : 0.9,
5 | "generalized" : true,
6 | "granularity" : 2,
7 | "gru_size" : 32,
8 | "history" : 60,
9 | "l2reg" : false,
10 | "learning_rate" : 0.0005,
11 | "logdir" : "data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1",
12 | "max_path_length" : 20,
13 | "min_timesteps_per_batch" : 10000,
14 | "mini_batch_size" : 64,
15 | "n_iter" : 60,
16 | "n_layers" : 1,
17 | "nn_critic" : false,
18 | "normalize_advantages" : true,
19 | "num_ppo_updates" : 780,
20 | "num_tasks" : 1,
21 | "num_value_iters" : 1,
22 | "recurrent" : true,
23 | "seed" : 1,
24 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "pro3_rnn_60_g_4",
4 | "gamma" : 0.9,
5 | "generalized" : true,
6 | "granularity" : 4,
7 | "gru_size" : 32,
8 | "history" : 60,
9 | "l2reg" : false,
10 | "learning_rate" : 0.0005,
11 | "logdir" : "data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1",
12 | "max_path_length" : 20,
13 | "min_timesteps_per_batch" : 10000,
14 | "mini_batch_size" : 64,
15 | "n_iter" : 60,
16 | "n_layers" : 1,
17 | "nn_critic" : false,
18 | "normalize_advantages" : true,
19 | "num_ppo_updates" : 780,
20 | "num_tasks" : 1,
21 | "num_value_iters" : 1,
22 | "recurrent" : true,
23 | "seed" : 1,
24 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/rnn_1_pm_13-11-2018_21-05-16/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "rnn_1",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 1,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/rnn_1_pm_13-11-2018_21-05-16/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : true,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/rnn_30_pm_13-11-2018_19-34-21/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "rnn_30",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 30,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/rnn_30_pm_13-11-2018_19-34-21/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : true,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/rnn_50_pm_14-11-2018_10-34-08/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "rnn_50",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 50,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/rnn_50_pm_14-11-2018_10-34-08/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : true,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data/rnn_60_pm_13-11-2018_17-27-20/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "rnn_60",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 60,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/rnn_60_pm_13-11-2018_17-27-20/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : true,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro1/pro1_pm-obs_13-11-2018_01-08-37/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm-obs",
3 | "exp_name" : "pro1",
4 | "gamma" : 0.99,
5 | "gru_size" : 32,
6 | "history" : 1,
7 | "l2reg" : false,
8 | "learning_rate" : 5e-05,
9 | "logdir" : "data/pro1_pm-obs_13-11-2018_01-08-37/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 2500,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 200,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 4,
19 | "num_value_iters" : 1,
20 | "recurrent" : false,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro1/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro1/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro1/prob1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro1/prob1.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_1/mlp_1_pm_13-11-2018_20-57-59/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "mlp_1",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 1,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/mlp_1_pm_13-11-2018_20-57-59/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : false,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_1/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_1/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_1/pro2_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_1/pro2_1.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_1/rnn_1_pm_13-11-2018_21-05-16/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "rnn_1",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 1,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/rnn_1_pm_13-11-2018_21-05-16/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : true,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_1/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_1/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_30/mlp_30_pm_13-11-2018_20-48-55/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "mlp_30",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 30,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/mlp_30_pm_13-11-2018_20-48-55/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : false,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_30/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_30/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_30/prob_30.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_30/prob_30.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_30/rnn_30_pm_13-11-2018_19-34-21/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "rnn_30",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 30,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/rnn_30_pm_13-11-2018_19-34-21/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : true,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_30/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_30/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_50/mlp_50_pm_14-11-2018_20-05-53/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "mlp_50",
4 | "gamma" : 0.9,
5 | "generalized" : false,
6 | "granularity" : 1,
7 | "gru_size" : 32,
8 | "history" : 50,
9 | "l2reg" : false,
10 | "learning_rate" : 0.0005,
11 | "logdir" : "data/mlp_50_pm_14-11-2018_20-05-53/1",
12 | "max_path_length" : 20,
13 | "min_timesteps_per_batch" : 10000,
14 | "mini_batch_size" : 64,
15 | "n_iter" : 60,
16 | "n_layers" : 1,
17 | "nn_critic" : false,
18 | "normalize_advantages" : true,
19 | "num_ppo_updates" : 780,
20 | "num_tasks" : 1,
21 | "num_value_iters" : 1,
22 | "recurrent" : false,
23 | "seed" : 1,
24 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_50/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_50/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_50/pro2_50.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_50/pro2_50.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_50/rnn_50_pm_14-11-2018_10-34-08/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "rnn_50",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 50,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/rnn_50_pm_14-11-2018_10-34-08/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : true,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_50/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_50/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_60/mlp_60_pm_13-11-2018_23-01-39/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "mlp_60",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 60,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/mlp_60_pm_13-11-2018_23-01-39/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : false,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_60/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_60/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_60/pro2_60.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_60/pro2_60.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_60/rnn_60_pm_13-11-2018_17-27-20/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "rnn_60",
4 | "gamma" : 0.9,
5 | "gru_size" : 32,
6 | "history" : 60,
7 | "l2reg" : false,
8 | "learning_rate" : 0.0005,
9 | "logdir" : "data/rnn_60_pm_13-11-2018_17-27-20/1",
10 | "max_path_length" : 20,
11 | "min_timesteps_per_batch" : 10000,
12 | "mini_batch_size" : 64,
13 | "n_iter" : 60,
14 | "n_layers" : 1,
15 | "nn_critic" : false,
16 | "normalize_advantages" : true,
17 | "num_ppo_updates" : 780,
18 | "num_tasks" : 1,
19 | "num_value_iters" : 1,
20 | "recurrent" : true,
21 | "seed" : 1,
22 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro2_60/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_60/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "pro3_rnn_60_g_1",
4 | "gamma" : 0.9,
5 | "generalized" : true,
6 | "granularity" : 1,
7 | "gru_size" : 32,
8 | "history" : 60,
9 | "l2reg" : false,
10 | "learning_rate" : 0.0005,
11 | "logdir" : "data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1",
12 | "max_path_length" : 20,
13 | "min_timesteps_per_batch" : 10000,
14 | "mini_batch_size" : 64,
15 | "n_iter" : 60,
16 | "n_layers" : 1,
17 | "nn_critic" : false,
18 | "normalize_advantages" : true,
19 | "num_ppo_updates" : 780,
20 | "num_tasks" : 1,
21 | "num_value_iters" : 1,
22 | "recurrent" : true,
23 | "seed" : 1,
24 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_avg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_avg.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_val.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_val.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "pro3_rnn_60_g_2",
4 | "gamma" : 0.9,
5 | "generalized" : true,
6 | "granularity" : 2,
7 | "gru_size" : 32,
8 | "history" : 60,
9 | "l2reg" : false,
10 | "learning_rate" : 0.0005,
11 | "logdir" : "data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1",
12 | "max_path_length" : 20,
13 | "min_timesteps_per_batch" : 10000,
14 | "mini_batch_size" : 64,
15 | "n_iter" : 60,
16 | "n_layers" : 1,
17 | "nn_critic" : false,
18 | "normalize_advantages" : true,
19 | "num_ppo_updates" : 780,
20 | "num_tasks" : 1,
21 | "num_value_iters" : 1,
22 | "recurrent" : true,
23 | "seed" : 1,
24 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_avg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_avg.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_val.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_val.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/params.json:
--------------------------------------------------------------------------------
1 | {"animate" : false,
2 | "env_name" : "pm",
3 | "exp_name" : "pro3_rnn_60_g_4",
4 | "gamma" : 0.9,
5 | "generalized" : true,
6 | "granularity" : 4,
7 | "gru_size" : 32,
8 | "history" : 60,
9 | "l2reg" : false,
10 | "learning_rate" : 0.0005,
11 | "logdir" : "data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1",
12 | "max_path_length" : 20,
13 | "min_timesteps_per_batch" : 10000,
14 | "mini_batch_size" : 64,
15 | "n_iter" : 60,
16 | "n_layers" : 1,
17 | "nn_critic" : false,
18 | "normalize_advantages" : true,
19 | "num_ppo_updates" : 780,
20 | "num_tasks" : 1,
21 | "num_value_iters" : 1,
22 | "recurrent" : true,
23 | "seed" : 1,
24 | "size" : 64}
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_avg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_avg.png
--------------------------------------------------------------------------------
/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_val.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_val.png
--------------------------------------------------------------------------------
/hw5/meta/logz.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | """
4 |
5 | Some simple logging functionality, inspired by rllab's logging.
6 | Assumes that each diagnostic gets logged each iteration
7 |
8 | Call logz.configure_output_dir() to start logging to a
9 | tab-separated-values file (some_folder_name/log.txt)
10 |
11 | To load the learning curves, you can do, for example
12 |
13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True)
14 | A['EpRewMean']
15 |
16 | """
17 |
18 | import os.path as osp, shutil, time, atexit, os, subprocess
19 | import pickle
20 | import tensorflow as tf
21 |
22 | color2num = dict(
23 | gray=30,
24 | red=31,
25 | green=32,
26 | yellow=33,
27 | blue=34,
28 | magenta=35,
29 | cyan=36,
30 | white=37,
31 | crimson=38
32 | )
33 |
34 | def colorize(string, color, bold=False, highlight=False):
35 | attr = []
36 | num = color2num[color]
37 | if highlight: num += 10
38 | attr.append(str(num))
39 | if bold: attr.append('1')
40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
41 |
42 | class G:
43 | output_dir = None
44 | output_file = None
45 | first_row = True
46 | log_headers = []
47 | log_current_row = {}
48 |
49 | def configure_output_dir(d=None):
50 | """
51 | Set output directory to d, or to /tmp/somerandomnumber if d is None
52 | """
53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time())
54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir
55 | os.makedirs(G.output_dir)
56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w')
57 | atexit.register(G.output_file.close)
58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True))
59 |
60 | def log_tabular(key, val):
61 | """
62 | Log a value of some diagnostic
63 | Call this once for each diagnostic quantity, each iteration
64 | """
65 | if G.first_row:
66 | G.log_headers.append(key)
67 | else:
68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key
69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
70 | G.log_current_row[key] = val
71 |
72 | def save_params(params):
73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out:
74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True))
75 |
76 | def pickle_tf_vars():
77 | """
78 | Saves tensorflow variables
79 | Requires them to be initialized first, also a default session must exist
80 | """
81 | _dict = {v.name : v.eval() for v in tf.global_variables()}
82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f:
83 | pickle.dump(_dict, f)
84 |
85 |
86 | def dump_tabular():
87 | """
88 | Write all of the diagnostics from the current iteration
89 | """
90 | vals = []
91 | key_lens = [len(key) for key in G.log_headers]
92 | max_key_len = max(15,max(key_lens))
93 | keystr = '%'+'%d'%max_key_len
94 | fmt = "| " + keystr + "s | %15s |"
95 | n_slashes = 22 + max_key_len
96 | print("-"*n_slashes)
97 | for key in G.log_headers:
98 | val = G.log_current_row.get(key, "")
99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val
100 | else: valstr = val
101 | print(fmt%(key, valstr))
102 | vals.append(val)
103 | print("-"*n_slashes)
104 | if G.output_file is not None:
105 | if G.first_row:
106 | G.output_file.write("\t".join(G.log_headers))
107 | G.output_file.write("\n")
108 | G.output_file.write("\t".join(map(str,vals)))
109 | G.output_file.write("\n")
110 | G.output_file.flush()
111 | G.log_current_row.clear()
112 | G.first_row=False
113 |
--------------------------------------------------------------------------------
/hw5/meta/plot.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import json
5 | import os
6 |
7 | """
8 | Using the plotter:
9 |
10 | Call it from the command line, and supply it with logdirs to experiments.
11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10
12 | random seeds. The runner code stored it in the directory structure
13 |
14 | data
15 | L test_EnvName_DateTime
16 | L 0
17 | L log.txt
18 | L params.json
19 | L 1
20 | L log.txt
21 | L params.json
22 | .
23 | .
24 | .
25 | L 9
26 | L log.txt
27 | L params.json
28 |
29 | To plot learning curves from the experiment, averaged over all random
30 | seeds, call
31 |
32 | python plot.py data/test_EnvName_DateTime --value AverageReturn
33 |
34 | and voila. To see a different statistic, change what you put in for
35 | the keyword --value. You can also enter /multiple/ values, and it will
36 | make all of them in order.
37 |
38 |
39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
40 | a different set of hyperparameters from 'test1', and now you would like
41 | to compare them -- see their learning curves side-by-side. Just call
42 |
43 | python plot.py data/test1 data/test2
44 |
45 | and it will plot them both! They will be given titles in the legend according
46 | to their exp_name parameters. If you want to use custom legend titles, use
47 | the --legend flag and then provide a title for each logdir.
48 |
49 | """
50 |
51 | def plot_data(data, value="AverageReturn"):
52 | if isinstance(data, list):
53 | data = pd.concat(data, ignore_index=True)
54 | sns.set(style="darkgrid", font_scale=1.5)
55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
56 |
57 | plt.legend(loc='best').draggable()
58 | #plt.savefig('1.png')
59 | plt.show()
60 |
61 |
62 | def get_datasets(fpath, condition=None):
63 | unit = 0
64 | datasets = []
65 | for root, dir, files in os.walk(fpath):
66 | if 'log.txt' in files:
67 | param_path = open(os.path.join(root,'params.json'))
68 | params = json.load(param_path)
69 | exp_name = params['exp_name']
70 |
71 | log_path = os.path.join(root,'log.txt')
72 | experiment_data = pd.read_table(log_path)
73 |
74 | experiment_data.insert(
75 | len(experiment_data.columns),
76 | 'Unit',
77 | unit
78 | )
79 | experiment_data.insert(
80 | len(experiment_data.columns),
81 | 'Condition',
82 | condition or exp_name
83 | )
84 |
85 | datasets.append(experiment_data)
86 | unit += 1
87 |
88 | return datasets
89 |
90 |
91 | def main():
92 | import argparse
93 | parser = argparse.ArgumentParser()
94 | parser.add_argument('logdir', nargs='*')
95 | parser.add_argument('--legend', nargs='*')
96 | parser.add_argument('--value', default='AverageReturn', nargs='*')
97 | args = parser.parse_args()
98 |
99 | use_legend = False
100 | if args.legend is not None:
101 | assert len(args.legend) == len(args.logdir), \
102 | "Must give a legend title for each set of experiments."
103 | use_legend = True
104 |
105 | data = []
106 | if use_legend:
107 | for logdir, legend_title in zip(args.logdir, args.legend):
108 | data += get_datasets(logdir, legend_title)
109 | else:
110 | for logdir in args.logdir:
111 | data += get_datasets(logdir)
112 |
113 | if isinstance(args.value, list):
114 | values = args.value
115 | else:
116 | values = [args.value]
117 | for value in values:
118 | plot_data(data, value=value)
119 |
120 | if __name__ == "__main__":
121 | main()
122 |
--------------------------------------------------------------------------------
/hw5/meta/point_mass.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import spaces
3 | from gym import Env
4 |
5 |
6 | class PointEnv(Env):
7 | """
8 | point mass on a 2-D plane
9 | goals are sampled randomly from a square
10 | """
11 |
12 | def __init__(self, num_tasks=1):
13 | self.reset_task()
14 | self.reset()
15 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,))
16 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,))
17 |
18 |
19 | def reset_task(self, generalized=False, granularity=1, is_evaluation=False):
20 | '''
21 | sample a new task randomly
22 |
23 | Problem 3: make training and evaluation goals disjoint sets
24 | if `is_evaluation` is true, sample from the evaluation set,
25 | otherwise sample from the training set
26 | '''
27 | #====================================================================================#
28 | # ----------PROBLEM 3----------
29 | #====================================================================================#
30 | # YOUR CODE HERE
31 | # Construct the chessboard space with 20 x 20
32 | # The granularity is the size of squares, the value can be chosen from [1, 2, 4, 5, 10]
33 | if generalized:
34 | print("Problem 3...")
35 | print("The size of square is ", granularity)
36 | size = int(20 / granularity)
37 | space = np.zeros((size, size))
38 | space[1::2,::2] = 1
39 | space[::2,1::2] = 1
40 | if is_evaluation:
41 | dataset = np.where(space == 1)
42 | else:
43 | dataset = np.where(space == 0)
44 |
45 | dataset = np.asarray(dataset).T
46 | nums = dataset.shape[0]
47 | idx = np.random.randint(0, nums)
48 | if is_evaluation:
49 | print("Evaluation")
50 | else:
51 | print("training")
52 |
53 | goal = dataset[idx]
54 | goal[0] = goal[0] * granularity
55 | goal[1] = goal[1] * granularity
56 |
57 | x = np.random.uniform(goal[0], goal[0] + granularity) - 10
58 | y = np.random.uniform(goal[1], goal[1] + granularity) - 10
59 | print((x, y))
60 | else:
61 | #print("Problem 2...")
62 | x = np.random.uniform(-10, 10)
63 | y = np.random.uniform(-10, 10)
64 |
65 | self._goal = np.array([x, y])
66 |
67 | #x = np.random.uniform(-10, 10)
68 | #y = np.random.uniform(-10, 10)
69 | #self._goal = np.array([x, y])
70 |
71 | def reset(self):
72 | self._state = np.array([0, 0], dtype=np.float32)
73 | return self._get_obs()
74 |
75 | def _get_obs(self):
76 | return np.copy(self._state)
77 |
78 | def reward_function(self, x, y):
79 | return - (x ** 2 + y ** 2) ** 0.5
80 |
81 | def step(self, action):
82 | x, y = self._state
83 | # compute reward, add penalty for large actions instead of clipping them
84 | x -= self._goal[0]
85 | y -= self._goal[1]
86 | # check if task is complete
87 | done = abs(x) < .01 and abs(y) < .01
88 | reward = self.reward_function(x, y)
89 | # move to next state
90 | self._state = self._state + action
91 | ob = self._get_obs()
92 | return ob, reward, done, dict()
93 |
94 | def viewer_setup(self):
95 | print('no viewer')
96 | pass
97 |
98 | def render(self):
99 | print('current state:', self._state)
100 |
101 | def seed(self, seed):
102 |         np.random.seed(seed)
103 |
--------------------------------------------------------------------------------
/hw5/meta/point_mass_observed.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import spaces
3 | from gym import Env
4 |
5 |
6 | class ObservedPointEnv(Env):
7 | """
8 | point mass on a 2-D plane
9 | four tasks: move to (-10, -10), (-10, 10), (10, -10), (10, 10)
10 |
11 | Problem 1: augment the observation with a one-hot vector encoding the task ID
12 | - change the dimension of the observation space
13 | - augment the observation with a one-hot vector that encodes the task ID
14 | """
15 | #====================================================================================#
16 | # ----------PROBLEM 1----------
17 | #====================================================================================#
18 | # YOUR CODE SOMEWHERE HERE
19 | def __init__(self, num_tasks=1):
20 | self.tasks = [0, 1, 2, 3][:num_tasks]
21 | self.task_idx = -1
22 | #self.num_tasks = num_tasks
23 | self.reset_task()
24 | self.reset()
25 |
26 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2 + num_tasks,))
27 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,))
28 |
29 | def reset_task(self, generalized=False, granularity=1, is_evaluation=False):
30 | # for evaluation, cycle deterministically through all tasks
31 | if is_evaluation:
32 | self.task_idx = (self.task_idx + 1) % len(self.tasks)
33 | # during training, sample tasks randomly
34 | else:
35 | self.task_idx = np.random.randint(len(self.tasks))
36 | self._task = self.tasks[self.task_idx]
37 | goals = [[-1, -1], [-1, 1], [1, -1], [1, 1]]
38 | self._goal = np.array(goals[self.task_idx])*10
39 |
40 | def reset(self):
41 | self._state = np.array([0, 0], dtype=np.float32)
42 | return self._get_obs()
43 |
44 | def _get_obs(self):
45 | one_hot = np.zeros(len(self.tasks))
46 | one_hot[self._task] = 1
47 |
48 | return np.concatenate((np.copy(self._state), one_hot))
49 |
50 | def step(self, action):
51 | x, y = self._state
52 | # compute reward, add penalty for large actions instead of clipping them
53 | x -= self._goal[0]
54 | y -= self._goal[1]
55 | reward = - (x ** 2 + y ** 2) ** 0.5
56 | # check if task is complete
57 | done = abs(x) < 0.01 and abs(y) < 0.01
58 | # move to next state
59 | self._state = self._state + action
60 | ob = self._get_obs()
61 |
62 | return ob, reward, done, dict()
63 |
64 | def viewer_setup(self):
65 | print('no viewer')
66 | pass
67 |
68 | def render(self):
69 | print('current state:', self._state)
70 |
71 | def seed(self, seed):
72 |         np.random.seed(seed)
73 |
--------------------------------------------------------------------------------
/hw5/meta/replay_buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class ReplayBuffer(object):
4 | '''
5 | minimalistic replay buffer
6 |
7 | a sample consists of
8 | - observation
9 | - action
10 | - reward
11 | - terminal
12 | - hidden state for recurrent policy
13 |
14 | it is memory inefficient to store windowed observations this way
15 | so do not run on tasks with large observations (e.g. from vision)
16 | '''
17 |
18 | def __init__(self, max_size, ob_dim, ac_dim, hidden_dim, task_dim):
19 | self.max_size = max_size
20 | self.ob_dim = ob_dim
21 | self.ac_dim = ac_dim
22 | self.hidden_dim = hidden_dim
23 | self.task_dim = task_dim
24 | self.flush()
25 |
26 | def flush(self):
27 | '''
28 | set buffer to empty
29 | '''
30 | self._observations = np.zeros((self.max_size, *self.ob_dim))
31 | self._actions = np.zeros((self.max_size, *self.ac_dim))
32 | self._rewards = np.zeros((self.max_size, 1))
33 | self._terminals = np.zeros((self.max_size, 1))
34 | self._hiddens = np.zeros((self.max_size, self.hidden_dim))
35 | self._tasks = np.zeros((self.max_size, self.task_dim))
36 | self._top = 0
37 | self._size = 0
38 |
39 | def _advance(self):
40 | '''
41 | move pointer to top of buffer
42 | if end of buffer is reached, overwrite oldest data
43 | '''
44 | self._top = (self._top + 1) % self.max_size
45 | if self._size < self.max_size:
46 | self._size += 1
47 |
48 | def add_sample(self, ob, ac, re, te, hi, task):
49 | '''
50 | add sample to buffer
51 | '''
52 | self._observations[self._top] = ob
53 | self._actions[self._top] = ac
54 | self._rewards[self._top] = re
55 | self._terminals[self._top] = te
56 | self._hiddens[self._top] = hi
57 | self._tasks[self._top] = task
58 |
59 | self._advance()
60 |
61 | def get_samples(self, indices):
62 | '''
63 | return buffer data indexed by `indices`
64 | '''
65 | return dict(
66 | observations=self._observations[indices],
67 | actions=self._actions[indices],
68 | rewards=self._rewards[indices],
69 | terminals=self._terminals[indices],
70 | hiddens=self._hiddens[indices],
71 | tasks=self._tasks[indices],
72 | )
73 |
74 | def random_batch(self, batch_size):
75 | '''
76 | return random sample of `batch_size` transitions
77 | '''
78 | indices = np.random.randint(0, self._size, batch_size)
79 | return self.get_samples(indices)
80 |
81 | def all_batch(self):
82 | '''
83 | return all data in the buffer
84 | '''
85 | indices = list(range(self._size))
86 | return self.get_samples(indices)
87 |
88 | def num_steps_can_sample(self):
89 | return self._size
90 |
91 |
92 |
93 | class PPOReplayBuffer(object):
94 | '''
95 | replay buffer for PPO algorithm
96 | store fixed log probs, advantages, and returns for use in multiple updates
97 |
98 | n.b. samples must be added as a batch, and we assume that the
99 | batch is the same size as that of the simple buffer
100 | '''
101 |
102 | def __init__(self, simple_buffer):
103 | self.simple_buffer = simple_buffer
104 | self.max_size = self.simple_buffer.max_size
105 | self.flush()
106 |
107 | def flush(self):
108 | self.simple_buffer.flush()
109 | self._log_probs = np.zeros((self.max_size, 1))
110 | self._advantages = np.zeros((self.max_size, 1))
111 | self._returns = np.zeros((self.max_size, 1))
112 |
113 | def add_samples(self, lp, adv, ret):
114 | self._log_probs = lp
115 | self._advantages = adv
116 | self._returns = ret
117 |
118 | def get_samples(self, indices):
119 | return dict(
120 | log_probs = self._log_probs[indices],
121 | advantages = self._advantages[indices],
122 | returns = self._returns[indices],
123 | )
124 |
125 | def random_batch(self, batch_size):
126 | indices = np.random.randint(0, self.simple_buffer._size, batch_size)
127 | simple = self.simple_buffer.get_samples(indices)
128 | ppo = self.get_samples(indices)
129 | return {**simple, **ppo}
130 |
--------------------------------------------------------------------------------
/hw5/meta/requirements.txt:
--------------------------------------------------------------------------------
1 | mujoco-py==1.50.1.56
2 | gym==0.10.5
3 | tensorflow==1.10.0
4 | numpy==1.14.5
5 | scipy==1.1.0
6 | tensorflow-probability==0.3.0
7 | seaborn
8 | Box2D==2.3.2
9 |
--------------------------------------------------------------------------------
/hw5/sac/README.md:
--------------------------------------------------------------------------------
1 | # CS294-112 HW 5b: Soft Actor Critic
2 | Original code from Tuomas Haarnoja, Soroush Nasiriany, and Aurick Zhou for CS294-112 Fall 2018
3 |
4 | Dependencies:
5 | * Python **3.4.5**
6 | * Numpy version **1.15.2**
7 | * TensorFlow version **1.10.0**
8 | * tensorflow-probability version **0.4.0**
9 | * OpenAI Gym version **0.10.8**
10 | * MuJoCo version **1.50** and mujoco-py **1.50.1.59**
11 | * seaborn version **0.9.0**
12 |
13 | You will implement `sac.py`, and `nn.py`.
14 |
15 | See the [HW5 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5b.pdf) for further instructions.
16 |
--------------------------------------------------------------------------------
/hw5/sac/environment.yml:
--------------------------------------------------------------------------------
1 | name: hw5-sac
2 | dependencies:
3 | - python==3.4.5
4 | - pip:
5 | - gym==0.10.8
6 | - numpy==1.15.2
7 | - tensorflow==1.10.0
8 | - tensorflow-probability==0.4.0
9 | - mujoco-py==1.50.1.59
10 | - seaborn==0.9.0
11 |
--------------------------------------------------------------------------------
/hw5/sac/logz.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | """
4 |
5 | Some simple logging functionality, inspired by rllab's logging.
6 | Assumes that each diagnostic gets logged each iteration
7 |
8 | Call logz.configure_output_dir() to start logging to a
9 | tab-separated-values file (some_folder_name/log.txt)
10 |
11 | To load the learning curves, you can do, for example
12 |
13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True)
14 | A['EpRewMean']
15 |
16 | """
17 |
18 | import os.path as osp, shutil, time, atexit, os, subprocess
19 | import pickle
20 | import tensorflow as tf
21 |
22 | color2num = dict(
23 | gray=30,
24 | red=31,
25 | green=32,
26 | yellow=33,
27 | blue=34,
28 | magenta=35,
29 | cyan=36,
30 | white=37,
31 | crimson=38
32 | )
33 |
34 | def colorize(string, color, bold=False, highlight=False):
35 | attr = []
36 | num = color2num[color]
37 | if highlight: num += 10
38 | attr.append(str(num))
39 | if bold: attr.append('1')
40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
41 |
42 | class G:
43 | output_dir = None
44 | output_file = None
45 | first_row = True
46 | log_headers = []
47 | log_current_row = {}
48 |
49 | def configure_output_dir(d=None):
50 | """
51 | Set output directory to d, or to /tmp/somerandomnumber if d is None
52 | """
53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time())
54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir
55 | os.makedirs(G.output_dir)
56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w')
57 | atexit.register(G.output_file.close)
58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True))
59 |
60 | def log_tabular(key, val):
61 | """
62 | Log a value of some diagnostic
63 | Call this once for each diagnostic quantity, each iteration
64 | """
65 | if G.first_row:
66 | G.log_headers.append(key)
67 | else:
68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key
69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key
70 | G.log_current_row[key] = val
71 |
72 | def save_params(params):
73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out:
74 | out.write(json.dumps(params, indent=2, separators=(',', ': '), sort_keys=True))
75 |
76 | def pickle_tf_vars():
77 | """
78 | Saves tensorflow variables
79 | Requires them to be initialized first, also a default session must exist
80 | """
81 | _dict = {v.name : v.eval() for v in tf.global_variables()}
82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f:
83 | pickle.dump(_dict, f)
84 |
85 |
86 | def dump_tabular():
87 | """
88 | Write all of the diagnostics from the current iteration
89 | """
90 | vals = []
91 | key_lens = [len(key) for key in G.log_headers]
92 | max_key_len = max(15,max(key_lens))
93 | keystr = '%'+'%d'%max_key_len
94 | fmt = "| " + keystr + "s | %15s |"
95 | n_slashes = 22 + max_key_len
96 | print("-"*n_slashes)
97 | for key in G.log_headers:
98 | val = G.log_current_row.get(key, "")
99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val
100 | else: valstr = val
101 | print(fmt%(key, valstr))
102 | vals.append(val)
103 | print("-"*n_slashes)
104 | if G.output_file is not None:
105 | if G.first_row:
106 | G.output_file.write("\t".join(G.log_headers))
107 | G.output_file.write("\n")
108 | G.output_file.write("\t".join(map(str,vals)))
109 | G.output_file.write("\n")
110 | G.output_file.flush()
111 | G.log_current_row.clear()
112 | G.first_row=False
113 |
--------------------------------------------------------------------------------
/hw5/sac/nn.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from tensorflow.keras import layers
4 | from tensorflow_probability import distributions
5 | from tensorflow.python import keras
6 | from tensorflow.python.keras.engine.network import Network
7 |
8 |
9 | class QFunction(Network):
10 | def __init__(self, hidden_layer_sizes, **kwargs):
11 | super(QFunction, self).__init__(**kwargs)
12 | self._hidden_layer_sizes = hidden_layer_sizes
13 |
14 | def build(self, input_shape):
15 | inputs = [
16 | layers.Input(batch_shape=input_shape[0], name='observations'),
17 | layers.Input(batch_shape=input_shape[1], name='actions')
18 | ]
19 |
20 | x = layers.Concatenate(axis=1)(inputs)
21 | for hidden_units in self._hidden_layer_sizes:
22 | x = layers.Dense(hidden_units, activation='relu')(x)
23 | q_values = layers.Dense(1, activation=None)(x)
24 |
25 | self._init_graph_network(inputs, q_values)
26 | super(QFunction, self).build(input_shape)
27 |
28 |
29 | class ValueFunction(Network):
30 | def __init__(self, hidden_layer_sizes, **kwargs):
31 | super(ValueFunction, self).__init__(**kwargs)
32 | self._hidden_layer_sizes = hidden_layer_sizes
33 |
34 | def build(self, input_shape):
35 | inputs = layers.Input(batch_shape=input_shape, name='observations')
36 |
37 | x = inputs
38 | for hidden_units in self._hidden_layer_sizes:
39 | x = layers.Dense(hidden_units, activation='relu')(x)
40 | values = layers.Dense(1, activation=None)(x)
41 |
42 | self._init_graph_network(inputs, values)
43 | super(ValueFunction, self).build(input_shape)
44 |
45 |
46 | class GaussianPolicy(Network):
47 | def __init__(self, action_dim, hidden_layer_sizes, reparameterize, **kwargs):
48 | super(GaussianPolicy, self).__init__(**kwargs)
49 | self._action_dim = action_dim
50 | self._f = None
51 | self._hidden_layer_sizes = hidden_layer_sizes
52 | self._reparameterize = reparameterize
53 |
54 | def build(self, input_shape):
55 | inputs = layers.Input(batch_shape=input_shape, name='observations')
56 |
57 | x = inputs
58 | for hidden_units in self._hidden_layer_sizes:
59 | x = layers.Dense(hidden_units, activation='relu')(x)
60 |
61 | mean_and_log_std = layers.Dense(
62 | self._action_dim * 2, activation=None)(x)
63 |
64 | def create_distribution_layer(mean_and_log_std):
65 | mean, log_std = tf.split(
66 | mean_and_log_std, num_or_size_splits=2, axis=1)
67 | log_std = tf.clip_by_value(log_std, -20., 2.)
68 |
69 | distribution = distributions.MultivariateNormalDiag(
70 | loc=mean,
71 | scale_diag=tf.exp(log_std))
72 |
73 | raw_actions = distribution.sample()
74 | if not self._reparameterize:
75 | ### Problem 1.3.A
76 | ### YOUR CODE HERE
77 | raise NotImplementedError
78 | log_probs = distribution.log_prob(raw_actions)
79 | log_probs -= self._squash_correction(raw_actions)
80 |
81 | actions = None
82 | ### Problem 2.A
83 | ### YOUR CODE HERE
84 | raise NotImplementedError
85 |
86 | return actions, log_probs
87 |
88 | samples, log_probs = layers.Lambda(create_distribution_layer)(
89 | mean_and_log_std)
90 |
91 | self._init_graph_network(inputs=inputs, outputs=[samples, log_probs])
92 | super(GaussianPolicy, self).build(input_shape)
93 |
94 | def _squash_correction(self, raw_actions):
95 | ### Problem 2.B
96 | ### YOUR CODE HERE
97 | raise NotImplementedError
98 |
99 | def eval(self, observation):
100 | assert self.built and observation.ndim == 1
101 |
102 | if self._f is None:
103 | self._f = keras.backend.function(self.inputs, [self.outputs[0]])
104 |
105 | action, = self._f([observation[None]])
106 | return action.flatten()
107 |
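The NotImplementedError placeholders above (Problems 1.3.A, 2.A, and 2.B) are deliberately left open for the assignment. For reference only, the sketch below shows one standard tanh-squashed Gaussian construction from the SAC literature, written as a standalone function rather than as the intended solution: block the pathwise gradient through the sample when not reparameterizing, squash the raw actions with tanh, and correct the log-probability by the change-of-variables term sum_i log(1 - tanh(u_i)^2). The epsilon constant is an assumed numerical-stability value.

    # Standalone sketch (TF1 + tensorflow_probability); not tied to the skeleton above.
    import tensorflow as tf
    from tensorflow_probability import distributions

    def sample_squashed_gaussian(mean, log_std, reparameterize, eps=1e-6):
        log_std = tf.clip_by_value(log_std, -20., 2.)
        dist = distributions.MultivariateNormalDiag(
            loc=mean, scale_diag=tf.exp(log_std))

        raw_actions = dist.sample()
        if not reparameterize:
            # Cut the pathwise gradient; only the likelihood-ratio term remains.
            raw_actions = tf.stop_gradient(raw_actions)

        log_probs = dist.log_prob(raw_actions)
        # Change-of-variables correction for the tanh squashing:
        # log|det d tanh(u)/du| = sum_i log(1 - tanh(u_i)^2)
        log_probs -= tf.reduce_sum(
            tf.log(1.0 - tf.tanh(raw_actions) ** 2 + eps), axis=1)

        actions = tf.tanh(raw_actions)
        return actions, log_probs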
--------------------------------------------------------------------------------
/hw5/sac/plot.py:
--------------------------------------------------------------------------------
1 | import seaborn as sns
2 | import pandas as pd
3 | import matplotlib.pyplot as plt
4 | import json
5 | import os
6 |
7 | """
8 | Using the plotter:
9 |
10 | Call it from the command line, and supply it with the logdirs of your experiments.
11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10
12 | random seeds. The runner code stored it in the directory structure
13 |
14 | data
15 | L test_EnvName_DateTime
16 | L 0
17 | L log.txt
18 | L params.json
19 | L 1
20 | L log.txt
21 | L params.json
22 | .
23 | .
24 | .
25 | L 9
26 | L log.txt
27 | L params.json
28 |
29 | To plot learning curves from the experiment, averaged over all random
30 | seeds, call
31 |
32 | python plot.py data/test_EnvName_DateTime --value AverageReturn
33 |
34 | and voila. To see a different statistic, change what you pass to the
35 | keyword --value. You can also enter /multiple/ values, and it will
36 | plot all of them in order.
37 |
38 |
39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
40 | a different set of hyperparameters from 'test1', and now you would like
41 | to compare them -- see their learning curves side-by-side. Just call
42 |
43 | python plot.py data/test1 data/test2
44 |
45 | and it will plot them both! They will be given titles in the legend according
46 | to their exp_name parameters. If you want to use custom legend titles, use
47 | the --legend flag and then provide a title for each logdir.
48 |
49 | """
50 |
51 | def plot_data(data, value="AverageReturn"):
52 | if isinstance(data, list):
53 | data = pd.concat(data, ignore_index=True)
54 |
55 | sns.set(style="darkgrid", font_scale=1.5)
56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
57 | plt.legend(loc='best').draggable()
58 | plt.show()
59 |
60 |
61 | def get_datasets(fpath, condition=None):
62 | unit = 0
63 | datasets = []
64 | for root, dir, files in os.walk(fpath):
65 | if 'log.txt' in files:
66 | param_path = open(os.path.join(root,'params.json'))
67 | params = json.load(param_path)
68 | exp_name = params['exp_name']
69 |
70 | log_path = os.path.join(root,'log.txt')
71 | experiment_data = pd.read_table(log_path)
72 |
73 | experiment_data.insert(
74 | len(experiment_data.columns),
75 | 'Unit',
76 | unit
77 | )
78 | experiment_data.insert(
79 | len(experiment_data.columns),
80 | 'Condition',
81 | condition or exp_name
82 | )
83 |
84 | datasets.append(experiment_data)
85 | unit += 1
86 |
87 | return datasets
88 |
89 |
90 | def main():
91 | import argparse
92 | parser = argparse.ArgumentParser()
93 | parser.add_argument('logdir', nargs='*')
94 | parser.add_argument('--legend', nargs='*')
95 | parser.add_argument('--value', default='LastEpReturn', nargs='*')
96 | args = parser.parse_args()
97 |
98 | use_legend = False
99 | if args.legend is not None:
100 | assert len(args.legend) == len(args.logdir), \
101 | "Must give a legend title for each set of experiments."
102 | use_legend = True
103 |
104 | data = []
105 | if use_legend:
106 | for logdir, legend_title in zip(args.logdir, args.legend):
107 | data += get_datasets(logdir, legend_title)
108 | else:
109 | for logdir in args.logdir:
110 | data += get_datasets(logdir)
111 |
112 | if isinstance(args.value, list):
113 | values = args.value
114 | else:
115 | values = [args.value]
116 | for value in values:
117 | plot_data(data, value=value)
118 |
119 | if __name__ == "__main__":
120 | main()
121 |
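For the SAC runs written by train_mujoco.py below, a typical comparison of two runs might look as follows. The directory names are hypothetical (the training script appends a timestamp), and LastEpReturn is only the default key used by this plotter, so substitute whichever column names actually appear in your log.txt:

    python plot.py data/sac_HalfCheetah-v2_run1_<timestamp> data/sac_HalfCheetah-v2_run2_<timestamp> \
        --legend no_reparam reparam --value LastEpReturn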
--------------------------------------------------------------------------------
/hw5/sac/train_mujoco.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import gym
3 | import logz
4 | import numpy as np
5 | import os
6 | import tensorflow as tf
7 | import time
8 |
9 | import nn
10 | from sac import SAC
11 | import utils
12 |
13 | from multiprocessing import Process
14 |
15 | def train_SAC(env_name, exp_name, seed, logdir):
16 | alpha = {
17 | 'Ant-v2': 0.1,
18 | 'HalfCheetah-v2': 0.2,
19 | 'Hopper-v2': 0.2,
20 | 'Humanoid-v2': 0.05,
21 | 'Walker2d-v2': 0.2,
22 | }.get(env_name, 0.2)
23 |
24 | algorithm_params = {
25 | 'alpha': alpha,
26 | 'batch_size': 256,
27 | 'discount': 0.99,
28 | 'learning_rate': 1e-3,
29 | 'reparameterize': False,
30 | 'tau': 0.01,
31 | 'epoch_length': 1000,
32 | 'n_epochs': 500,
33 | 'two_qf': False,
34 | }
35 | sampler_params = {
36 | 'max_episode_length': 1000,
37 | 'prefill_steps': 1000,
38 | }
39 | replay_pool_params = {
40 | 'max_size': 1e6,
41 | }
42 |
43 | value_function_params = {
44 | 'hidden_layer_sizes': (128, 128),
45 | }
46 |
47 | q_function_params = {
48 | 'hidden_layer_sizes': (128, 128),
49 | }
50 |
51 | policy_params = {
52 | 'hidden_layer_sizes': (128, 128),
53 | }
54 |
55 | logz.configure_output_dir(logdir)
56 | params = {
57 | 'exp_name': exp_name,
58 | 'env_name': env_name,
59 | 'algorithm_params': algorithm_params,
60 | 'sampler_params': sampler_params,
61 | 'replay_pool_params': replay_pool_params,
62 | 'value_function_params': value_function_params,
63 | 'q_function_params': q_function_params,
64 | 'policy_params': policy_params
65 | }
66 | logz.save_params(params)
67 |
68 | env = gym.envs.make(env_name)
69 | # Set random seeds
70 | tf.set_random_seed(seed)
71 | np.random.seed(seed)
72 | env.seed(seed)
73 |
74 | sampler = utils.SimpleSampler(**sampler_params)
75 | replay_pool = utils.SimpleReplayPool(
76 | observation_shape=env.observation_space.shape,
77 | action_shape=env.action_space.shape,
78 | **replay_pool_params)
79 |
80 | q_function = nn.QFunction(name='q_function', **q_function_params)
81 | if algorithm_params.get('two_qf', False):
82 | q_function2 = nn.QFunction(name='q_function2', **q_function_params)
83 | else:
84 | q_function2 = None
85 | value_function = nn.ValueFunction(
86 | name='value_function', **value_function_params)
87 | target_value_function = nn.ValueFunction(
88 | name='target_value_function', **value_function_params)
89 | policy = nn.GaussianPolicy(
90 | action_dim=env.action_space.shape[0],
91 | reparameterize=algorithm_params['reparameterize'],
92 | **policy_params)
93 |
94 | sampler.initialize(env, policy, replay_pool)
95 |
96 | algorithm = SAC(**algorithm_params)
97 |
98 | tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
99 | tf_config.gpu_options.allow_growth = True # may be needed when using a GPU
100 | with tf.Session(config=tf_config):
101 | algorithm.build(
102 | env=env,
103 | policy=policy,
104 | q_function=q_function,
105 | q_function2=q_function2,
106 | value_function=value_function,
107 | target_value_function=target_value_function)
108 |
109 | for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
110 | logz.log_tabular('Iteration', epoch)
111 | for k, v in algorithm.get_statistics().items():
112 | logz.log_tabular(k, v)
113 | for k, v in replay_pool.get_statistics().items():
114 | logz.log_tabular(k, v)
115 | for k, v in sampler.get_statistics().items():
116 | logz.log_tabular(k, v)
117 | logz.dump_tabular()
118 |
119 | def main():
120 | parser = argparse.ArgumentParser()
121 | parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
122 | parser.add_argument('--exp_name', type=str, default=None)
123 | parser.add_argument('--seed', type=int, default=1)
124 | parser.add_argument('--n_experiments', '-e', type=int, default=1)
125 | args = parser.parse_args()
126 |
127 | data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')
128 |
129 | if not (os.path.exists(data_path)):
130 | os.makedirs(data_path)
131 | logdir = 'sac_' + args.env_name + '_' + args.exp_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
132 | logdir = os.path.join(data_path, logdir)
133 |
134 | processes = []
135 |
136 | for e in range(args.n_experiments):
137 | seed = args.seed + 10*e
138 | print('Running experiment with seed %d'%seed)
139 |
140 | def train_func():
141 | train_SAC(
142 | env_name=args.env_name,
143 | exp_name=args.exp_name,
144 | seed=seed,
145 | logdir=os.path.join(logdir, '%d' % seed),
146 | )
147 | # Awkward, hacky use of processes, because TensorFlow does not like
148 | # repeatedly calling train_SAC in the same thread.
149 | p = Process(target=train_func, args=tuple())
150 | p.start()
151 | processes.append(p)
152 | # if you uncomment the line below, the loop will block
153 | # until this process finishes
154 | # p.join()
155 |
156 | for p in processes:
157 | p.join()
158 |
159 | if __name__ == '__main__':
160 | main()
161 |
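A typical launch, using only the flags defined in main() above; the environment and experiment names are illustrative. Note that --exp_name should be supplied, since the log directory name is built by concatenating it, and that with -e 3 the script spawns three processes with seeds seed, seed+10, and seed+20:

    python train_mujoco.py --env_name HalfCheetah-v2 --exp_name reparam --seed 1 -e 3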
--------------------------------------------------------------------------------