├── .gitignore ├── README.md ├── hw1 ├── DAgger.py ├── Readme.md ├── behavior_cloning.py ├── experts │ ├── Ant-v2.pkl │ ├── HalfCheetah-v2.pkl │ ├── Hopper-v2.pkl │ ├── Humanoid-v2.pkl │ ├── Reacher-v2.pkl │ └── Walker2d-v2.pkl ├── hw1.bash ├── load_policy.py ├── plot.py ├── run_expert.py └── tf_util.py ├── hw2 ├── README.md ├── data_HalfCheetah_8 │ ├── hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_InvertedPendulum │ └── hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_large │ ├── lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── lib_rtg_na_CartPole-v0_18-09-2018_00-58-19 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_lunar │ └── ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_small │ ├── sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── sb_rtg_na_CartPole-v0_18-09-2018_00-36-45 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── hw2.bash ├── logz.py ├── 
lunar_lander.py ├── plot.py └── train_pg_f18.py ├── hw3 ├── DDQN_Pong.pkl ├── DQNAtari_Ponglr_multi0.1.pkl ├── DQNAtari_Ponglr_multi10.0.pkl ├── DQNAtari_Ponglr_multi5.0.pkl ├── DQN_Pong.pkl ├── Deep_RL_Assignment_3__Q_Learning_and_Actor_Critic.pdf ├── README.md ├── atari_wrappers.py ├── data:pkl │ ├── 1a898ddf-2704-4168-b92f-beca2086c5ffAtari_DDQN.pkl.pkl │ ├── 3804ab6d-065b-4f94-aa54-ba957272c6b9Lander.pkl │ ├── 43109373-50a0-47a8-b483-17921386ed82Lander.pkl │ ├── 518f88f0-7ffa-47ae-b705-365b31717729Lander.pkl │ ├── 63926721-2624-40a7-b029-cee54d11097aLander.pkl │ ├── 8425b8e8-19c8-418e-91c2-8131d6e72849Lander_vanilla.pkl │ ├── 9e01eaef-6082-423a-9ff2-66798a5d1942Lander.pkl │ ├── Atari_DDQN.pkl │ ├── DDQN-Lunar-test1.pkl │ ├── DDQNFalseLander.pkl │ ├── DDQNFalseLander_1e4.pkl │ ├── DDQNFalseLander_lr2e3.pkl │ ├── DDQNFalseLander_lr3e3.pkl │ ├── DDQNTrueLander.pkl │ ├── DQN-Atari-Pong.pkl │ ├── DQN-Lunar-2 │ ├── DQN-Pong.pkl │ ├── b7445890-58aa-4fea-9628-bc1f08fdde62Lander.pkl │ └── ba946a9b-c079-4ab6-b343-d1bccfc75be6Lander_DQN.pkl ├── data_CartPole │ ├── .DS_Store │ ├── ac_100_1_CartPole-v0_02-10-2018_17-05-47 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── ac_10_10_CartPole-v0_02-10-2018_17-09-03 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── ac_1_100_CartPole-v0_02-10-2018_17-07-35 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── 11 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ └── 21 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── ac_1_1_CartPole-v0_02-10-2018_09-37-30 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_HalfCheetah │ ├── .DS_Store │ └── ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_InvertedPendulum │ └── ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── 11 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ └── 21 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── dqn.py ├── dqn_utils.py ├── figures │ ├── p1q1.png │ ├── p1q2.png │ ├── p1q3.png │ ├── p2q1.png │ ├── p2q2_1.png │ └── p2q2_2.png ├── hw3.pdf ├── logz.py ├── lunar_lander.py ├── p1q1.py ├── p1q2.py ├── p1q3.py ├── plot.py ├── plot_q_learning.ipynb ├── requirements.txt ├── run_dqn_atari.py ├── run_dqn_lander.py ├── run_dqn_ram.py └── train_ac_f18.py ├── hw4 ├── Deep_RL_Assignment_4__Model_Based_RL.pdf ├── Readme.md ├── half_cheetah_env.py ├── logger.py ├── main.py ├── model_based_policy.py ├── model_based_rl.py ├── plot.py ├── requirements.txt ├── run_all.sh ├── tabulate.py ├── timer.py └── utils.py └── hw5 ├── exp ├── README.md ├── density_model.py ├── ex_utils.py ├── exploration.py ├── hw5a.pdf ├── logz.py ├── plot.py ├── pointmass.py ├── replay.py ├── requirements.txt ├── run_all.sh ├── sparse_half_cheetah.py └── train_ac_exploration_f18.py ├── meta ├── 
Deep_RL_Assignment_5__Meta_Reinforcement_Learning.pdf ├── README.md ├── data │ ├── mlp_1_pm_13-11-2018_20-57-59 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── mlp_30_pm_13-11-2018_20-48-55 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── mlp_50_pm_14-11-2018_20-05-53 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── mlp_60_pm_13-11-2018_23-01-39 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro1_pm-obs_13-11-2018_01-08-37 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro3_rnn_60_g_1_pm_15-11-2018_01-30-55 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro3_rnn_60_g_2_pm_14-11-2018_16-22-59 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro3_rnn_60_g_4_pm_15-11-2018_01-34-18 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── rnn_1_pm_13-11-2018_21-05-16 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── rnn_30_pm_13-11-2018_19-34-21 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── rnn_50_pm_14-11-2018_10-34-08 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── rnn_60_pm_13-11-2018_17-27-20 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro1 │ ├── pro1_pm-obs_13-11-2018_01-08-37 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ └── prob1.png ├── data_pro2_1 │ ├── mlp_1_pm_13-11-2018_20-57-59 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro2_1.png │ └── rnn_1_pm_13-11-2018_21-05-16 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro2_30 │ ├── mlp_30_pm_13-11-2018_20-48-55 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── prob_30.png │ └── rnn_30_pm_13-11-2018_19-34-21 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro2_50 │ ├── mlp_50_pm_14-11-2018_20-05-53 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro2_50.png │ └── rnn_50_pm_14-11-2018_10-34-08 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro2_60 │ ├── mlp_60_pm_13-11-2018_23-01-39 │ │ └── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ ├── pro2_60.png │ └── rnn_60_pm_13-11-2018_17-27-20 │ │ └── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl ├── data_pro3 │ ├── pro3_rnn_60_g_1_pm_15-11-2018_01-30-55 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── g_1.png │ │ ├── g_1_avg.png │ │ └── g_1_val.png │ ├── pro3_rnn_60_g_2_pm_14-11-2018_16-22-59 │ │ ├── 1 │ │ │ ├── log.txt │ │ │ ├── params.json │ │ │ └── vars.pkl │ │ ├── g_2.png │ │ ├── g_2_avg.png │ │ └── g_2_val.png │ └── pro3_rnn_60_g_4_pm_15-11-2018_01-34-18 │ │ ├── 1 │ │ ├── log.txt │ │ ├── params.json │ │ └── vars.pkl │ │ ├── g_4.png │ │ ├── g_4_avg.png │ │ └── g_4_val.png ├── logz.py ├── plot.py ├── point_mass.py ├── point_mass_observed.py ├── replay_buffer.py ├── requirements.txt └── train_policy.py └── sac ├── README.md ├── environment.yml ├── logz.py ├── nn.py ├── plot.py ├── sac.py ├── train_mujoco.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | 4 | */.DS_Store 5 | 6 | __pycache__ 7 | 8 | Thumbs.db 9 | 10 | .ipynb_checkpoints/ 11 | 12 | .gitignore 13 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # CS294-112-Deep-Reinforcement-Learning 2 | 3 | 4 | -- 5 | 6 | **- These are my assignments and projects for the CS294-112 Deep Reinforcement Learning course at UC Berkeley in Fall 2018** 7 | 8 | 9 | **- For assignment details, please step into the specific homework folders.** 10 | 11 | **- The course website is [CS294-112](http://rail.eecs.berkeley.edu/deeprlcourse/)** 12 | -------------------------------------------------------------------------------- /hw1/Readme.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 1: Imitation Learning 2 | 3 | --- 4 | 5 | ### Run the bash script 6 | ###### `./hw1.bash` 7 | ### to get all results of hw1 8 | 9 | 10 | 11 | 12 | --- 13 | ##### The following steps are detailed guidance for sections 2 and 3 of hw1 14 | In order to run this assignment, first you need to make a folder named **expert_data**, which stores the output data from the expert policy 15 | 16 | `mkdir expert_data` 17 | 18 | 1. Load up the expert policy and generate roll-out data
19 | Run `python run_expert.py experts/task.pkl task --render --num_rollouts [num]` to run the expert policy
20 | E.g. 21 | `python run_expert.py experts/Hopper-v2.pkl Hopper-v2 --render --num_rollouts 20` for the Hopper task 22 | `python run_expert.py experts/Reacher-v2.pkl Reacher-v2 --render --num_rollouts 400` for the Reacher task 23 | 24 | 2. Implement behavior cloning
25 | Run `python behavior_cloning.py experts/task.pkl task --render --num_rollouts [num]` to implement BC
26 | E.g. 27 | `python behavior_cloning.py experts/Hopper-v2.pkl Hopper-v2 --render --num_rollouts 20` for the Hopper task 28 | `python behavior_cloning.py experts/Reacher-v2.pkl Reacher-v2 --render --num_rollouts 400` for the Reacher task
29 | This command generates a `.pkl` file that saves the mean and standard deviation of the reward as the number of training epochs increases 30 | 31 | 3. Implement DAgger
32 | Run `python DAgger.py experts/Hopper-v2.pkl Hopper-v2 --render --num_rollouts 20` for the Hopper task
33 | This command also generates a `.pkl` file that saves the mean and standard deviation of the reward across DAgger iterations. 34 | 35 | 4. Plot
36 | With the `.pkl` files generated from step2 and step3, run 37 | `python plot.py Hopper-v2 --num_rollouts 20` to generate figures for behavior cloning and DAgger 38 | 39 | -------------------------------------------------------------------------------- /hw1/behavior_cloning.py: -------------------------------------------------------------------------------- 1 | # Implement behavior cloning 2 | 3 | 4 | import tensorflow as tf 5 | import pickle 6 | import numpy as np 7 | import tf_util 8 | import argparse 9 | import load_policy 10 | import gym 11 | from sklearn.cross_validation import train_test_split 12 | from sklearn.utils import shuffle 13 | 14 | 15 | # Parameters 16 | 17 | learning_rate = 0.001 18 | num_epoch = 100 19 | batch_size = 128 20 | 21 | # Network Parameters 22 | 23 | num_hid_1 = 128 24 | num_hid_2 = 128 25 | 26 | 27 | #Load training data from expert demonstrations generated by run_expert.py 28 | def load_expert_data (filename): 29 | with open (filename, 'rb') as f: 30 | data = pickle.loads(f.read()) 31 | return data 32 | 33 | def data_preprocessing(x, y): 34 | 35 | x, y = shuffle(x, y, random_state=0) 36 | x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3) 37 | y_train = y_train.reshape(y_train.shape[0], y_train.shape[2]) 38 | y_test = y_test.reshape(y_test.shape[0], y_test.shape[2]) 39 | 40 | return x_train, x_test, y_train, y_test 41 | 42 | def next_batch(batch_size, x, y): 43 | 44 | indices = np.random.randint(low = 0, high = len(x), size = batch_size) 45 | input_batch = x[indices] 46 | label_batch = y[indices] 47 | 48 | return input_batch, label_batch 49 | 50 | def network_model(num_obs, num_act): 51 | 52 | x = tf.placeholder(tf.float32, shape = [None, num_obs], name = 'x') 53 | y = tf.placeholder(tf.float32, shape = [None, num_act], name = 'y') 54 | layer_1 = tf.layers.dense(x, num_hid_1, activation = tf.nn.relu, use_bias=True) 55 | layer_2 = tf.layers.dense(layer_1, num_hid_2, activation = tf.nn.relu, use_bias = True) 56 | output = tf.layers.dense(layer_2, num_act, activation = None, use_bias = True) 57 | 58 | return output, x, y 59 | 60 | def train_network(output, y): 61 | loss = tf.losses.mean_squared_error(output, y) 62 | train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) 63 | 64 | return loss, train_op 65 | 66 | 67 | 68 | def main(): 69 | parser = argparse.ArgumentParser(); 70 | parser.add_argument('expert_policy_file', type=str) 71 | parser.add_argument('envname', type=str) 72 | parser.add_argument('--render', action='store_true') 73 | parser.add_argument("--max_timesteps", type=int) 74 | parser.add_argument('--num_rollouts', type=int, default=20, 75 | help='Number of expert roll outs') 76 | args = parser.parse_args() 77 | 78 | task = args.envname 79 | dataset = 'expert_data/' + args.envname + '_' + str(args.num_rollouts) + '_data.pkl' 80 | 81 | 82 | #Load training data 83 | data = load_expert_data(dataset) 84 | observations = np.array(data['observations']) 85 | actions = np.array(data['actions']) 86 | num_obs = observations.shape[1] 87 | num_act = actions.shape[2] 88 | 89 | obs_train, obs_test, act_train, act_test = data_preprocessing(observations, actions) 90 | 91 | output, x, y = network_model(num_obs, num_act) 92 | 93 | lossfunction, train_op = train_network(output, y) 94 | 95 | tf.add_to_collection('pred_network', output) 96 | 97 | mean_reward = [] 98 | std_reward = [] 99 | 100 | 101 | # Train 102 | init = tf.global_variables_initializer() 103 | #model_path = './bc_policy/' + task + '_' + str(args.num_rollouts) + '_bc' 104 
| #builder = tf.saved_model.builder.SavedModelBuilder(model_path) 105 | with tf.Session() as sess: 106 | sess.run(init) 107 | 108 | for epoch in range(num_epoch + 1): 109 | 110 | num_batch = int(len(obs_train) / batch_size) 111 | 112 | for num in range(num_batch): 113 | 114 | obs_train_batch, act_train_batch = next_batch(batch_size, obs_train, act_train) 115 | 116 | sess.run(train_op, feed_dict = {x: obs_train_batch, y: act_train_batch}) 117 | 118 | if epoch % 10 == 0: 119 | 120 | loss = sess.run(lossfunction, feed_dict = {x: obs_train, y: act_train}) 121 | 122 | print("Number of Epoch: %d, Training Loss = %.08f "%(epoch, loss)) 123 | 124 | test_output = sess.run(output, feed_dict = {x: obs_test}) 125 | 126 | testloss = np.mean((test_output - act_test)**2) 127 | 128 | print ("Testing loss = %.08f" % testloss) 129 | 130 | env = gym.make(args.envname) 131 | max_steps = args.max_timesteps or env.spec.timestep_limit 132 | 133 | returns = [] 134 | observations = [] 135 | actions = [] 136 | for i in range(args.num_rollouts): 137 | print('iter', i) 138 | obs = env.reset() 139 | done = False 140 | totalr = 0. 141 | steps = 0 142 | while not done: 143 | 144 | pre_action = sess.run(output, feed_dict = {x:obs[None,:]}) 145 | observations.append(obs) 146 | actions.append(pre_action) 147 | obs, r, done, _ = env.step(pre_action) 148 | totalr += r 149 | steps += 1 150 | if args.render: 151 | env.render() 152 | #if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 153 | if steps >= max_steps: 154 | break 155 | returns.append(totalr) 156 | 157 | print('returns', returns) 158 | print('mean return', np.mean(returns)) 159 | print('std of return', np.std(returns)) 160 | mean_reward.append(np.mean(returns)) 161 | std_reward.append(np.std(returns)) 162 | 163 | #builder.add_meta_graph_and_variables(sess, ['Training']) 164 | #builder.save 165 | BC_result = {'mean_reward': np.array(mean_reward), 166 | 'std_reward': np.array(std_reward)} 167 | 168 | 169 | outfilename = './' + args.envname + '_' + str(args.num_rollouts) + '_bc_data.pkl' 170 | 171 | with open((outfilename), 'wb') as f: 172 | pickle.dump(BC_result, f, pickle.HIGHEST_PROTOCOL) 173 | 174 | 175 | 176 | 177 | 178 | if __name__ == '__main__': 179 | main() 180 | 181 | -------------------------------------------------------------------------------- /hw1/experts/Ant-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Ant-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/HalfCheetah-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/HalfCheetah-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Hopper-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Hopper-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Humanoid-v2.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Humanoid-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Reacher-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Reacher-v2.pkl -------------------------------------------------------------------------------- /hw1/experts/Walker2d-v2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw1/experts/Walker2d-v2.pkl -------------------------------------------------------------------------------- /hw1/hw1.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | if [ ! -d "expert_data" ]; then 4 | mkdir expert_data 5 | fi 6 | 7 | python run_expert.py experts/Hopper-v2.pkl Hopper-v2 --num_rollouts 20 8 | python run_expert.py experts/Reacher-v2.pkl Reacher-v2 --num_rollouts 400 9 | python behavior_cloning.py experts/Hopper-v2.pkl Hopper-v2 --num_rollouts 20 10 | python behavior_cloning.py experts/Reacher-v2.pkl Reacher-v2 --num_rollouts 400 11 | python DAgger.py experts/Hopper-v2.pkl Hopper-v2 --num_rollouts 20 12 | python plot.py Hopper-v2 --num_rollouts 20 -------------------------------------------------------------------------------- /hw1/load_policy.py: -------------------------------------------------------------------------------- 1 | import pickle, tensorflow as tf, tf_util, numpy as np 2 | 3 | def load_policy(filename): 4 | with open(filename, 'rb') as f: 5 | data = pickle.loads(f.read()) 6 | 7 | # assert len(data.keys()) == 2 8 | nonlin_type = data['nonlin_type'] 9 | policy_type = [k for k in data.keys() if k != 'nonlin_type'][0] 10 | 11 | assert policy_type == 'GaussianPolicy', 'Policy type {} not supported'.format(policy_type) 12 | policy_params = data[policy_type] 13 | 14 | assert set(policy_params.keys()) == {'logstdevs_1_Da', 'hidden', 'obsnorm', 'out'} 15 | 16 | # Keep track of input and output dims (i.e. observation and action dims) for the user 17 | 18 | def build_policy(obs_bo): 19 | def read_layer(l): 20 | assert list(l.keys()) == ['AffineLayer'] 21 | assert sorted(l['AffineLayer'].keys()) == ['W', 'b'] 22 | return l['AffineLayer']['W'].astype(np.float32), l['AffineLayer']['b'].astype(np.float32) 23 | 24 | def apply_nonlin(x): 25 | if nonlin_type == 'lrelu': 26 | return tf_util.lrelu(x, leak=.01) # openai/imitation nn.py:233 27 | elif nonlin_type == 'tanh': 28 | return tf.tanh(x) 29 | else: 30 | raise NotImplementedError(nonlin_type) 31 | 32 | # Build the policy. First, observation normalization. 
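# The Standardizer entry stores the expert's running mean E[x] and mean of squares E[x^2];
# the code below recovers the std as sqrt(max(0, E[x^2] - E[x]^2)) and whitens observations
# as (obs - mean) / (std + 1e-6) before feeding them to the hidden layers.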
33 | assert list(policy_params['obsnorm'].keys()) == ['Standardizer'] 34 | obsnorm_mean = policy_params['obsnorm']['Standardizer']['mean_1_D'] 35 | obsnorm_meansq = policy_params['obsnorm']['Standardizer']['meansq_1_D'] 36 | obsnorm_stdev = np.sqrt(np.maximum(0, obsnorm_meansq - np.square(obsnorm_mean))) 37 | print('obs', obsnorm_mean.shape, obsnorm_stdev.shape) 38 | normedobs_bo = (obs_bo - obsnorm_mean) / (obsnorm_stdev + 1e-6) # 1e-6 constant from Standardizer class in nn.py:409 in openai/imitation 39 | 40 | curr_activations_bd = normedobs_bo 41 | 42 | # Hidden layers next 43 | assert list(policy_params['hidden'].keys()) == ['FeedforwardNet'] 44 | layer_params = policy_params['hidden']['FeedforwardNet'] 45 | for layer_name in sorted(layer_params.keys()): 46 | l = layer_params[layer_name] 47 | W, b = read_layer(l) 48 | curr_activations_bd = apply_nonlin(tf.matmul(curr_activations_bd, W) + b) 49 | 50 | # Output layer 51 | W, b = read_layer(policy_params['out']) 52 | output_bo = tf.matmul(curr_activations_bd, W) + b 53 | return output_bo 54 | 55 | obs_bo = tf.placeholder(tf.float32, [None, None]) 56 | a_ba = build_policy(obs_bo) 57 | policy_fn = tf_util.function([obs_bo], a_ba) 58 | return policy_fn -------------------------------------------------------------------------------- /hw1/plot.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | def load_expert_data (filename): 6 | with open (filename, 'rb') as f: 7 | data = pickle.loads(f.read()) 8 | return data 9 | 10 | 11 | def main(): 12 | 13 | 14 | #Behavior cloning result with the number of epoch 15 | 16 | BC_path = 'Hopper-v2_20_bc_data.pkl' 17 | BC_result = load_expert_data(BC_path) 18 | BC_mean = np.array(BC_result['mean_reward']) 19 | BC_std = np.array(BC_result['std_reward']) 20 | epoch = np.arange(0, 101, 10) 21 | BC_plot = plt.figure(1) 22 | p1, = plt.plot(epoch, BC_mean, color='blue', label='Behavor_cloning' ) 23 | plt.errorbar(epoch, BC_mean, ecolor='r', color='blue', yerr = BC_std, fmt = '-o', elinewidth=2, capsize=4) 24 | plt.suptitle('Behavorial Cloning: Epoches vs. Reward', fontsize=20) 25 | plt.xlabel('Number of Training Epoches') 26 | plt.ylabel('Mean Reward') 27 | plt.legend() 28 | plt.show() 29 | 30 | 31 | 32 | 33 | DAgger_path = './Hopper-v2_20_data.pkl' 34 | DAgger_result = load_expert_data(DAgger_path) 35 | mean = np.array(DAgger_result['mean_reward']) 36 | std = np.array(DAgger_result['std_reward']) 37 | iteration = np.arange(std.shape[0]) 38 | iteration = iteration + 1; 39 | 40 | 41 | DAgger_plot = plt.figure(2) 42 | Dag, = plt.plot(iteration, mean, marker = '*', color='b', label='DAgger Policy') 43 | plt.errorbar(iteration, mean, yerr = std, fmt = '-*',color='b',ecolor='r' , elinewidth=2, capsize=4) 44 | plt.suptitle('DAgger Iterations vs. 
Reward', fontsize=20) 45 | plt.xlabel('DAgger Iteration') 46 | plt.ylabel('Mean Reward') 47 | plt.xlim([0, 6.5]) 48 | plt.ylim([1000, 4000]) 49 | expert = plt.axhline(y=3778.4842779089204, color='k', label='Expert Policy') 50 | bc = plt.axhline(y=2009.9990, color='g', label='Behaviorial Cloning') 51 | plt.legend(loc= 4) 52 | plt.show() 53 | 54 | 55 | 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | -------------------------------------------------------------------------------- /hw1/run_expert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Code to load an expert policy and generate roll-out data for behavioral cloning. 5 | Example usage: 6 | python run_expert.py experts/Humanoid-v1.pkl Humanoid-v1 --render \ 7 | --num_rollouts 20 8 | 9 | Author of this script and included expert policies: Jonathan Ho (hoj@openai.com) 10 | """ 11 | 12 | import os 13 | import pickle 14 | import tensorflow as tf 15 | import numpy as np 16 | import tf_util 17 | import gym 18 | import load_policy 19 | 20 | def main(): 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('expert_policy_file', type=str) 24 | parser.add_argument('envname', type=str) 25 | parser.add_argument('--render', action='store_true') 26 | parser.add_argument("--max_timesteps", type=int) 27 | parser.add_argument('--num_rollouts', type=int, default=20, 28 | help='Number of expert roll outs') 29 | args = parser.parse_args() 30 | 31 | print('loading and building expert policy') 32 | policy_fn = load_policy.load_policy(args.expert_policy_file) 33 | print('loaded and built') 34 | 35 | with tf.Session(): 36 | tf_util.initialize() 37 | 38 | import gym 39 | env = gym.make(args.envname) 40 | max_steps = args.max_timesteps or env.spec.timestep_limit 41 | 42 | returns = [] 43 | observations = [] 44 | actions = [] 45 | for i in range(args.num_rollouts): 46 | print('steps', max_steps) 47 | print('iter', i) 48 | obs = env.reset() 49 | done = False 50 | totalr = 0. 51 | steps = 0 52 | while not done: 53 | action = policy_fn(obs[None,:]) 54 | observations.append(obs) 55 | actions.append(action) 56 | obs, r, done, _ = env.step(action) 57 | totalr += r 58 | steps += 1 59 | if args.render: 60 | env.render() 61 | if steps % 100 == 0: print("%i/%i"%(steps, max_steps)) 62 | if steps >= max_steps: 63 | break 64 | returns.append(totalr) 65 | 66 | print('returns', returns) 67 | print('mean return', np.mean(returns)) 68 | print('std of return', np.std(returns)) 69 | 70 | expert_data = {'observations': np.array(observations), 71 | 'actions': np.array(actions)} 72 | 73 | outfilename = 'expert_data/' + args.envname + '_' + str(args.num_rollouts) + '_data.pkl' 74 | 75 | with open((outfilename), 'wb') as f: 76 | pickle.dump(expert_data, f, pickle.HIGHEST_PROTOCOL) 77 | 78 | if __name__ == '__main__': 79 | main() 80 | -------------------------------------------------------------------------------- /hw2/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 2: Policy Gradient 2 | 3 | For all command-line expressions that used to run my experiments, they are stored in the `hw2.bash` script with annotations of different problems. 4 | 5 | If you want to run the whole expriment, just run: 6 | `./hw2.bash` in the master folder of `train_pg_f18.py` 7 | 8 | (For this bash script, it will store all data file into the `./data` folder) 9 | 10 | I also provided the data I got with expriments: 11 | 12 | 1. 
For problem 4, if you want to get the graph of small batch, the data is stored in `./data_small`, and run `python plot.py data_small/*` then you can get the graph. if you want to get the graph of large batch, the data is stored in `./data_large`, and run `python plot.py data_large/*` then you can get the graph. 13 | 14 | 2. For problem 5, the data is stored in `./data_InvertedPendulum` folder and run `python plot.py data_InvertedPendulum/*` to get the graph. 15 | 16 | 3. For problem 7, the data is stored in `./data_lunar` folder and run `python plot.py data_lunar/*` to get the graph. 17 | 18 | 4. For problem 8, the folder `./data_HalfCheetah`contains all result with different batch size and learning rate. The folder `./data_HalfCheetah_8` stored the result of optimal for four runs. Run `python plot.py data_HalfCheetah/*` to get the graph. 19 | 20 | 21 | -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_None", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_None", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 11, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | 
"env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_None", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_None_HalfCheetah-v2_19-09-2018_16-52-49/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 11, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/params.json: -------------------------------------------------------------------------------- 1 | 
{"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : false, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21/params.json: 
-------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_HalfCheetah-v2_19-09-2018_17-52-24/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/11/vars.pkl -------------------------------------------------------------------------------- 
/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "hc_b50000_r0.02_rtg_bl", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 50000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_HalfCheetah_8/hc_b50000_r0.02_rtg_bl_HalfCheetah-v2_19-09-2018_18-56-14/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "hc_b400_r0.02", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 400, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "hc_b400_r0.02", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 400, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/11/vars.pkl 
-------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "hc_b400_r0.02", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "data/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 400, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_InvertedPendulum/hc_b400_r0.02_InvertedPendulum-v2_19-09-2018_21-01-50/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/11/vars.pkl -------------------------------------------------------------------------------- 
/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_no_rtg_dna_CartPole-v0_18-09-2018_00-41-52/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | 
"exp_name" : "lb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lb_rtg_dna_CartPole-v0_18-09-2018_00-47-18/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lib_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lib_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "lib_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 
| "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_large/lib_rtg_na_CartPole-v0_18-09-2018_00-58-19/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "LunarLanderContinuous-v2", 3 | "exp_name" : "ll_b40000_r0.005", 4 | "gamma" : 0.99, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 40000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "LunarLanderContinuous-v2", 3 | "exp_name" : "ll_b40000_r0.005", 4 | "gamma" : 0.99, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 40000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "LunarLanderContinuous-v2", 3 | "exp_name" : "ll_b40000_r0.005", 4 | "gamma" : 0.99, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 40000, 9 
| "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : true, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_lunar/ll_b40000_r0.005_LunarLanderContinuous-v2_19-09-2018_00-07-43/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_no_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : false, 14 | "seed" : 21, 15 | 
"size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_no_rtg_dna_CartPole-v0_18-09-2018_00-30-26/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_dna", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : false, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21/vars.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_dna_CartPole-v0_18-09-2018_00-32-44/21/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/1/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/11/vars.pkl -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "sb_rtg_na", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "data/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "nn_baseline" : false, 12 | "normalize_advantages" : true, 13 | "reward_to_go" : true, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21/vars.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw2/data_small/sb_rtg_na_CartPole-v0_18-09-2018_00-36-45/21/vars.pkl -------------------------------------------------------------------------------- /hw2/hw2.bash: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eux 3 | 4 | #Problem 4 5 | 6 | python train_pg_f18.py CartPole-v0 -n 100 -b 1000 -e 3 -dna --exp_name sb_no_rtg_dna 7 | 8 | python train_pg_f18.py CartPole-v0 -n 100 -b 1000 -e 3 -rtg -dna --exp_name sb_rtg_dna 9 | 10 | python train_pg_f18.py CartPole-v0 -n 100 -b 1000 -e 3 -rtg --exp_name sb_rtg_na 11 | 12 | python train_pg_f18.py CartPole-v0 -n 100 -b 5000 -e 3 -dna --exp_name lb_no_rtg_dna 13 | 14 | python train_pg_f18.py CartPole-v0 -n 100 -b 5000 -e 3 -rtg -dna --exp_name lb_rtg_dna 15 | 16 | python train_pg_f18.py CartPole-v0 -n 100 -b 5000 -e 3 -rtg --exp_name lb_rtg_na 17 | 18 | #Problem 5 19 | 20 | python train_pg_f18.py InvertedPendulum-v2 -ep 1000 --discount 0.9 -n 100 -e 3 -l 2 -s 64 -b 400 -lr 0.02 -rtg --exp_name hc_b400_r0.02 21 | 22 | #Problem 7 23 | 24 | python train_pg_f18.py LunarLanderContinuous-v2 -ep 1000 --discount 0.99 -n 100 -e 3 -l 2 -s 64 -b 40000 -lr 0.005 -rtg --nn_baseline --exp_name ll_b40000_r0.005 25 | 26 | #Problem 8 27 | 28 | #Find the best batch size and learning rate 29 | for batch in 10000 30000 50000 30 | do 31 | for lr in 0.005 0.01 0.02 32 | do 33 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b $batch -lr $lr -rtg --nn_baseline --exp_name hc_b${batch}_r${lr} 34 | done 35 | done 36 | 37 | 38 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 --exp_name hc_b50000_r0.02_None 39 | 40 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 -rtg --exp_name hc_b50000_r0.02_rtg 41 | 42 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 --nn_baseline --exp_name hc_b50000_r0.02_bl 43 | 44 | python train_pg_f18.py HalfCheetah-v2 -ep 150 --discount 0.95 -n 100 -e 3 -l 2 -s 32 -b 50000 -lr 0.02 -rtg --nn_baseline --exp_name hc_b50000_r0.02_rtg_bl 45 | 46 | -------------------------------------------------------------------------------- /hw2/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging.
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw2/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | plot all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir.
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw3/DDQN_Pong.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DDQN_Pong.pkl -------------------------------------------------------------------------------- /hw3/DQNAtari_Ponglr_multi0.1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQNAtari_Ponglr_multi0.1.pkl -------------------------------------------------------------------------------- /hw3/DQNAtari_Ponglr_multi10.0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQNAtari_Ponglr_multi10.0.pkl -------------------------------------------------------------------------------- /hw3/DQNAtari_Ponglr_multi5.0.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQNAtari_Ponglr_multi5.0.pkl -------------------------------------------------------------------------------- /hw3/DQN_Pong.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/DQN_Pong.pkl -------------------------------------------------------------------------------- /hw3/Deep_RL_Assignment_3__Q_Learning_and_Actor_Critic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/Deep_RL_Assignment_3__Q_Learning_and_Actor_Critic.pdf -------------------------------------------------------------------------------- /hw3/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 3: Q-Learning 2 | 3 | 4 | --- 5 | Before doing anything, first replace `gym/envs/box2d/lunar_lander.py` with the provided `lunar_lander.py` file. 6 | 7 | ### Problem 1 8 | 9 | ##### Question 1 10 | 11 | Run `python run_dqn_atari.py` directly to train vanilla Q-learning with a random seed and a learning rate multiplier of 1. 12 | 13 | Plot 14 | `python p1q1.py` (replace the `.pkl` filename) 15 | 16 | ##### Question 2 17 | 18 | Run `python run_dqn_atari.py --double` to train double Q-learning with a random seed. 19 | 20 | Plot 21 | `python p1q2.py` (replace the `.pkl` filename) to plot the vanilla and double Q-learning curves together. 22 | 23 | ##### Question 3 24 | 25 | Run `python run_dqn_atari.py -m <multiplier> --seed <seed> [--double]` with the desired learning rate multiplier and a fixed seed of **5000**; if `--double` is given, double Q-learning is used, otherwise vanilla Q-learning. 26 | 27 | Plot 28 | `python p1q3.py` (replace the `.pkl` filename) to plot the learning curves for the different learning rate multipliers.
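For reference, here is a minimal sketch (not part of the original scripts) of reading one of the dumped learning-curve `.pkl` files back before plotting. The keys `'Timestep'`, `'mean'`, and `'best'` and the `DQN_Pong.pkl` filename follow the usage in `p1q1.py` further down in this dump; substitute the file produced by your own run.

```python
import pickle

# Placeholder filename: replace with the .pkl file written by your own run.
with open('DQN_Pong.pkl', 'rb') as f:
    data = pickle.load(f)

timesteps   = data['Timestep']  # environment timesteps (x-axis of the learning curve)
mean_reward = data['mean']      # mean episode reward at each logged timestep
best_reward = data['best']      # best mean episode reward seen so far
print('final mean reward: %.1f, best mean reward: %.1f' % (mean_reward[-1], best_reward[-1]))
```

The `p1q1.py`–`p1q3.py` scripts wrap this kind of loading in the matplotlib plotting calls shown later in this dump.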
29 | 30 | ### Problem 2 31 | 32 | ##### Question 1 33 | Run 34 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 1_1 -ntu 1 -ngsptu 1` 35 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 1_100 -ntu 1 -ngsptu 100` 36 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 100_1 -ntu 100 -ngsptu 1` 37 | `python train_ac_f18.py CartPole-v0 -n 100 -b 1000 -e 3 --exp_name 10_10 -ntu 10 -ngsptu 10` 38 | 39 | Plot 40 | `python plot.py data_CartPole/*` 41 | 42 | ##### Question 2 43 | Run 44 | `python train_ac_f18.py InvertedPendulum-v2 -ep 1000 --discount 0.95 -n 100 -e 3 -l 2 -s 64 -b 5000 -lr 0.01 --exp_name 10_10 -ntu 10 -ngsptu 10` for the InvertedPendulum task 45 | Run 46 | `python train_ac_f18.py HalfCheetah-v2 -ep 150 --discount 0.90 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --exp_name 10_10 -ntu 10 -ngsptu 10` for the HalfCheetah task 47 | 48 | Plot 49 | `python plot.py data_InvertedPendulum/*` 50 | 51 | `python plot.py data_HalfCheetah/*` 52 | 53 | -------------------------------------------------------------------------------- /hw3/data:pkl/1a898ddf-2704-4168-b92f-beca2086c5ffAtari_DDQN.pkl.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/1a898ddf-2704-4168-b92f-beca2086c5ffAtari_DDQN.pkl.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/3804ab6d-065b-4f94-aa54-ba957272c6b9Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/3804ab6d-065b-4f94-aa54-ba957272c6b9Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/43109373-50a0-47a8-b483-17921386ed82Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/43109373-50a0-47a8-b483-17921386ed82Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/518f88f0-7ffa-47ae-b705-365b31717729Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/518f88f0-7ffa-47ae-b705-365b31717729Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/63926721-2624-40a7-b029-cee54d11097aLander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/63926721-2624-40a7-b029-cee54d11097aLander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/8425b8e8-19c8-418e-91c2-8131d6e72849Lander_vanilla.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/8425b8e8-19c8-418e-91c2-8131d6e72849Lander_vanilla.pkl
-------------------------------------------------------------------------------- /hw3/data:pkl/9e01eaef-6082-423a-9ff2-66798a5d1942Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/9e01eaef-6082-423a-9ff2-66798a5d1942Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/Atari_DDQN.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/Atari_DDQN.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQN-Lunar-test1.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQN-Lunar-test1.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNFalseLander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNFalseLander_1e4.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander_1e4.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNFalseLander_lr2e3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander_lr2e3.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNFalseLander_lr3e3.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNFalseLander_lr3e3.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DDQNTrueLander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DDQNTrueLander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DQN-Atari-Pong.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DQN-Atari-Pong.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/DQN-Lunar-2: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DQN-Lunar-2 -------------------------------------------------------------------------------- /hw3/data:pkl/DQN-Pong.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/DQN-Pong.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/b7445890-58aa-4fea-9628-bc1f08fdde62Lander.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/b7445890-58aa-4fea-9628-bc1f08fdde62Lander.pkl -------------------------------------------------------------------------------- /hw3/data:pkl/ba946a9b-c079-4ab6-b343-d1bccfc75be6Lander_DQN.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data:pkl/ba946a9b-c079-4ab6-b343-d1bccfc75be6Lander_DQN.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/.DS_Store -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "100_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 100, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "100_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 100, 14 | "seed" : 11, 15 | "size" : 64} 
-------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "100_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 100, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_100_1_CartPole-v0_02-10-2018_17-05-47/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "10_10", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "10_10", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- 
/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "10_10", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_10_10_CartPole-v0_02-10-2018_17-09-03/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_100", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 100, 13 | "num_target_updates" : 1, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_100", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 100, 13 | "num_target_updates" : 1, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11/vars.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_100", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 100, 13 | "num_target_updates" : 1, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_100_CartPole-v0_02-10-2018_17-07-35/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 1, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 1, 14 | "seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11/vars.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "CartPole-v0", 3 | "exp_name" : "1_1", 4 | "gamma" : 1.0, 5 | "learning_rate" : 0.005, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21", 7 | "max_path_length" : null, 8 | "min_timesteps_per_batch" : 1000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 1, 13 | "num_target_updates" : 1, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_CartPole/ac_1_1_CartPole-v0_02-10-2018_09-37-30/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/.DS_Store -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 30000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 1, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 30000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 11, 
15 | "size" : 32} -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "HalfCheetah-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.9, 5 | "learning_rate" : 0.02, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21", 7 | "max_path_length" : 150.0, 8 | "min_timesteps_per_batch" : 30000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 21, 15 | "size" : 32} -------------------------------------------------------------------------------- /hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_HalfCheetah/ac_10_10_HalfCheetah-v2_08-10-2018_18-58-06/21/vars.pkl -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.01, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 1, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/1/vars.pkl -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.01, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | 
"seed" : 11, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/11/vars.pkl -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "InvertedPendulum-v2", 3 | "exp_name" : "10_10", 4 | "gamma" : 0.95, 5 | "learning_rate" : 0.01, 6 | "logdir" : "/home/FanZhang/hw3/data/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21", 7 | "max_path_length" : 1000.0, 8 | "min_timesteps_per_batch" : 5000, 9 | "n_iter" : 100, 10 | "n_layers" : 2, 11 | "normalize_advantages" : true, 12 | "num_grad_steps_per_target_update" : 10, 13 | "num_target_updates" : 10, 14 | "seed" : 21, 15 | "size" : 64} -------------------------------------------------------------------------------- /hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/data_InvertedPendulum/ac_10_10_InvertedPendulum-v2_02-10-2018_17-18-45/21/vars.pkl -------------------------------------------------------------------------------- /hw3/figures/p1q1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p1q1.png -------------------------------------------------------------------------------- /hw3/figures/p1q2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p1q2.png -------------------------------------------------------------------------------- /hw3/figures/p1q3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p1q3.png -------------------------------------------------------------------------------- /hw3/figures/p2q1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p2q1.png -------------------------------------------------------------------------------- /hw3/figures/p2q2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p2q2_1.png -------------------------------------------------------------------------------- /hw3/figures/p2q2_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/figures/p2q2_2.png -------------------------------------------------------------------------------- /hw3/hw3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw3/hw3.pdf -------------------------------------------------------------------------------- /hw3/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw3/p1q1.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | with open('DQN_Pong.pkl', 'rb') as f: 7 | data = pickle.loads(f.read()) 8 | time_step = data['Timestep'] 9 | mean_reward = data['mean'] 10 | best_reward = data['best'] 11 | best_vanilla = best_reward[-1] 12 | plt.figure() 13 | plt.plot(time_step, mean_reward, color='red', linestyle = '-') 14 | plt.plot(time_step, best_reward, color='blue', linestyle = '--') 15 | plt.xlabel('Timesteps') 16 | plt.ylabel('Mean Episode Reward') 17 | plt.legend(['Mean_DQN','Best Mean_DQN']) 18 | plt.title('Vanilla Q-Learning on Pong', fontsize=12) 19 | plt.grid() 20 | ax = plt.gca() 21 | ax.xaxis.get_major_formatter().set_powerlimits((0,0)) 22 | plt.show() -------------------------------------------------------------------------------- /hw3/p1q2.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | with open('DQN_Pong.pkl', 'rb') as f: 7 | data = pickle.loads(f.read()) 8 | time_step = data['Timestep'] 9 | mean_reward = data['mean'] 10 | best_reward = data['best'] 11 | best_vanilla = best_reward[-1] 12 | print(best_vanilla) 13 | 14 | with open('DDQN_Pong.pkl', 'rb') as l: 15 | data_d = pickle.loads(l.read()) 16 | time_step_d = data_d['Timestep'] 17 | mean_reward_d = data_d['mean'] 18 | best_reward_d = data_d['best'] 19 | best_DDQN = best_reward_d[-1] 20 | print(best_DDQN) 21 | 22 | plt.figure() 23 | plt.plot(time_step, mean_reward, color='green', linestyle = '-') 24 | plt.plot(time_step, best_reward, color='green', linestyle = '--') 25 | plt.plot(time_step_d, mean_reward_d, color='red', linestyle = '-') 26 | plt.plot(time_step_d, best_reward_d, color='red', linestyle = '--') 27 | plt.title('Vanilla Q-Learning Vs. 
Double Q-Learning on Pong', fontsize=11) 28 | plt.xlabel('Timesteps') 29 | plt.ylabel('Mean Episode Reward') 30 | plt.legend(['Mean_DQN', 'Best Mean_DQN', 'Mean_DDQN', 'Best Mean_DDQN']) 31 | plt.grid() 32 | ax = plt.gca() 33 | ax.xaxis.get_major_formatter().set_powerlimits((0,0)) 34 | plt.show() -------------------------------------------------------------------------------- /hw3/p1q3.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | with open('DQN_Pong.pkl', 'rb') as a: 7 | data = pickle.loads(a.read()) 8 | time_step_a = data['Timestep'][0:300] 9 | mean_reward_a = data['mean'][0:300] 10 | best_reward_a = data['best'][0:300] 11 | 12 | with open('DQNAtari_Ponglr_multi0.1.pkl', 'rb') as b: 13 | data = pickle.loads(b.read()) 14 | time_step_b = data['Timestep'][0:300] 15 | mean_reward_b = data['mean'][0:300] 16 | best_reward_b = data['best'][0:300] 17 | 18 | with open('DQNAtari_Ponglr_multi5.0.pkl', 'rb') as c: 19 | data = pickle.loads(c.read()) 20 | time_step_c = data['Timestep'][0:300] 21 | mean_reward_c = data['mean'][0:300] 22 | best_reward_c = data['best'][0:300] 23 | 24 | with open('DQNAtari_Ponglr_multi10.0.pkl', 'rb') as d: 25 | data = pickle.loads(d.read()) 26 | time_step_d = data['Timestep'][0:300] 27 | mean_reward_d = data['mean'][0:300] 28 | best_reward_d = data['best'][0:300] 29 | 30 | 31 | plt.figure() 32 | plt.plot(time_step_a, mean_reward_a, color='green', linestyle = '-') 33 | plt.plot(time_step_a, best_reward_a, color='green', linestyle = '--') 34 | 35 | plt.plot(time_step_b, mean_reward_b, color='red', linestyle = '-') 36 | plt.plot(time_step_b, best_reward_b, color='red', linestyle = '--') 37 | 38 | plt.plot(time_step_c, mean_reward_c, color='blue', linestyle = '-') 39 | plt.plot(time_step_c, best_reward_c, color='blue', linestyle = '--') 40 | 41 | plt.plot(time_step_d, mean_reward_d, color='magenta', linestyle = '-') 42 | plt.plot(time_step_d, best_reward_d, color='magenta', linestyle = '--') 43 | 44 | plt.title('Q-learning on Pong with different learning rate', fontsize=11) 45 | plt.xlabel('Timesteps') 46 | plt.ylabel('Mean Episode Reward') 47 | plt.grid() 48 | plt.legend(['Mean_lr_multi = 1', 'Best_lr_multi = 1', 'Mean_lr_multi = 0.1', 'Best_lr_multi = 0.1', 'Mean_lr_multi = 5', 'Best_lr_multi = 5', 'Mean_lr_multi = 10', 'Best_lr_multi = 10']) 49 | ax = plt.gca() 50 | ax.xaxis.get_major_formatter().set_powerlimits((0,0)) 51 | plt.show() -------------------------------------------------------------------------------- /hw3/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. 
You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | plt.show() 59 | 60 | 61 | def get_datasets(fpath, condition=None): 62 | unit = 0 63 | datasets = [] 64 | for root, dir, files in os.walk(fpath): 65 | if 'log.txt' in files: 66 | param_path = open(os.path.join(root,'params.json')) 67 | params = json.load(param_path) 68 | exp_name = params['exp_name'] 69 | 70 | log_path = os.path.join(root,'log.txt') 71 | experiment_data = pd.read_table(log_path) 72 | 73 | experiment_data.insert( 74 | len(experiment_data.columns), 75 | 'Unit', 76 | unit 77 | ) 78 | experiment_data.insert( 79 | len(experiment_data.columns), 80 | 'Condition', 81 | condition or exp_name 82 | ) 83 | 84 | datasets.append(experiment_data) 85 | unit += 1 86 | 87 | return datasets 88 | 89 | 90 | def main(): 91 | import argparse 92 | parser = argparse.ArgumentParser() 93 | parser.add_argument('logdir', nargs='*') 94 | parser.add_argument('--legend', nargs='*') 95 | parser.add_argument('--value', default='AverageReturn', nargs='*') 96 | args = parser.parse_args() 97 | 98 | use_legend = False 99 | if args.legend is not None: 100 | assert len(args.legend) == len(args.logdir), \ 101 | "Must give a legend title for each set of experiments." 
102 | use_legend = True 103 | 104 | data = [] 105 | if use_legend: 106 | for logdir, legend_title in zip(args.logdir, args.legend): 107 | data += get_datasets(logdir, legend_title) 108 | else: 109 | for logdir in args.logdir: 110 | data += get_datasets(logdir) 111 | 112 | if isinstance(args.value, list): 113 | values = args.value 114 | else: 115 | values = [args.value] 116 | for value in values: 117 | plot_data(data, value=value) 118 | 119 | if __name__ == "__main__": 120 | main() 121 | -------------------------------------------------------------------------------- /hw3/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | gym[atari] 3 | box2d 4 | mujoco-py==1.50.1.56 5 | tensorflow 6 | numpy 7 | seaborn 8 | opencv-python 9 | -------------------------------------------------------------------------------- /hw3/run_dqn_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(img_in, num_actions, scope, reuse=False): 16 | # as described in https://storage.googleapis.com/deepmind-data/assets/papers/DeepMindNature14236Paper.pdf 17 | with tf.variable_scope(scope, reuse=reuse): 18 | out = img_in 19 | with tf.variable_scope("convnet"): 20 | # original architecture 21 | out = layers.convolution2d(out, num_outputs=32, kernel_size=8, stride=4, activation_fn=tf.nn.relu) 22 | out = layers.convolution2d(out, num_outputs=64, kernel_size=4, stride=2, activation_fn=tf.nn.relu) 23 | out = layers.convolution2d(out, num_outputs=64, kernel_size=3, stride=1, activation_fn=tf.nn.relu) 24 | 25 | out = layers.flatten(out) 26 | with tf.variable_scope("action_value"): 27 | out = layers.fully_connected(out, num_outputs=512, activation_fn=tf.nn.relu) 28 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 29 | 30 | return out 31 | 32 | def atari_learn(env, 33 | session, 34 | num_timesteps, 35 | lr_multiplier, 36 | double_q): 37 | # This is just a rough estimate 38 | num_iterations = float(num_timesteps) / 4.0 39 | 40 | lr_multiplier = lr_multiplier 41 | print("The learning rate multiplier is :", lr_multiplier) 42 | lr_schedule = PiecewiseSchedule([ 43 | (0, 1e-4 * lr_multiplier), 44 | (num_iterations / 10, 1e-4 * lr_multiplier), 45 | (num_iterations / 2, 5e-5 * lr_multiplier), 46 | ], 47 | outside_value=5e-5 * lr_multiplier) 48 | optimizer = dqn.OptimizerSpec( 49 | constructor=tf.train.AdamOptimizer, 50 | kwargs=dict(epsilon=1e-4), 51 | lr_schedule=lr_schedule 52 | ) 53 | 54 | def stopping_criterion(env, t): 55 | # notice that here t is the number of steps of the wrapped env, 56 | # which is different from the number of steps in the underlying env 57 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 58 | 59 | exploration_schedule = PiecewiseSchedule( 60 | [ 61 | (0, 1.0), 62 | (1e6, 0.1), 63 | (num_iterations / 2, 0.01), 64 | ], outside_value=0.01 65 | ) 66 | 67 | dqn.learn( 68 | env=env, 69 | q_func=atari_model, 70 | optimizer_spec=optimizer, 71 | session=session, 72 | exploration=exploration_schedule, 73 | stopping_criterion=stopping_criterion, 74 | replay_buffer_size=1000000, 75 | batch_size=32, 76 | gamma=0.99, 77 | learning_starts=50000, 78 | 
learning_freq=4, 79 | frame_history_len=4, 80 | target_update_freq=10000, 81 | grad_norm_clipping=10, 82 | rew_file = 'Atari_Pong' + 'lr_multi' + str(lr_multiplier), 83 | double_q=double_q 84 | 85 | ) 86 | env.close() 87 | 88 | def get_available_gpus(): 89 | from tensorflow.python.client import device_lib 90 | local_device_protos = device_lib.list_local_devices() 91 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 92 | 93 | def set_global_seeds(i): 94 | try: 95 | import tensorflow as tf 96 | except ImportError: 97 | pass 98 | else: 99 | tf.set_random_seed(i) 100 | np.random.seed(i) 101 | random.seed(i) 102 | 103 | def get_session(): 104 | tf.reset_default_graph() 105 | tf_config = tf.ConfigProto( 106 | inter_op_parallelism_threads=1, 107 | intra_op_parallelism_threads=1) 108 | session = tf.Session(config=tf_config) 109 | print("AVAILABLE GPUS: ", get_available_gpus()) 110 | return session 111 | 112 | def get_env(task, seed): 113 | env = gym.make('PongNoFrameskip-v4') 114 | 115 | set_global_seeds(seed) 116 | env.seed(seed) 117 | 118 | expt_dir = '/tmp/hw3_vid_dir2/' 119 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 120 | env = wrap_deepmind(env) 121 | 122 | return env 123 | 124 | def main(): 125 | import argparse 126 | parser = argparse.ArgumentParser() 127 | parser.add_argument('--multiplier', '-m', type = float, default = 1) 128 | parser.add_argument('--seed', action='store_true') 129 | parser.add_argument('--double', action = 'store_true') 130 | args = parser.parse_args() 131 | 132 | 133 | # Get Atari games. 134 | task = gym.make('PongNoFrameskip-v4') 135 | 136 | if args.seed: 137 | seed = 5000 138 | print('seed = %d' % seed) 139 | # Run training 140 | else: 141 | seed = random.randint(0, 9999) 142 | print('random seed = %d' % seed) 143 | env = get_env(task, seed) 144 | session = get_session() 145 | atari_learn(env, session, num_timesteps=2e8, lr_multiplier = args.multiplier, double_q = args.double) 146 | 147 | if __name__ == "__main__": 148 | main() 149 | -------------------------------------------------------------------------------- /hw3/run_dqn_lander.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | 13 | import argparse 14 | 15 | def lander_model(obs, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = obs 18 | with tf.variable_scope("action_value"): 19 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 20 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 22 | return out 23 | 24 | def lander_optimizer(): 25 | return dqn.OptimizerSpec( 26 | constructor=tf.train.AdamOptimizer, 27 | lr_schedule=ConstantSchedule(1e-4), 28 | kwargs={} 29 | ) 30 | 31 | def lander_stopping_criterion(num_timesteps): 32 | def stopping_criterion(env, t): 33 | # notice that here t is the number of steps of the wrapped env, 34 | # which is different from the number of steps in the underlying env 35 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 36 | return stopping_criterion 37 | 38 | def lander_exploration_schedule(num_timesteps): 39 | return 
PiecewiseSchedule( 40 | [ 41 | (0, 1), 42 | (num_timesteps * 0.1, 0.02), 43 | ], outside_value=0.02 44 | ) 45 | 46 | def lander_kwargs(): 47 | return { 48 | 'optimizer_spec': lander_optimizer(), 49 | 'q_func': lander_model, 50 | 'replay_buffer_size': 50000, 51 | 'batch_size': 32, 52 | 'gamma': 1.00, 53 | 'learning_starts': 1000, 54 | 'learning_freq': 1, 55 | 'frame_history_len': 1, 56 | 'target_update_freq': 3000, 57 | 'grad_norm_clipping': 10, 58 | 'lander': True 59 | } 60 | 61 | def lander_learn(env, 62 | session, 63 | num_timesteps, 64 | seed): 65 | 66 | optimizer = lander_optimizer() 67 | stopping_criterion = lander_stopping_criterion(num_timesteps) 68 | exploration_schedule = lander_exploration_schedule(num_timesteps) 69 | 70 | dqn.learn( 71 | env=env, 72 | session=session, 73 | exploration=lander_exploration_schedule(num_timesteps), 74 | stopping_criterion=lander_stopping_criterion(num_timesteps), 75 | rew_file = 'Lander', 76 | double_q=False, 77 | **lander_kwargs() 78 | ) 79 | env.close() 80 | 81 | def set_global_seeds(i): 82 | tf.set_random_seed(i) 83 | np.random.seed(i) 84 | random.seed(i) 85 | 86 | def get_session(): 87 | tf.reset_default_graph() 88 | tf_config = tf.ConfigProto( 89 | inter_op_parallelism_threads=1, 90 | intra_op_parallelism_threads=1, 91 | device_count={'GPU': 0}) 92 | # GPUs don't significantly speed up deep Q-learning for lunar lander, 93 | # since the observations are low-dimensional 94 | session = tf.Session(config=tf_config) 95 | return session 96 | 97 | def get_env(seed): 98 | env = gym.make('LunarLander-v2') 99 | 100 | set_global_seeds(seed) 101 | env.seed(seed) 102 | 103 | expt_dir = '/tmp/hw3_vid_dir/' 104 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True, video_callable=False) 105 | 106 | return env 107 | 108 | def main(): 109 | # Run training 110 | seed = np.random.randint(9999) # you may want to randomize this 111 | print('random seed = %d' % seed) 112 | env = get_env(seed) 113 | session = get_session() 114 | set_global_seeds(seed) 115 | lander_learn(env, session, num_timesteps=500000, seed=seed) 116 | 117 | if __name__ == "__main__": 118 | main() 119 | -------------------------------------------------------------------------------- /hw3/run_dqn_ram.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import gym 3 | from gym import wrappers 4 | import os.path as osp 5 | import random 6 | import numpy as np 7 | import tensorflow as tf 8 | import tensorflow.contrib.layers as layers 9 | 10 | import dqn 11 | from dqn_utils import * 12 | from atari_wrappers import * 13 | 14 | 15 | def atari_model(ram_in, num_actions, scope, reuse=False): 16 | with tf.variable_scope(scope, reuse=reuse): 17 | out = ram_in 18 | #out = tf.concat(1,(ram_in[:,4:5],ram_in[:,8:9],ram_in[:,11:13],ram_in[:,21:22],ram_in[:,50:51], ram_in[:,60:61],ram_in[:,64:65])) 19 | with tf.variable_scope("action_value"): 20 | out = layers.fully_connected(out, num_outputs=256, activation_fn=tf.nn.relu) 21 | out = layers.fully_connected(out, num_outputs=128, activation_fn=tf.nn.relu) 22 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.relu) 23 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 24 | 25 | return out 26 | 27 | def atari_learn(env, 28 | session, 29 | num_timesteps): 30 | # This is just a rough estimate 31 | num_iterations = float(num_timesteps) / 4.0 32 | 33 | lr_multiplier = 1.0 34 | lr_schedule = PiecewiseSchedule([ 35 | (0, 1e-4 * lr_multiplier), 36 | 
(num_iterations / 10, 1e-4 * lr_multiplier), 37 | (num_iterations / 2, 5e-5 * lr_multiplier), 38 | ], 39 | outside_value=5e-5 * lr_multiplier) 40 | optimizer = dqn.OptimizerSpec( 41 | constructor=tf.train.AdamOptimizer, 42 | kwargs=dict(epsilon=1e-4), 43 | lr_schedule=lr_schedule 44 | ) 45 | 46 | def stopping_criterion(env, t): 47 | # notice that here t is the number of steps of the wrapped env, 48 | # which is different from the number of steps in the underlying env 49 | return get_wrapper_by_name(env, "Monitor").get_total_steps() >= num_timesteps 50 | 51 | exploration_schedule = PiecewiseSchedule( 52 | [ 53 | (0, 0.2), 54 | (1e6, 0.1), 55 | (num_iterations / 2, 0.01), 56 | ], outside_value=0.01 57 | ) 58 | 59 | dqn.learn( 60 | env, 61 | q_func=atari_model, 62 | optimizer_spec=optimizer, 63 | session=session, 64 | exploration=exploration_schedule, 65 | stopping_criterion=stopping_criterion, 66 | replay_buffer_size=1000000, 67 | batch_size=32, 68 | gamma=0.99, 69 | learning_starts=50000, 70 | learning_freq=4, 71 | frame_history_len=1, 72 | target_update_freq=10000, 73 | grad_norm_clipping=10 74 | ) 75 | env.close() 76 | 77 | def get_available_gpus(): 78 | from tensorflow.python.client import device_lib 79 | local_device_protos = device_lib.list_local_devices() 80 | return [x.physical_device_desc for x in local_device_protos if x.device_type == 'GPU'] 81 | 82 | def set_global_seeds(i): 83 | try: 84 | import tensorflow as tf 85 | except ImportError: 86 | pass 87 | else: 88 | tf.set_random_seed(i) 89 | np.random.seed(i) 90 | random.seed(i) 91 | 92 | def get_session(): 93 | tf.reset_default_graph() 94 | tf_config = tf.ConfigProto( 95 | inter_op_parallelism_threads=1, 96 | intra_op_parallelism_threads=1) 97 | session = tf.Session(config=tf_config) 98 | print("AVAILABLE GPUS: ", get_available_gpus()) 99 | return session 100 | 101 | def get_env(seed): 102 | env = gym.make('Pong-ram-v0') 103 | 104 | set_global_seeds(seed) 105 | env.seed(seed) 106 | 107 | expt_dir = '/tmp/hw3_vid_dir/' 108 | env = wrappers.Monitor(env, osp.join(expt_dir, "gym"), force=True) 109 | env = wrap_deepmind_ram(env) 110 | 111 | return env 112 | 113 | def main(): 114 | # Run training 115 | seed = 0 # Use a seed of zero (you may want to randomize the seed!) 116 | env = get_env(seed) 117 | session = get_session() 118 | atari_learn(env, session, num_timesteps=int(4e7)) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw4/Deep_RL_Assignment_4__Model_Based_RL.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw4/Deep_RL_Assignment_4__Model_Based_RL.pdf -------------------------------------------------------------------------------- /hw4/Readme.md: -------------------------------------------------------------------------------- 1 | ###CS294-112 Assignment 4: Model-Based RL 2 | 3 | --- 4 | 5 | To run the whole solution to all problems, just run the `run_all.sh` in the terminal. 6 | 7 | 8 | The command is `bash ./run_all.sh` and all result data and figures will be saved in the related folders. 
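Individual problems can also be run with the same commands that `run_all.sh` wraps, e.g. `python main.py q1 --exp_name exp` for Q1, or `python main.py q3 --exp_name horizon15 --mpc_horizon 15` followed by `python plot.py --exps HalfCheetah_q3_horizon15 --save HalfCheetah_q3_horizon15` for a single Q3 setting (commands taken from `run_all.sh`; `plot.py` reads `data/<exp_name>/log.csv` and writes the figure to `plots/`).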
-------------------------------------------------------------------------------- /hw4/half_cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 1) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, action): 12 | xposbefore = self.sim.data.qpos[0] 13 | self.do_simulation(action, self.frame_skip) 14 | xposafter = self.sim.data.qpos[0] 15 | ob = self._get_obs() 16 | reward_ctrl = - 0.1 * np.square(action).sum() 17 | reward_run = (xposafter - xposbefore)/self.dt 18 | reward = reward_ctrl + reward_run 19 | done = False 20 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 21 | 22 | def _get_obs(self): 23 | return np.concatenate([ 24 | self.sim.data.qpos.flat[1:], 25 | self.sim.data.qvel.flat, 26 | self.get_body_com("torso").flat, 27 | # self.get_body_comvel("torso").flat, 28 | ]) 29 | 30 | def reset_model(self): 31 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 32 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 33 | self.set_state(qpos, qvel) 34 | return self._get_obs() 35 | 36 | def viewer_setup(self): 37 | self.viewer.cam.distance = self.model.stat.extent * 0.5 38 | 39 | @staticmethod 40 | def cost_fn(states, actions, next_states): 41 | is_tf = tf.contrib.framework.is_tensor(states) 42 | is_single_state = (len(states.get_shape()) == 1) if is_tf else (len(states.shape) == 1) 43 | 44 | if is_single_state: 45 | states = states[None, ...] 46 | actions = actions[None, ...] 47 | next_states = next_states[None, ...] 
48 | 49 | scores = tf.zeros(actions.get_shape()[0].value) if is_tf else np.zeros(actions.shape[0]) 50 | 51 | heading_penalty_factor = 10 52 | 53 | # don't move front shin back so far that you tilt forward 54 | front_leg = states[:, 5] 55 | my_range = 0.2 56 | if is_tf: 57 | scores += tf.cast(front_leg >= my_range, tf.float32) * heading_penalty_factor 58 | else: 59 | scores += (front_leg >= my_range) * heading_penalty_factor 60 | 61 | front_shin = states[:, 6] 62 | my_range = 0 63 | if is_tf: 64 | scores += tf.cast(front_shin >= my_range, tf.float32) * heading_penalty_factor 65 | else: 66 | scores += (front_shin >= my_range) * heading_penalty_factor 67 | 68 | front_foot = states[:, 7] 69 | my_range = 0 70 | if is_tf: 71 | scores += tf.cast(front_foot >= my_range, tf.float32) * heading_penalty_factor 72 | else: 73 | scores += (front_foot >= my_range) * heading_penalty_factor 74 | 75 | scores -= (next_states[:, 17] - states[:, 17]) / 0.01 76 | 77 | if is_single_state: 78 | scores = scores[0] 79 | 80 | return scores 81 | -------------------------------------------------------------------------------- /hw4/main.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import time 4 | 5 | from half_cheetah_env import HalfCheetahEnv 6 | from logger import logger 7 | from model_based_rl import ModelBasedRL 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('question', type=str, choices=('q1', 'q2', 'q3')) 11 | parser.add_argument('--exp_name', type=str, default=None) 12 | parser.add_argument('--env', type=str, default='HalfCheetah', choices=('HalfCheetah',)) 13 | parser.add_argument('--render', action='store_true') 14 | parser.add_argument('--mpc_horizon', type=int, default=15) 15 | parser.add_argument('--num_random_action_selection', type=int, default=4096) 16 | parser.add_argument('--nn_layers', type=int, default=1) 17 | args = parser.parse_args() 18 | 19 | data_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data') 20 | exp_name = '{0}_{1}_{2}'.format(args.env, 21 | args.question, 22 | args.exp_name if args.exp_name else time.strftime("%d-%m-%Y_%H-%M-%S")) 23 | exp_dir = os.path.join(data_dir, exp_name) 24 | assert not os.path.exists(exp_dir),\ 25 | 'Experiment directory {0} already exists.
Either delete the directory, or run the experiment with a different name'.format(exp_dir) 26 | os.makedirs(exp_dir, exist_ok=True) 27 | logger.setup(exp_name, os.path.join(exp_dir, 'log.txt'), 'debug') 28 | 29 | env = { 30 | 'HalfCheetah': HalfCheetahEnv() 31 | }[args.env] 32 | 33 | mbrl = ModelBasedRL(env=env, 34 | render=args.render, 35 | mpc_horizon=args.mpc_horizon, 36 | num_random_action_selection=args.num_random_action_selection, 37 | nn_layers=args.nn_layers) 38 | 39 | run_func = { 40 | 'q1': mbrl.run_q1, 41 | 'q2': mbrl.run_q2, 42 | 'q3': mbrl.run_q3 43 | }[args.question] 44 | run_func() 45 | -------------------------------------------------------------------------------- /hw4/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | import matplotlib.pyplot as plt 5 | import matplotlib.cm as cm 6 | import pandas 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--exps', nargs='+', type=str) 11 | parser.add_argument('--save', type=str, default=None) 12 | args = parser.parse_args() 13 | 14 | f, ax = plt.subplots(1, 1) 15 | for i, exp in enumerate(args.exps): 16 | log_fname = os.path.join('data', exp, 'log.csv') 17 | csv = pandas.read_csv(log_fname) 18 | 19 | color = cm.viridis(i / float(len(args.exps))) 20 | ax.plot(csv['Itr'], csv['ReturnAvg'], color=color, label=exp) 21 | ax.fill_between(csv['Itr'], csv['ReturnAvg'] - csv['ReturnStd'], csv['ReturnAvg'] + csv['ReturnStd'], 22 | color=color, alpha=0.2) 23 | 24 | ax.legend() 25 | ax.set_xlabel('Iteration') 26 | ax.set_ylabel('Return') 27 | 28 | if args.save: 29 | os.makedirs('plots', exist_ok=True) 30 | f.savefig(os.path.join('plots', args.save + '.jpg')) 31 | else: 32 | plt.show() 33 | -------------------------------------------------------------------------------- /hw4/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | matplotlib 3 | colorlog -------------------------------------------------------------------------------- /hw4/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ########## 4 | ### Q1 ### 5 | ########## 6 | 7 | python main.py q1 --exp_name exp 8 | 9 | ########## 10 | ### Q2 ### 11 | ########## 12 | 13 | python main.py q2 --exp_name exp 14 | 15 | ########### 16 | ### Q3a ### 17 | ########### 18 | 19 | python main.py q3 --exp_name default 20 | python plot.py --exps HalfCheetah_q3_default --save HalfCheetah_q3_default 21 | 22 | ########### 23 | ### Q3b ### 24 | ########### 25 | 26 | python main.py q3 --exp_name action128 --num_random_action_selection 128 27 | python main.py q3 --exp_name action4096 --num_random_action_selection 4096 28 | python main.py q3 --exp_name action16384 --num_random_action_selection 16384 29 | python plot.py --exps HalfCheetah_q3_action128 HalfCheetah_q3_action4096 HalfCheetah_q3_action16384 --save HalfCheetah_q3_actions 30 | 31 | python main.py q3 --exp_name horizon10 --mpc_horizon 10 32 | python main.py q3 --exp_name horizon15 --mpc_horizon 15 33 | python main.py q3 --exp_name horizon20 --mpc_horizon 20 34 | python plot.py --exps HalfCheetah_q3_horizon10 HalfCheetah_q3_horizon15 HalfCheetah_q3_horizon20 --save HalfCheetah_q3_mpc_horizon 35 | 36 | python main.py q3 --exp_name layers1 --nn_layers 1 37 | python main.py q3 --exp_name layers2 --nn_layers 2 38 | python main.py q3 --exp_name layers3 --nn_layers 3 39 | python plot.py --exps HalfCheetah_q3_layers1 
HalfCheetah_q3_layers2 HalfCheetah_q3_layers3 --save HalfCheetah_q3_nn_layers 40 | -------------------------------------------------------------------------------- /hw4/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | from collections import defaultdict 3 | 4 | class TimeIt(object): 5 | def __init__(self, prefix=''): 6 | self.prefix = prefix 7 | self.start_times = dict() 8 | self.elapsed_times = defaultdict(int) 9 | 10 | def start(self, name): 11 | assert(name not in self.start_times) 12 | self.start_times[name] = time.time() 13 | 14 | def stop(self, name): 15 | assert(name in self.start_times) 16 | self.elapsed_times[name] += time.time() - self.start_times[name] 17 | self.start_times.pop(name) 18 | 19 | def elapsed(self, name): 20 | return self.elapsed_times[name] 21 | 22 | def reset(self): 23 | self.start_times = dict() 24 | self.elapsed_times = defaultdict(int) 25 | 26 | def __str__(self): 27 | s = '' 28 | names_elapsed = sorted(self.elapsed_times.items(), key=lambda x: x[1], reverse=True) 29 | for name, elapsed in names_elapsed: 30 | if 'total' not in self.elapsed_times: 31 | s += '{0}: {1: <10} {2:.1f}\n'.format(self.prefix, name, elapsed) 32 | else: 33 | assert(self.elapsed_times['total'] >= max(self.elapsed_times.values())) 34 | pct = 100. * elapsed / self.elapsed_times['total'] 35 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, name, elapsed, pct) 36 | if 'total' in self.elapsed_times: 37 | times_summed = sum([t for k, t in self.elapsed_times.items() if k != 'total']) 38 | other_time = self.elapsed_times['total'] - times_summed 39 | assert(other_time >= 0) 40 | pct = 100. * other_time / self.elapsed_times['total'] 41 | s += '{0}: {1: <10} {2:.1f} ({3:.1f}%)\n'.format(self.prefix, 'other', other_time, pct) 42 | return s 43 | 44 | timeit = TimeIt() 45 | -------------------------------------------------------------------------------- /hw5/exp/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5a: Exploration 2 | 3 | Dependencies: 4 | * Python **3.5** 5 | * Numpy version **1.14.5** 6 | * TensorFlow version **1.10.5** 7 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 8 | * seaborn 9 | * tqdm==**4.26.0** 10 | 11 | Before doing anything, first replace `gym/envs/mujoco/half_cheetah.py` with the provided `sparse_half_cheetah.py` file. It is always a good idea to keep a copy of the original `gym/envs/mujoco/half_cheetah.py` just in case you need it for something else. 12 | 13 | You will implement `density_model.py`, `exploration.py`, and `train_ac_exploration_f18.py`. 14 | 15 | See the hw5a.pdf in this folder for further instructions. 16 | . 
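For the `half_cheetah.py` replacement step above, a minimal sketch (assuming `gym` is installed in the currently active environment; the path lookup is just one convenient way to locate it and is not part of the assignment code):

`GYM_MUJOCO=$(python -c "import gym, os; print(os.path.join(os.path.dirname(gym.__file__), 'envs', 'mujoco'))")`

`cp "$GYM_MUJOCO/half_cheetah.py" "$GYM_MUJOCO/half_cheetah.py.orig"` (keeps a copy of the original, as recommended above)

`cp sparse_half_cheetah.py "$GYM_MUJOCO/half_cheetah.py"`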
17 | -------------------------------------------------------------------------------- /hw5/exp/ex_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def build_mlp(input_placeholder, output_size, scope, n_layers, size, activation=tf.tanh, output_activation=None): 4 | """ 5 | Builds a feedforward neural network 6 | 7 | arguments: 8 | input_placeholder: placeholder variable for the state (batch_size, input_size) 9 | output_size: size of the output layer 10 | scope: variable scope of the network 11 | n_layers: number of hidden layers 12 | size: dimension of the hidden layer 13 | activation: activation of the hidden layers 14 | output_activation: activation of the ouput layers 15 | 16 | returns: 17 | output placeholder of the network (the result of a forward pass) 18 | 19 | Hint: use tf.layers.dense 20 | """ 21 | output_placeholder = input_placeholder 22 | with tf.variable_scope(scope): 23 | for _ in range(n_layers): 24 | output_placeholder = tf.layers.dense(output_placeholder, size, activation=activation) 25 | output_placeholder = tf.layers.dense(output_placeholder, output_size, activation=output_activation) 26 | return output_placeholder -------------------------------------------------------------------------------- /hw5/exp/hw5a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/exp/hw5a.pdf -------------------------------------------------------------------------------- /hw5/exp/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! 
Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/exp/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. 
Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | 55 | sns.set(style="darkgrid", font_scale=1.5) 56 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 57 | plt.legend(loc='best').draggable() 58 | # plt.legend(loc='best', bbox_to_anchor=(1, 1), fontsize=8).draggable() 59 | plt.show() 60 | 61 | 62 | def get_datasets(fpath, condition=None): 63 | unit = 0 64 | datasets = [] 65 | for root, dir, files in os.walk(fpath): 66 | if 'log.txt' in files: 67 | param_path = open(os.path.join(root,'params.json')) 68 | params = json.load(param_path) 69 | exp_name = params['exp_name'] 70 | 71 | log_path = os.path.join(root,'log.txt') 72 | experiment_data = pd.read_table(log_path) 73 | 74 | experiment_data.insert( 75 | len(experiment_data.columns), 76 | 'Unit', 77 | unit 78 | ) 79 | experiment_data.insert( 80 | len(experiment_data.columns), 81 | 'Condition', 82 | condition or exp_name 83 | ) 84 | 85 | datasets.append(experiment_data) 86 | unit += 1 87 | 88 | return datasets 89 | 90 | 91 | def main(): 92 | import argparse 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('logdir', nargs='*') 95 | parser.add_argument('--legend', nargs='*') 96 | parser.add_argument('--value', default='AverageReturn', nargs='*') 97 | args = parser.parse_args() 98 | 99 | use_legend = False 100 | if args.legend is not None: 101 | assert len(args.legend) == len(args.logdir), \ 102 | "Must give a legend title for each set of experiments." 
103 | use_legend = True 104 | 105 | data = [] 106 | if use_legend: 107 | for logdir, legend_title in zip(args.logdir, args.legend): 108 | data += get_datasets(logdir, legend_title) 109 | else: 110 | for logdir in args.logdir: 111 | data += get_datasets(logdir) 112 | 113 | if isinstance(args.value, list): 114 | values = args.value 115 | else: 116 | values = [args.value] 117 | for value in values: 118 | plot_data(data, value=value) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw5/exp/replay.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import copy 4 | 5 | class Replay_Buffer(object): 6 | def __init__(self, max_size=np.inf): 7 | self.memory = [] 8 | self.max_size = int(max_size) 9 | 10 | def adjust_size(self): 11 | if len(self.memory) > self.max_size: 12 | diff = int(len(self.memory) - self.max_size) 13 | self.memory = self.memory[:-diff] # FIFO 14 | print('Adjusted replay size') 15 | 16 | def prepend(self, x): 17 | # assume x is a list of states 18 | self.memory = list(x) + self.memory 19 | self.adjust_size() 20 | 21 | def sample(self, batch_size): 22 | random_batch = random.sample(self.memory, batch_size) 23 | return random_batch 24 | 25 | def __len__(self): 26 | return len(self.memory) 27 | 28 | def __getitem__(self, indices): 29 | return copy.deepcopy(np.array([self.memory[i] for i in indices])) 30 | 31 | def get_memory(self): 32 | return copy.deepcopy(self.memory) 33 | 34 | def clear_buffer(self): 35 | del self.memory[:] -------------------------------------------------------------------------------- /hw5/exp/requirements.txt: -------------------------------------------------------------------------------- 1 | gym==0.10.5 2 | mujoco-py==1.50.1.56 3 | tensorflow 4 | numpy 5 | seaborn 6 | tqdm -------------------------------------------------------------------------------- /hw5/exp/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ########################## 4 | ### P1 Hist PointMass ### 5 | ########################## 6 | 7 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model none -s 8 --exp_name PM_bc0_s8 8 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model hist -bc 0.01 -s 8 --exp_name PM_hist_bc0.01_s8 9 | 10 | ########################## 11 | ### P2 RBF PointMass ### 12 | ########################## 13 | 14 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model rbf -bc 0.01 -s 8 -sig 0.2 --exp_name PM_rbf_bc0.01_s8_sig0.2 15 | 16 | ########################## 17 | ### P3 EX2 PointMass ### 18 | ########################## 19 | 20 | python train_ac_exploration_f18.py PointMass-v0 -n 100 -b 1000 -e 3 --density_model ex2 -s 8 -bc 0.05 -kl 0.1 -dlr 0.001 -dh 8 -dti 1000 --exp_name PM_ex2_s8_bc0.05_kl0.1_dlr0.001_dh8_dti1000 21 | 22 | ########################### 23 | ### P4 HalfCheetah ### 24 | ########################### 25 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model none --exp_name HC_bc0 26 | python train_ac_exploration_f18.py HalfCheetah-v2 -ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.001 -kl 0.1 -dlr 0.005 -dti 1000 --exp_name HC_bc0.001_kl0.1_dlr0.005_dti1000 27 | python train_ac_exploration_f18.py HalfCheetah-v2 
-ep 150 --discount 0.9 -n 100 -e 3 -l 2 -s 32 -b 30000 -lr 0.02 --density_model ex2 -bc 0.0001 -kl 0.1 -dlr 0.005 -dti 10000 --exp_name HC_bc0.0001_kl0.1_dlr0.005_dti10000 28 | -------------------------------------------------------------------------------- /hw5/exp/sparse_half_cheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | def __init__(self): 7 | mujoco_env.MujocoEnv.__init__(self, 'half_cheetah.xml', 5) 8 | utils.EzPickle.__init__(self) 9 | 10 | def step(self, action): 11 | ################################################# 12 | ctrl = False 13 | relu = False 14 | threshold = 10.0 15 | ################################################# 16 | xposbefore = self.sim.data.qpos[0] 17 | self.do_simulation(action, self.frame_skip) 18 | xposafter = self.sim.data.qpos[0] 19 | ob = self._get_obs() 20 | # reward_ctrl = - 0.1 * np.square(action).sum() 21 | # reward_run = (xposafter - xposbefore)/self.dt 22 | ################################################# 23 | if ctrl: 24 | reward_ctrl = - 0.1 * np.square(action).sum() 25 | else: 26 | reward_ctrl = 0 27 | if abs(xposafter) <= threshold: 28 | reward_run = 0.0 29 | else: 30 | if relu: 31 | reward_run = np.sign(xposafter)*(xposafter - xposbefore)/self.dt 32 | else: 33 | reward_run = 1.0 34 | ################################################# 35 | reward = reward_ctrl + reward_run 36 | done = False 37 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 38 | 39 | def _get_obs(self): 40 | return np.concatenate([ 41 | self.sim.data.qpos.flat[1:], 42 | self.sim.data.qvel.flat, 43 | ]) 44 | 45 | def reset_model(self): 46 | qpos = self.init_qpos + self.np_random.uniform(low=-.1, high=.1, size=self.model.nq) 47 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 48 | self.set_state(qpos, qvel) 49 | return self._get_obs() 50 | 51 | def viewer_setup(self): 52 | self.viewer.cam.distance = self.model.stat.extent * 0.5 53 | -------------------------------------------------------------------------------- /hw5/meta/Deep_RL_Assignment_5__Meta_Reinforcement_Learning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/Deep_RL_Assignment_5__Meta_Reinforcement_Learning.pdf -------------------------------------------------------------------------------- /hw5/meta/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5c: Meta-Learning 2 | 3 | Dependencies: 4 | 5 | * Python **3.5** 6 | * Numpy version 1.14.5 7 | * TensorFlow version 1.10.5 8 | * MuJoCo version **1.50** and mujoco-py **1.50.1.56** 9 | * OpenAI Gym version **0.10.5** 10 | * seaborn 11 | * Box2D==2.3.2 12 | 13 | Instructions: [HW5c PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5c.pdf) 14 | 15 | ### 1. Problem1 Context as Task ID 16 | 17 | Run the following command: 18 | 19 | `python train_policy.py 'pm-obs' --exp_name --history 1 -lr 5e-5 -n 200 --num_tasks 4` 20 | 21 | ### 2. 
Problem2 Meta-Learned Context 22 | 23 | Run the following command: 24 | 25 | **With MLP model** 26 | 27 | `python train_policy.py 'pm' --exp_name --history --discount 0.90 -lr 5e-4 -n 60` 28 | 29 | 30 | **With RNN model** 31 | 32 | `python train_policy.py 'pm' --exp_name --history --discount 0.90 -lr 5e-4 -n 60 --recurrent` 33 | 34 | ### 3. Problem3 Generalization 35 | 36 | Run the following command: 37 | 38 | `python train_policy.py 'pm' --exp_name --history --discount 0.90 -lr 5e-4 -n 60 --recurrent --generalized --granularity ` 39 | 40 | if `--generalized`, the training goals and testing goals will be chosen from chessboard space where 1 corresponds to testing goals and 0 corresponds to training goals. The size of pattern in chessboard is defined by `--granularity`. The value can be chosen from the list `[1,2,4,5,10]` to construct a balanced chessboard. 41 | 42 | -------------------------------------------------------------------------------- /hw5/meta/data/mlp_1_pm_13-11-2018_20-57-59/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_1", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_1_pm_13-11-2018_20-57-59/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/mlp_30_pm_13-11-2018_20-48-55/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_30", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 30, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_30_pm_13-11-2018_20-48-55/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/mlp_50_pm_14-11-2018_20-05-53/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | 
"exp_name" : "mlp_50", 4 | "gamma" : 0.9, 5 | "generalized" : false, 6 | "granularity" : 1, 7 | "gru_size" : 32, 8 | "history" : 50, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/mlp_50_pm_14-11-2018_20-05-53/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : false, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/mlp_60_pm_13-11-2018_23-01-39/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_60", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 60, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_60_pm_13-11-2018_23-01-39/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/pro1_pm-obs_13-11-2018_01-08-37/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm-obs", 3 | "exp_name" : "pro1", 4 | "gamma" : 0.99, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 5e-05, 9 | "logdir" : "data/pro1_pm-obs_13-11-2018_01-08-37/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 2500, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 200, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 4, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/params.json: 
-------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_1", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 1, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_2", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 2, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_4", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 4, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/rnn_1_pm_13-11-2018_21-05-16/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_1", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_1_pm_13-11-2018_21-05-16/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/rnn_30_pm_13-11-2018_19-34-21/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_30", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 30, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_30_pm_13-11-2018_19-34-21/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/rnn_50_pm_14-11-2018_10-34-08/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_50", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 50, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_50_pm_14-11-2018_10-34-08/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- 
/hw5/meta/data/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data/rnn_60_pm_13-11-2018_17-27-20/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_60", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 60, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_60_pm_13-11-2018_17-27-20/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro1/pro1_pm-obs_13-11-2018_01-08-37/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm-obs", 3 | "exp_name" : "pro1", 4 | "gamma" : 0.99, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 5e-05, 9 | "logdir" : "data/pro1_pm-obs_13-11-2018_01-08-37/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 2500, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 200, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 4, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro1/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro1/pro1_pm-obs_13-11-2018_01-08-37/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro1/prob1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro1/prob1.png -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/mlp_1_pm_13-11-2018_20-57-59/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_1", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : 
"data/mlp_1_pm_13-11-2018_20-57-59/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_1/mlp_1_pm_13-11-2018_20-57-59/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/pro2_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_1/pro2_1.png -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/rnn_1_pm_13-11-2018_21-05-16/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_1", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 1, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_1_pm_13-11-2018_21-05-16/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_1/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_1/rnn_1_pm_13-11-2018_21-05-16/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/mlp_30_pm_13-11-2018_20-48-55/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_30", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 30, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_30_pm_13-11-2018_20-48-55/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_30/mlp_30_pm_13-11-2018_20-48-55/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/prob_30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_30/prob_30.png -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/rnn_30_pm_13-11-2018_19-34-21/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_30", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 30, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_30_pm_13-11-2018_19-34-21/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_30/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_30/rnn_30_pm_13-11-2018_19-34-21/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_50/mlp_50_pm_14-11-2018_20-05-53/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_50", 4 | "gamma" : 0.9, 5 | "generalized" : false, 6 | "granularity" : 1, 7 | "gru_size" : 32, 8 | "history" : 50, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/mlp_50_pm_14-11-2018_20-05-53/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : false, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_50/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_50/mlp_50_pm_14-11-2018_20-05-53/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_50/pro2_50.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_50/pro2_50.png -------------------------------------------------------------------------------- 
/hw5/meta/data_pro2_50/rnn_50_pm_14-11-2018_10-34-08/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_50", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 50, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_50_pm_14-11-2018_10-34-08/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_50/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_50/rnn_50_pm_14-11-2018_10-34-08/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/mlp_60_pm_13-11-2018_23-01-39/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "mlp_60", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 60, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/mlp_60_pm_13-11-2018_23-01-39/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : false, 21 | "seed" : 1, 22 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_60/mlp_60_pm_13-11-2018_23-01-39/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/pro2_60.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_60/pro2_60.png -------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/rnn_60_pm_13-11-2018_17-27-20/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "rnn_60", 4 | "gamma" : 0.9, 5 | "gru_size" : 32, 6 | "history" : 60, 7 | "l2reg" : false, 8 | "learning_rate" : 0.0005, 9 | "logdir" : "data/rnn_60_pm_13-11-2018_17-27-20/1", 10 | "max_path_length" : 20, 11 | "min_timesteps_per_batch" : 10000, 12 | "mini_batch_size" : 64, 13 | "n_iter" : 60, 14 | "n_layers" : 1, 15 | "nn_critic" : false, 16 | "normalize_advantages" : true, 17 | "num_ppo_updates" : 780, 18 | "num_tasks" : 1, 19 | "num_value_iters" : 1, 20 | "recurrent" : true, 21 | "seed" : 1, 22 | "size" : 64} 
-------------------------------------------------------------------------------- /hw5/meta/data_pro2_60/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro2_60/rnn_60_pm_13-11-2018_17-27-20/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_1", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 1, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_avg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_avg.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_val.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_1_pm_15-11-2018_01-30-55/g_1_val.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_2", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 2, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : 
"data/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/1/vars.pkl -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_avg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_avg.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_val.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_2_pm_14-11-2018_16-22-59/g_2_val.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/params.json: -------------------------------------------------------------------------------- 1 | {"animate" : false, 2 | "env_name" : "pm", 3 | "exp_name" : "pro3_rnn_60_g_4", 4 | "gamma" : 0.9, 5 | "generalized" : true, 6 | "granularity" : 4, 7 | "gru_size" : 32, 8 | "history" : 60, 9 | "l2reg" : false, 10 | "learning_rate" : 0.0005, 11 | "logdir" : "data/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1", 12 | "max_path_length" : 20, 13 | "min_timesteps_per_batch" : 10000, 14 | "mini_batch_size" : 64, 15 | "n_iter" : 60, 16 | "n_layers" : 1, 17 | "nn_critic" : false, 18 | "normalize_advantages" : true, 19 | "num_ppo_updates" : 780, 20 | "num_tasks" : 1, 21 | "num_value_iters" : 1, 22 | "recurrent" : true, 23 | "seed" : 1, 24 | "size" : 64} -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/1/vars.pkl -------------------------------------------------------------------------------- 
/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_avg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_avg.png -------------------------------------------------------------------------------- /hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_val.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/buaazhangfan/CS294-112-Deep-Reinforcement-Learning/5b9604ef6102e1f6d11afee444fb46937ae53363/hw5/meta/data_pro3/pro3_rnn_60_g_4_pm_15-11-2018_01-34-18/g_4_val.png -------------------------------------------------------------------------------- /hw5/meta/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, separators=(',\n','\t:\t'), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/meta/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
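For example: python plot.py data/test1 data/test2 --legend legend1 legend2 plots both runs with one custom legend title per logdir, given in the same order as the logdirs (the plotted statistic defaults to AverageReturn).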
48 | 49 | """ 50 | 51 | def plot_data(data, value="AverageReturn"): 52 | if isinstance(data, list): 53 | data = pd.concat(data, ignore_index=True) 54 | sns.set(style="darkgrid", font_scale=1.5) 55 | sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition") 56 | 57 | plt.legend(loc='best').draggable() 58 | #plt.savefig('1.png') 59 | plt.show() 60 | 61 | 62 | def get_datasets(fpath, condition=None): 63 | unit = 0 64 | datasets = [] 65 | for root, dir, files in os.walk(fpath): 66 | if 'log.txt' in files: 67 | param_path = open(os.path.join(root,'params.json')) 68 | params = json.load(param_path) 69 | exp_name = params['exp_name'] 70 | 71 | log_path = os.path.join(root,'log.txt') 72 | experiment_data = pd.read_table(log_path) 73 | 74 | experiment_data.insert( 75 | len(experiment_data.columns), 76 | 'Unit', 77 | unit 78 | ) 79 | experiment_data.insert( 80 | len(experiment_data.columns), 81 | 'Condition', 82 | condition or exp_name 83 | ) 84 | 85 | datasets.append(experiment_data) 86 | unit += 1 87 | 88 | return datasets 89 | 90 | 91 | def main(): 92 | import argparse 93 | parser = argparse.ArgumentParser() 94 | parser.add_argument('logdir', nargs='*') 95 | parser.add_argument('--legend', nargs='*') 96 | parser.add_argument('--value', default='AverageReturn', nargs='*') 97 | args = parser.parse_args() 98 | 99 | use_legend = False 100 | if args.legend is not None: 101 | assert len(args.legend) == len(args.logdir), \ 102 | "Must give a legend title for each set of experiments." 103 | use_legend = True 104 | 105 | data = [] 106 | if use_legend: 107 | for logdir, legend_title in zip(args.logdir, args.legend): 108 | data += get_datasets(logdir, legend_title) 109 | else: 110 | for logdir in args.logdir: 111 | data += get_datasets(logdir) 112 | 113 | if isinstance(args.value, list): 114 | values = args.value 115 | else: 116 | values = [args.value] 117 | for value in values: 118 | plot_data(data, value=value) 119 | 120 | if __name__ == "__main__": 121 | main() 122 | -------------------------------------------------------------------------------- /hw5/meta/point_mass.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from gym import Env 4 | 5 | 6 | class PointEnv(Env): 7 | """ 8 | point mass on a 2-D plane 9 | goals are sampled randomly from a square 10 | """ 11 | 12 | def __init__(self, num_tasks=1): 13 | self.reset_task() 14 | self.reset() 15 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,)) 16 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,)) 17 | 18 | 19 | def reset_task(self, generalized=False, granularity=1, is_evaluation=False): 20 | ''' 21 | sample a new task randomly 22 | 23 | Problem 3: make training and evaluation goals disjoint sets 24 | if `is_evaluation` is true, sample from the evaluation set, 25 | otherwise sample from the training set 26 | ''' 27 | #====================================================================================# 28 | # ----------PROBLEM 3---------- 29 | #====================================================================================# 30 | # YOUR CODE HERE 31 | # Construct the chessboard space with 20 x 20 32 | # The granularity is the size of squares, the value can be chosen from [1, 2, 4, 5, 10] 33 | if generalized: 34 | print("Problem 3...") 35 | print("The size of square is ", granularity) 36 | size = int(20 / granularity) 37 | space = np.zeros((size, size)) 38 | space[1::2,::2] = 1 39 | 
space[::2,1::2] = 1 40 | if is_evaluation: 41 | dataset = np.where(space == 1) 42 | else: 43 | dataset = np.where(space == 0) 44 | 45 | dataset = np.asarray(dataset).T 46 | nums = dataset.shape[0] 47 | idx = np.random.randint(0, nums) 48 | if is_evaluation: 49 | print("Evaluation") 50 | else: 51 | print("training") 52 | 53 | goal = dataset[idx] 54 | goal[0] = goal[0] * granularity 55 | goal[1] = goal[1] * granularity 56 | 57 | x = np.random.uniform(goal[0], goal[0] + granularity) - 10 58 | y = np.random.uniform(goal[1], goal[1] + granularity) - 10 59 | print((x, y)) 60 | else: 61 | #print("Problem 2...") 62 | x = np.random.uniform(-10, 10) 63 | y = np.random.uniform(-10, 10) 64 | 65 | self._goal = np.array([x, y]) 66 | 67 | #x = np.random.uniform(-10, 10) 68 | #y = np.random.uniform(-10, 10) 69 | #self._goal = np.array([x, y]) 70 | 71 | def reset(self): 72 | self._state = np.array([0, 0], dtype=np.float32) 73 | return self._get_obs() 74 | 75 | def _get_obs(self): 76 | return np.copy(self._state) 77 | 78 | def reward_function(self, x, y): 79 | return - (x ** 2 + y ** 2) ** 0.5 80 | 81 | def step(self, action): 82 | x, y = self._state 83 | # compute reward, add penalty for large actions instead of clipping them 84 | x -= self._goal[0] 85 | y -= self._goal[1] 86 | # check if task is complete 87 | done = abs(x) < .01 and abs(y) < .01 88 | reward = self.reward_function(x, y) 89 | # move to next state 90 | self._state = self._state + action 91 | ob = self._get_obs() 92 | return ob, reward, done, dict() 93 | 94 | def viewer_setup(self): 95 | print('no viewer') 96 | pass 97 | 98 | def render(self): 99 | print('current state:', self._state) 100 | 101 | def seed(self, seed): 102 | np.random.seed = seed 103 | -------------------------------------------------------------------------------- /hw5/meta/point_mass_observed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from gym import Env 4 | 5 | 6 | class ObservedPointEnv(Env): 7 | """ 8 | point mass on a 2-D plane 9 | four tasks: move to (-10, -10), (-10, 10), (10, -10), (10, 10) 10 | 11 | Problem 1: augment the observation with a one-hot vector encoding the task ID 12 | - change the dimension of the observation space 13 | - augment the observation with a one-hot vector that encodes the task ID 14 | """ 15 | #====================================================================================# 16 | # ----------PROBLEM 1---------- 17 | #====================================================================================# 18 | # YOUR CODE SOMEWHERE HERE 19 | def __init__(self, num_tasks=1): 20 | self.tasks = [0, 1, 2, 3][:num_tasks] 21 | self.task_idx = -1 22 | #self.num_tasks = num_tasks 23 | self.reset_task() 24 | self.reset() 25 | 26 | self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2 + num_tasks,)) 27 | self.action_space = spaces.Box(low=-0.1, high=0.1, shape=(2,)) 28 | 29 | def reset_task(self, generalized=False, granularity=1, is_evaluation=False): 30 | # for evaluation, cycle deterministically through all tasks 31 | if is_evaluation: 32 | self.task_idx = (self.task_idx + 1) % len(self.tasks) 33 | # during training, sample tasks randomly 34 | else: 35 | self.task_idx = np.random.randint(len(self.tasks)) 36 | self._task = self.tasks[self.task_idx] 37 | goals = [[-1, -1], [-1, 1], [1, -1], [1, 1]] 38 | self._goal = np.array(goals[self.task_idx])*10 39 | 40 | def reset(self): 41 | self._state = np.array([0, 0], dtype=np.float32) 42 | return 
self._get_obs() 43 | 44 | def _get_obs(self): 45 | one_hot = np.zeros(len(self.tasks)) 46 | one_hot[self._task] = 1 47 | 48 | return np.concatenate((np.copy(self._state), one_hot)) 49 | 50 | def step(self, action): 51 | x, y = self._state 52 | # compute reward, add penalty for large actions instead of clipping them 53 | x -= self._goal[0] 54 | y -= self._goal[1] 55 | reward = - (x ** 2 + y ** 2) ** 0.5 56 | # check if task is complete 57 | done = abs(x) < 0.01 and abs(y) < 0.01 58 | # move to next state 59 | self._state = self._state + action 60 | ob = self._get_obs() 61 | 62 | return ob, reward, done, dict() 63 | 64 | def viewer_setup(self): 65 | print('no viewer') 66 | pass 67 | 68 | def render(self): 69 | print('current state:', self._state) 70 | 71 | def seed(self, seed): 72 | np.random.seed = seed 73 | -------------------------------------------------------------------------------- /hw5/meta/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class ReplayBuffer(object): 4 | ''' 5 | minimalistic replay buffer 6 | 7 | a sample consists of 8 | - observation 9 | - action 10 | - reward 11 | - terminal 12 | - hidden state for recurrent policy 13 | 14 | it is memory inefficient to store windowed observations this way 15 | so do not run on tasks with large observations (e.g. from vision) 16 | ''' 17 | 18 | def __init__(self, max_size, ob_dim, ac_dim, hidden_dim, task_dim): 19 | self.max_size = max_size 20 | self.ob_dim = ob_dim 21 | self.ac_dim = ac_dim 22 | self.hidden_dim = hidden_dim 23 | self.task_dim = task_dim 24 | self.flush() 25 | 26 | def flush(self): 27 | ''' 28 | set buffer to empty 29 | ''' 30 | self._observations = np.zeros((self.max_size, *self.ob_dim)) 31 | self._actions = np.zeros((self.max_size, *self.ac_dim)) 32 | self._rewards = np.zeros((self.max_size, 1)) 33 | self._terminals = np.zeros((self.max_size, 1)) 34 | self._hiddens = np.zeros((self.max_size, self.hidden_dim)) 35 | self._tasks = np.zeros((self.max_size, self.task_dim)) 36 | self._top = 0 37 | self._size = 0 38 | 39 | def _advance(self): 40 | ''' 41 | move pointer to top of buffer 42 | if end of buffer is reached, overwrite oldest data 43 | ''' 44 | self._top = (self._top + 1) % self.max_size 45 | if self._size < self.max_size: 46 | self._size += 1 47 | 48 | def add_sample(self, ob, ac, re, te, hi, task): 49 | ''' 50 | add sample to buffer 51 | ''' 52 | self._observations[self._top] = ob 53 | self._actions[self._top] = ac 54 | self._rewards[self._top] = re 55 | self._terminals[self._top] = te 56 | self._hiddens[self._top] = hi 57 | self._tasks[self._top] = task 58 | 59 | self._advance() 60 | 61 | def get_samples(self, indices): 62 | ''' 63 | return buffer data indexed by `indices` 64 | ''' 65 | return dict( 66 | observations=self._observations[indices], 67 | actions=self._actions[indices], 68 | rewards=self._rewards[indices], 69 | terminals=self._terminals[indices], 70 | hiddens=self._hiddens[indices], 71 | tasks=self._tasks[indices], 72 | ) 73 | 74 | def random_batch(self, batch_size): 75 | ''' 76 | return random sample of `batch_size` transitions 77 | ''' 78 | indices = np.random.randint(0, self._size, batch_size) 79 | return self.get_samples(indices) 80 | 81 | def all_batch(self): 82 | ''' 83 | return all data in the buffer 84 | ''' 85 | indices = list(range(self._size)) 86 | return self.get_samples(indices) 87 | 88 | def num_steps_can_sample(self): 89 | return self._size 90 | 91 | 92 | 93 | class PPOReplayBuffer(object): 94 | ''' 95 
| replay buffer for PPO algorithm 96 | store fixed log probs, advantages, and returns for use in multiple updates 97 | 98 | n.b. samples must be added as a batch, and we assume that the 99 | batch is the same size as that of the simple buffer 100 | ''' 101 | 102 | def __init__(self, simple_buffer): 103 | self.simple_buffer = simple_buffer 104 | self.max_size = self.simple_buffer.max_size 105 | self.flush() 106 | 107 | def flush(self): 108 | self.simple_buffer.flush() 109 | self._log_probs = np.zeros((self.max_size, 1)) 110 | self._advantages = np.zeros((self.max_size, 1)) 111 | self._returns = np.zeros((self.max_size, 1)) 112 | 113 | def add_samples(self, lp, adv, ret): 114 | self._log_probs = lp 115 | self._advantages = adv 116 | self._returns = ret 117 | 118 | def get_samples(self, indices): 119 | return dict( 120 | log_probs = self._log_probs[indices], 121 | advantages = self._advantages[indices], 122 | returns = self._returns[indices], 123 | ) 124 | 125 | def random_batch(self, batch_size): 126 | indices = np.random.randint(0, self.simple_buffer._size, batch_size) 127 | simple = self.simple_buffer.get_samples(indices) 128 | ppo = self.get_samples(indices) 129 | return {**simple, **ppo} 130 | -------------------------------------------------------------------------------- /hw5/meta/requirements.txt: -------------------------------------------------------------------------------- 1 | mujoco-py==1.50.1.56 2 | gym==0.10.5 3 | tensorflow==1.10.0 4 | numpy==1.14.5 5 | scipy==1.1.0 6 | tensorflow-probability==0.3.0 7 | seaborn 8 | Box2D==2.3.2 9 | -------------------------------------------------------------------------------- /hw5/sac/README.md: -------------------------------------------------------------------------------- 1 | # CS294-112 HW 5b: Soft Actor Critic 2 | Original code from Tuomas Haarnoja, Soroush Nasiriany, and Aurick Zhou for CS294-112 Fall 2018 3 | 4 | Dependencies: 5 | * Python **3.4.5** 6 | * Numpy version **1.15.2** 7 | * TensorFlow version **1.10.0** 8 | * tensorflow-probability version **0.4.0** 9 | * OpenAI Gym version **0.10.8** 10 | * MuJoCo version **1.50** and mujoco-py **1.50.1.59** 11 | * seaborn version **0.9.0** 12 | 13 | You will implement `sac.py`, and `nn.py`. 14 | 15 | See the [HW5 PDF](http://rail.eecs.berkeley.edu/deeprlcourse/static/homeworks/hw5b.pdf) for further instructions. 16 | -------------------------------------------------------------------------------- /hw5/sac/environment.yml: -------------------------------------------------------------------------------- 1 | name: hw5-sac 2 | dependencies: 3 | - python==3.4.5 4 | - pip: 5 | - gym==0.10.8 6 | - numpy==1.15.2 7 | - tensorflow==1.10.0 8 | - tensorflow-probability==0.4.0 9 | - mujoco-py==1.50.1.59 10 | - seaborn==0.9.0 11 | -------------------------------------------------------------------------------- /hw5/sac/logz.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | """ 4 | 5 | Some simple logging functionality, inspired by rllab's logging. 
6 | Assumes that each diagnostic gets logged each iteration 7 | 8 | Call logz.configure_output_dir() to start logging to a 9 | tab-separated-values file (some_folder_name/log.txt) 10 | 11 | To load the learning curves, you can do, for example 12 | 13 | A = np.genfromtxt('/tmp/expt_1468984536/log.txt',delimiter='\t',dtype=None, names=True) 14 | A['EpRewMean'] 15 | 16 | """ 17 | 18 | import os.path as osp, shutil, time, atexit, os, subprocess 19 | import pickle 20 | import tensorflow as tf 21 | 22 | color2num = dict( 23 | gray=30, 24 | red=31, 25 | green=32, 26 | yellow=33, 27 | blue=34, 28 | magenta=35, 29 | cyan=36, 30 | white=37, 31 | crimson=38 32 | ) 33 | 34 | def colorize(string, color, bold=False, highlight=False): 35 | attr = [] 36 | num = color2num[color] 37 | if highlight: num += 10 38 | attr.append(str(num)) 39 | if bold: attr.append('1') 40 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 41 | 42 | class G: 43 | output_dir = None 44 | output_file = None 45 | first_row = True 46 | log_headers = [] 47 | log_current_row = {} 48 | 49 | def configure_output_dir(d=None): 50 | """ 51 | Set output directory to d, or to /tmp/somerandomnumber if d is None 52 | """ 53 | G.output_dir = d or "/tmp/experiments/%i"%int(time.time()) 54 | assert not osp.exists(G.output_dir), "Log dir %s already exists! Delete it first or use a different dir"%G.output_dir 55 | os.makedirs(G.output_dir) 56 | G.output_file = open(osp.join(G.output_dir, "log.txt"), 'w') 57 | atexit.register(G.output_file.close) 58 | print(colorize("Logging data to %s"%G.output_file.name, 'green', bold=True)) 59 | 60 | def log_tabular(key, val): 61 | """ 62 | Log a value of some diagnostic 63 | Call this once for each diagnostic quantity, each iteration 64 | """ 65 | if G.first_row: 66 | G.log_headers.append(key) 67 | else: 68 | assert key in G.log_headers, "Trying to introduce a new key %s that you didn't include in the first iteration"%key 69 | assert key not in G.log_current_row, "You already set %s this iteration. 
Maybe you forgot to call dump_tabular()"%key 70 | G.log_current_row[key] = val 71 | 72 | def save_params(params): 73 | with open(osp.join(G.output_dir, "params.json"), 'w') as out: 74 | out.write(json.dumps(params, indent=2, separators=(',', ': '), sort_keys=True)) 75 | 76 | def pickle_tf_vars(): 77 | """ 78 | Saves tensorflow variables 79 | Requires them to be initialized first, also a default session must exist 80 | """ 81 | _dict = {v.name : v.eval() for v in tf.global_variables()} 82 | with open(osp.join(G.output_dir, "vars.pkl"), 'wb') as f: 83 | pickle.dump(_dict, f) 84 | 85 | 86 | def dump_tabular(): 87 | """ 88 | Write all of the diagnostics from the current iteration 89 | """ 90 | vals = [] 91 | key_lens = [len(key) for key in G.log_headers] 92 | max_key_len = max(15,max(key_lens)) 93 | keystr = '%'+'%d'%max_key_len 94 | fmt = "| " + keystr + "s | %15s |" 95 | n_slashes = 22 + max_key_len 96 | print("-"*n_slashes) 97 | for key in G.log_headers: 98 | val = G.log_current_row.get(key, "") 99 | if hasattr(val, "__float__"): valstr = "%8.3g"%val 100 | else: valstr = val 101 | print(fmt%(key, valstr)) 102 | vals.append(val) 103 | print("-"*n_slashes) 104 | if G.output_file is not None: 105 | if G.first_row: 106 | G.output_file.write("\t".join(G.log_headers)) 107 | G.output_file.write("\n") 108 | G.output_file.write("\t".join(map(str,vals))) 109 | G.output_file.write("\n") 110 | G.output_file.flush() 111 | G.log_current_row.clear() 112 | G.first_row=False 113 | -------------------------------------------------------------------------------- /hw5/sac/nn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from tensorflow.keras import layers 4 | from tensorflow_probability import distributions 5 | from tensorflow.python import keras 6 | from tensorflow.python.keras.engine.network import Network 7 | 8 | 9 | class QFunction(Network): 10 | def __init__(self, hidden_layer_sizes, **kwargs): 11 | super(QFunction, self).__init__(**kwargs) 12 | self._hidden_layer_sizes = hidden_layer_sizes 13 | 14 | def build(self, input_shape): 15 | inputs = [ 16 | layers.Input(batch_shape=input_shape[0], name='observations'), 17 | layers.Input(batch_shape=input_shape[1], name='actions') 18 | ] 19 | 20 | x = layers.Concatenate(axis=1)(inputs) 21 | for hidden_units in self._hidden_layer_sizes: 22 | x = layers.Dense(hidden_units, activation='relu')(x) 23 | q_values = layers.Dense(1, activation=None)(x) 24 | 25 | self._init_graph_network(inputs, q_values) 26 | super(QFunction, self).build(input_shape) 27 | 28 | 29 | class ValueFunction(Network): 30 | def __init__(self, hidden_layer_sizes, **kwargs): 31 | super(ValueFunction, self).__init__(**kwargs) 32 | self._hidden_layer_sizes = hidden_layer_sizes 33 | 34 | def build(self, input_shape): 35 | inputs = layers.Input(batch_shape=input_shape, name='observations') 36 | 37 | x = inputs 38 | for hidden_units in self._hidden_layer_sizes: 39 | x = layers.Dense(hidden_units, activation='relu')(x) 40 | values = layers.Dense(1, activation=None)(x) 41 | 42 | self._init_graph_network(inputs, values) 43 | super(ValueFunction, self).build(input_shape) 44 | 45 | 46 | class GaussianPolicy(Network): 47 | def __init__(self, action_dim, hidden_layer_sizes, reparameterize, **kwargs): 48 | super(GaussianPolicy, self).__init__(**kwargs) 49 | self._action_dim = action_dim 50 | self._f = None 51 | self._hidden_layer_sizes = hidden_layer_sizes 52 | self._reparameterize = reparameterize 53 | 54 | def 
build(self, input_shape): 55 | inputs = layers.Input(batch_shape=input_shape, name='observations') 56 | 57 | x = inputs 58 | for hidden_units in self._hidden_layer_sizes: 59 | x = layers.Dense(hidden_units, activation='relu')(x) 60 | 61 | mean_and_log_std = layers.Dense( 62 | self._action_dim * 2, activation=None)(x) 63 | 64 | def create_distribution_layer(mean_and_log_std): 65 | mean, log_std = tf.split( 66 | mean_and_log_std, num_or_size_splits=2, axis=1) 67 | log_std = tf.clip_by_value(log_std, -20., 2.) 68 | 69 | distribution = distributions.MultivariateNormalDiag( 70 | loc=mean, 71 | scale_diag=tf.exp(log_std)) 72 | 73 | raw_actions = distribution.sample() 74 | if not self._reparameterize: 75 | ### Problem 1.3.A 76 | ### YOUR CODE HERE 77 | raise NotImplementedError 78 | log_probs = distribution.log_prob(raw_actions) 79 | log_probs -= self._squash_correction(raw_actions) 80 | 81 | actions = None 82 | ### Problem 2.A 83 | ### YOUR CODE HERE 84 | raise NotImplementedError 85 | 86 | return actions, log_probs 87 | 88 | samples, log_probs = layers.Lambda(create_distribution_layer)( 89 | mean_and_log_std) 90 | 91 | self._init_graph_network(inputs=inputs, outputs=[samples, log_probs]) 92 | super(GaussianPolicy, self).build(input_shape) 93 | 94 | def _squash_correction(self, raw_actions): 95 | ### Problem 2.B 96 | ### YOUR CODE HERE 97 | raise NotImplementedError 98 | 99 | def eval(self, observation): 100 | assert self.built and observation.ndim == 1 101 | 102 | if self._f is None: 103 | self._f = keras.backend.function(self.inputs, [self.outputs[0]]) 104 | 105 | action, = self._f([observation[None]]) 106 | return action.flatten() 107 | -------------------------------------------------------------------------------- /hw5/sac/plot.py: -------------------------------------------------------------------------------- 1 | import seaborn as sns 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import json 5 | import os 6 | 7 | """ 8 | Using the plotter: 9 | 10 | Call it from the command line, and supply it with logdirs to experiments. 11 | Suppose you ran an experiment with name 'test', and you ran 'test' for 10 12 | random seeds. The runner code stored it in the directory structure 13 | 14 | data 15 | L test_EnvName_DateTime 16 | L 0 17 | L log.txt 18 | L params.json 19 | L 1 20 | L log.txt 21 | L params.json 22 | . 23 | . 24 | . 25 | L 9 26 | L log.txt 27 | L params.json 28 | 29 | To plot learning curves from the experiment, averaged over all random 30 | seeds, call 31 | 32 | python plot.py data/test_EnvName_DateTime --value AverageReturn 33 | 34 | and voila. To see a different statistics, change what you put in for 35 | the keyword --value. You can also enter /multiple/ values, and it will 36 | make all of them in order. 37 | 38 | 39 | Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried 40 | a different set of hyperparameters from 'test1', and now you would like 41 | to compare them -- see their learning curves side-by-side. Just call 42 | 43 | python plot.py data/test1 data/test2 44 | 45 | and it will plot them both! They will be given titles in the legend according 46 | to their exp_name parameters. If you want to use custom legend titles, use 47 | the --legend flag and then provide a title for each logdir. 
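For example: python plot.py data/test1 data/test2 --legend legend1 legend2 assigns one legend title per logdir, in order; note that this copy of the plotter defaults --value to LastEpReturn rather than AverageReturn.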
-------------------------------------------------------------------------------- /hw5/sac/plot.py: --------------------------------------------------------------------------------

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import json
import os

"""
Using the plotter:

Call it from the command line, and supply it with logdirs to experiments.
Suppose you ran an experiment with name 'test', and you ran 'test' for 10
random seeds. The runner code stored it in the directory structure

    data
    L test_EnvName_DateTime
        L 0
            L log.txt
            L params.json
        L 1
            L log.txt
            L params.json
        .
        .
        .
        L 9
            L log.txt
            L params.json

To plot learning curves from the experiment, averaged over all random
seeds, call

    python plot.py data/test_EnvName_DateTime --value AverageReturn

and voila. To see a different statistic, change what you put in for
the keyword --value. You can also enter /multiple/ values, and it will
make all of them in order.


Suppose you ran two experiments: 'test1' and 'test2'. In 'test2' you tried
a different set of hyperparameters from 'test1', and now you would like
to compare them -- see their learning curves side-by-side. Just call

    python plot.py data/test1 data/test2

and it will plot them both! They will be given titles in the legend according
to their exp_name parameters. If you want to use custom legend titles, use
the --legend flag and then provide a title for each logdir.

"""

def plot_data(data, value="AverageReturn"):
    if isinstance(data, list):
        data = pd.concat(data, ignore_index=True)

    sns.set(style="darkgrid", font_scale=1.5)
    sns.tsplot(data=data, time="Iteration", value=value, unit="Unit", condition="Condition")
    plt.legend(loc='best').draggable()
    plt.show()


def get_datasets(fpath, condition=None):
    unit = 0
    datasets = []
    for root, dir, files in os.walk(fpath):
        if 'log.txt' in files:
            param_path = open(os.path.join(root, 'params.json'))
            params = json.load(param_path)
            exp_name = params['exp_name']

            log_path = os.path.join(root, 'log.txt')
            experiment_data = pd.read_table(log_path)

            experiment_data.insert(
                len(experiment_data.columns),
                'Unit',
                unit
            )
            experiment_data.insert(
                len(experiment_data.columns),
                'Condition',
                condition or exp_name
            )

            datasets.append(experiment_data)
            unit += 1

    return datasets


def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('logdir', nargs='*')
    parser.add_argument('--legend', nargs='*')
    parser.add_argument('--value', default='LastEpReturn', nargs='*')
    args = parser.parse_args()

    use_legend = False
    if args.legend is not None:
        assert len(args.legend) == len(args.logdir), \
            "Must give a legend title for each set of experiments."
        use_legend = True

    data = []
    if use_legend:
        for logdir, legend_title in zip(args.logdir, args.legend):
            data += get_datasets(logdir, legend_title)
    else:
        for logdir in args.logdir:
            data += get_datasets(logdir)

    if isinstance(args.value, list):
        values = args.value
    else:
        values = [args.value]
    for value in values:
        plot_data(data, value=value)

if __name__ == "__main__":
    main()
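Combined with the directory naming built in train_mujoco.py below (data/sac_<env_name>_<exp_name>_<date-time>/<seed>/log.txt), a typical plotting call might look like the following, where the experiment name 'reparam' and the timestamp are placeholders rather than real runs:

python plot.py data/sac_HalfCheetah-v2_reparam_<date-time> --value LastEpReturn

Several logdirs can be passed in one call, with --legend supplying one title per logdir, as described in the docstring above.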
-------------------------------------------------------------------------------- /hw5/sac/train_mujoco.py: --------------------------------------------------------------------------------

import argparse
import gym
import logz
import numpy as np
import os
import tensorflow as tf
import time

import nn
from sac import SAC
import utils

from multiprocessing import Process

def train_SAC(env_name, exp_name, seed, logdir):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': False,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': False,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    with tf.Session(config=tf_config):
        algorithm.build(
            env=env,
            policy=policy,
            q_function=q_function,
            q_function2=q_function2,
            value_function=value_function,
            target_value_function=target_value_function)

        for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
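Note that the assignment's other configurations are switched on by editing algorithm_params inside train_SAC rather than through command-line flags. A hypothetical override, using only keys that already appear in the dict above:

# Hypothetical edits inside train_SAC for the other configurations:
algorithm_params.update({
    'reparameterize': True,  # passed through to nn.GaussianPolicy above
    'two_qf': True,          # makes the branch above build q_function2 as well
})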
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v2')
    parser.add_argument('--exp_name', type=str, default=None)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    args = parser.parse_args()

    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')

    if not (os.path.exists(data_path)):
        os.makedirs(data_path)
    logdir = 'sac_' + args.env_name + '_' + args.exp_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join(data_path, logdir)

    processes = []

    for e in range(args.n_experiments):
        seed = args.seed + 10*e
        print('Running experiment with seed %d' % seed)

        def train_func():
            train_SAC(
                env_name=args.env_name,
                exp_name=args.exp_name,
                seed=seed,
                logdir=os.path.join(logdir, '%d' % seed),
            )
        # Awkward hacky process runs, because Tensorflow does not like
        # repeatedly calling train_SAC in the same thread.
        p = Process(target=train_func, args=tuple())
        p.start()
        processes.append(p)
        # if you comment in the line below, then the loop will block
        # until this process finishes
        # p.join()

    for p in processes:
        p.join()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------