├── .gitignore ├── LICENSE ├── README.md ├── gailtf ├── __init__.py ├── algo │ ├── __init__.py │ ├── behavior_clone.py │ └── trpo_mpi.py ├── baselines │ ├── __init__.py │ ├── bench │ │ ├── __init__.py │ │ ├── benchmarks.py │ │ └── monitor.py │ ├── common │ │ ├── __init__.py │ │ ├── atari_wrappers.py │ │ ├── atari_wrappers_deprecated.py │ │ ├── azure_utils.py │ │ ├── cg.py │ │ ├── console_util.py │ │ ├── dataset.py │ │ ├── distributions.py │ │ ├── math_util.py │ │ ├── misc_util.py │ │ ├── mpi_adam.py │ │ ├── mpi_fork.py │ │ ├── mpi_moments.py │ │ ├── mpi_running_mean_std.py │ │ ├── schedules.py │ │ ├── segment_tree.py │ │ ├── tests │ │ │ ├── test_schedules.py │ │ │ ├── test_segment_tree.py │ │ │ └── test_tf_util.py │ │ ├── tf_util.py │ │ └── vec_env │ │ │ ├── __init__.py │ │ │ └── subproc_vec_env.py │ ├── logger.py │ ├── ppo1 │ │ ├── README.md │ │ ├── __init__.py │ │ ├── cnn_policy.py │ │ ├── mlp_policy.py │ │ ├── pposgd_simple.py │ │ └── run_mujoco.py │ └── trpo_mpi │ │ ├── README.md │ │ ├── __init__.py │ │ ├── nosharing_cnn_policy.py │ │ ├── run_mujoco.py │ │ └── trpo_mpi.py ├── common │ ├── __init__.py │ ├── statistics.py │ └── tf_util.py ├── dataset │ ├── __init__.py │ └── mujoco.py └── network │ └── adversary.py ├── main.py └── misc ├── HalfCheetah-D.png ├── HalfCheetah-length-reward(D).png ├── HalfCheetah-true-reward.png ├── Hopper-D.png ├── Hopper-length-reward(D).png ├── Hopper-true-reward.png ├── Walker2d-D.png ├── Walker2d-length-reward(D).png ├── Walker2d-true-reward.png ├── exp.md ├── halfcheetah.png ├── hopper.png └── walker2d.png /.gitignore: -------------------------------------------------------------------------------- 1 | # Platform-specific files and text editors 2 | .DS_Store 3 | *.swp 4 | 5 | # Mujoco 6 | MUJOCO_LOG.TXT 7 | 8 | # TensorFlow checkpoints and logs 9 | log/ 10 | checkpoint/ 11 | 12 | # Pickle files 13 | *.pkl 14 | 15 | 16 | ####### https://github.com/github/gitignore/blob/master/Python.gitignore ####### 17 | # Byte-compiled / optimized / DLL files 18 | __pycache__/ 19 | *.py[cod] 20 | *$py.class 21 | 22 | # C extensions 23 | *.so 24 | 25 | # Distribution / packaging 26 | .Python 27 | build/ 28 | develop-eggs/ 29 | dist/ 30 | downloads/ 31 | eggs/ 32 | .eggs/ 33 | lib/ 34 | lib64/ 35 | parts/ 36 | sdist/ 37 | var/ 38 | wheels/ 39 | *.egg-info/ 40 | .installed.cfg 41 | *.egg 42 | MANIFEST 43 | 44 | # PyInstaller 45 | # Usually these files are written by a python script from a template 46 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
47 | *.manifest 48 | *.spec 49 | 50 | # Installer logs 51 | pip-log.txt 52 | pip-delete-this-directory.txt 53 | 54 | # Unit test / coverage reports 55 | htmlcov/ 56 | .tox/ 57 | .coverage 58 | .coverage.* 59 | .cache 60 | nosetests.xml 61 | coverage.xml 62 | *.cover 63 | .hypothesis/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | .static_storage/ 72 | .media/ 73 | local_settings.py 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # pyenv 92 | .python-version 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 Andrew Liao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Check out the simpler version at [openai/baselines/gail](https://github.com/openai/baselines/blob/master/baselines/gail/README.md)! 2 | 3 | 4 | 5 | 6 | 7 | 8 | # gail-tf 9 | Tensorflow implementation of Generative Adversarial Imitation Learning (and 10 | behavior cloning) 11 | 12 | **disclaimers**: some code is borrowed from @openai/baselines 13 | 14 | ## What's GAIL? 
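In short: GAIL trains a discriminator to distinguish expert state-action pairs from the policy's own rollouts, and feeds the discriminator's score back to the policy as a surrogate reward for a policy-gradient update (TRPO in this repo), so no hand-designed reward is needed. The toy sketch below is numpy-only and purely illustrative, not this repo's API; the real loop lives in `gailtf/algo/trpo_mpi.py` and `gailtf/network/adversary.py`:

```python
# Toy illustration of the GAIL loop (numpy only; not this repo's code).
# "States/actions" are just 2-D points; the expert's come from a fixed Gaussian,
# the policy is a unit-variance Gaussian whose mean we learn with REINFORCE,
# and the discriminator is a logistic regression trained to tell the two apart.
import numpy as np

rng = np.random.default_rng(0)
expert_mean = np.array([2.0, -1.0])
theta = np.zeros(2)            # policy mean (the only policy parameter here)
w, b = np.zeros(2), 0.0        # discriminator parameters

def discriminator(x):          # estimated P(x came from the expert)
    return 1.0 / (1.0 + np.exp(-(x @ w + b)))

for it in range(2000):
    expert_x = rng.normal(expert_mean, 1.0, size=(64, 2))
    policy_x = rng.normal(theta, 1.0, size=(64, 2))

    # Discriminator step: logistic-regression gradient ascent (expert=1, policy=0)
    x = np.vstack([expert_x, policy_x])
    y = np.concatenate([np.ones(64), np.zeros(64)])
    err = y - discriminator(x)                  # gradient factor of the log-likelihood
    w += 0.05 * (err @ x) / len(x)
    b += 0.05 * err.mean()

    # Policy step: REINFORCE on the surrogate reward r(x) = -log(1 - D(x)),
    # which is large wherever the discriminator thinks samples look "expert".
    r = -np.log(1.0 - discriminator(policy_x) + 1e-8)
    r = r - r.mean()                            # mean baseline for variance reduction
    # grad of log N(x | theta, I) w.r.t. theta is (x - theta)
    theta += 0.05 * ((policy_x - theta) * r[:, None]).mean(axis=0)

print("expert mean:", expert_mean, "learned policy mean:", theta.round(2))
```

A few notes and related work: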
15 | - model-free imitation learning -> low sample efficiency in training time 16 | - model-based GAIL: End-to-End Differentiable Adversarial Imitation Learning 17 | - Directly extract policy from demonstrations 18 | - Remove the RL optimization from the inner loop of inverse RL 19 | - Some work based on GAIL: 20 | - Inferring The Latent Structure of Human Decision-Making from Raw Visual 21 | Inputs 22 | - Multi-Modal Imitation Learning from Unstructured Demonstrations using 23 | Generative Adversarial Nets 24 | - Robust Imitation of Diverse Behaviors 25 | 26 | ## Requirements 27 | - python==3.5.2 28 | - mujoco-py==0.5.7 29 | - tensorflow==1.1.0 30 | - gym==0.9.3 31 | 32 | ## Run the code 33 | I separate the code into two parts: (1) sampling expert data, and (2) imitation 34 | learning with GAIL/BC. 35 | 36 | ### Step 1: Generate expert data 37 | 38 | #### Train the expert policy using PPO/TRPO, from openai/baselines 39 | Ensure that `$GAILTF` is set to the path to your gail-tf repository, and 40 | `$ENV_ID` is any valid OpenAI gym environment (e.g. Hopper-v1, HalfCheetah-v1, 41 | etc.) 42 | 43 | ##### Configuration 44 | ``` bash 45 | export GAILTF=/path/to/your/gail-tf 46 | export ENV_ID="Hopper-v1" 47 | export BASELINES_PATH=$GAILTF/gailtf/baselines/ppo1 # use gailtf/baselines/trpo_mpi for TRPO 48 | export SAMPLE_STOCHASTIC="False" # use True for stochastic sampling 49 | export STOCHASTIC_POLICY="False" # use True for a stochastic policy 50 | export PYTHONPATH=$GAILTF:$PYTHONPATH # as mentioned below 51 | cd $GAILTF 52 | ``` 53 | 54 | ##### Train the expert 55 | ```bash 56 | python3 $BASELINES_PATH/run_mujoco.py --env_id $ENV_ID 57 | ``` 58 | 59 | The trained model will be saved in ```./checkpoint```, and its precise name will 60 | vary based on your optimization method and environment ID. Choose the last 61 | checkpoint in the series. 62 | 63 | ```bash 64 | export PATH_TO_CKPT=./checkpoint/trpo.Hopper.0.00/trpo.Hopper.00-900 65 | ``` 66 | 67 | ##### Sample from the generated expert policy 68 | ```bash 69 | python3 $BASELINES_PATH/run_mujoco.py --env_id $ENV_ID --task sample_trajectory --sample_stochastic $SAMPLE_STOCHASTIC --load_model_path $PATH_TO_CKPT 70 | ``` 71 | 72 | This will generate a pickle file that stores the expert trajectories in 73 | ```./XXX.pkl``` (e.g. deterministic.ppo.Hopper.0.00.pkl)
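To sanity-check what was written, you can unpickle the file and print its structure. The exact layout is defined by the sampling script and `gailtf/dataset/mujoco.py`, so the snippet below is only a generic, format-agnostic inspection sketch (the file name is the example above):

```python
# Generic inspection of the sampled-expert pickle; the exact keys/structure
# depend on the sampling code, so we only print whatever we find.
import pickle

with open("deterministic.ppo.Hopper.0.00.pkl", "rb") as f:   # adjust to your file name
    data = pickle.load(f)

print(type(data))
if isinstance(data, dict):
    for k, v in data.items():
        print(k, getattr(v, "shape", type(v)))
elif isinstance(data, (list, tuple)):
    print("length:", len(data), "first element:", type(data[0]))
```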
74 | 75 | ```bash 76 | export PICKLE_PATH=./stochastic.trpo.Hopper.0.00.pkl 77 | ``` 78 | 79 | ### Step 2: Imitation learning 80 | 81 | #### Imitation learning via GAIL 82 | 83 | ```bash 84 | python3 main.py --env_id $ENV_ID --expert_path $PICKLE_PATH 85 | ``` 86 | 87 | Usage: 88 | ```bash 89 | --env_id: The environment id 90 | --num_cpu: Number of CPUs available during sampling 91 | --expert_path: The path to the pickle file generated in the [previous section]() 92 | --traj_limitation: Limit on the number of expert trajectories used 93 | --g_step: Number of policy optimization steps in each iteration 94 | --d_step: Number of discriminator optimization steps in each iteration 95 | --num_timesteps: Number of timesteps to train (limits the number of timesteps spent interacting with the environment) 96 | ``` 97 | 98 | To view the summary plots in TensorBoard, issue 99 | ```bash 100 | tensorboard --logdir $GAILTF/log 101 | ``` 102 | 103 | ##### Evaluate your GAIL agent 104 | ```bash 105 | python3 main.py --env_id $ENV_ID --task evaluate --stochastic_policy $STOCHASTIC_POLICY --load_model_path $PATH_TO_CKPT --expert_path $PICKLE_PATH 106 | ``` 107 | 108 | #### Imitation learning via Behavioral Cloning 109 | ```bash 110 | python3 main.py --env_id $ENV_ID --algo bc --expert_path $PICKLE_PATH 111 | ``` 112 | 113 | ##### Evaluate your BC agent 114 | ```bash 115 | python3 main.py --env_id $ENV_ID --algo bc --task evaluate --stochastic_policy $STOCHASTIC_POLICY --load_model_path $PATH_TO_CKPT --expert_path $PICKLE_PATH 116 | ``` 117 | 118 | ## Results 119 | 120 | Note: The following hyper-parameter settings are the best that I've tested (a simple 121 | grid search on the setting with 1500 trajectories), just for your information. 122 | 123 | The different curves below correspond to different expert dataset sizes (1000, 100, 10, 5). 124 | 125 | - Hopper-v1 (Average total return of expert policy: 3589) 126 | 127 | ```bash 128 | python3 main.py --env_id Hopper-v1 --expert_path baselines/ppo1/deterministic.ppo.Hopper.0.00.pkl --g_step 3 --adversary_entcoeff 0 129 | ``` 130 | 131 | ![](misc/Hopper-true-reward.png) 132 | 133 | - Walker2d-v1 (Average total return of expert policy: 4392) 134 | 135 | ```bash 136 | python3 main.py --env_id Walker2d-v1 --expert_path baselines/ppo1/deterministic.ppo.Walker2d.0.00.pkl --g_step 3 --adversary_entcoeff 1e-3 137 | ``` 138 | 139 | ![](misc/Walker2d-true-reward.png) 140 | 141 | - HalfCheetah-v1 (Average total return of expert policy: 2110) 142 | 143 | For HalfCheetah-v1 and Ant-v1, pretraining with behavior cloning is needed: 144 | ```bash 145 | python3 main.py --env_id HalfCheetah-v1 --expert_path baselines/ppo1/deterministic.ppo.HalfCheetah.0.00.pkl --pretrained True --BC_max_iter 10000 --g_step 3 --adversary_entcoeff 1e-3 146 | ``` 147 | 148 | ![](misc/HalfCheetah-true-reward.png) 149 | 150 | **You can find more details [here](https://github.com/andrewliao11/gail-tf/blob/master/misc/exp.md), 151 | GAIL policy [here](https://drive.google.com/drive/folders/0B3fKFm-j0RqeRnZMTUJHSmdIdlU?usp=sharing), 152 | and BC policy [here](https://drive.google.com/drive/folders/0B3fKFm-j0RqeVFFmMWpHMk85cUk?usp=sharing)** 153 | 154 | ## Hacking 155 | We don't have a pip package yet, so you'll need to add this repo to your 156 | PYTHONPATH manually, either from inside a Python script (see the sketch just below) or with the shell export that follows.
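A minimal in-script alternative to the shell export below (the path is a placeholder, just like in the export):

```python
# Make the gailtf package importable without touching the shell environment.
import sys
sys.path.insert(0, "/path/to/your/repo/with/gailtf")  # the directory that contains gailtf/

import gailtf  # should now resolve
```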
157 | ```bash 158 | export PYTHONPATH=/path/to/your/repo/with/gailtf:$PYTHONPATH 159 | ``` 160 | 161 | ## TODO 162 | * Create pip package/setup.py 163 | * Make style PEP8 compliant 164 | * Create requirements.txt 165 | * Depend on openai/baselines directly and modularize modifications 166 | * openai/roboschool support 167 | 168 | ## Troubleshooting 169 | 170 | - If you encounter `error: Cannot compile MPI programs. Check your configuration!!!` or the system complains about a missing `mpi.h`: 171 | ```bash 172 | sudo apt install libopenmpi-dev 173 | ``` 174 | 175 | ## Reference 176 | - Jonathan Ho and Stefano Ermon. Generative adversarial imitation learning, [[arxiv](https://arxiv.org/abs/1606.03476)] 177 | - @openai/imitation 178 | - @openai/baselines 179 | -------------------------------------------------------------------------------- /gailtf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewliao11/gail-tf/ad92f41c26c34e8fabc536664fb11b44f25956cf/gailtf/__init__.py -------------------------------------------------------------------------------- /gailtf/algo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewliao11/gail-tf/ad92f41c26c34e8fabc536664fb11b44f25956cf/gailtf/algo/__init__.py -------------------------------------------------------------------------------- /gailtf/algo/behavior_clone.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import gailtf.baselines.common.tf_util as U 3 | from gailtf.baselines import logger 4 | from tqdm import tqdm 5 | from gailtf.baselines.common.mpi_adam import MpiAdam 6 | import tempfile, os 7 | from common.statistics import stats 8 | import ipdb 9 | 10 | def evaluate(env, policy_func, load_model_path, stochastic_policy=False, number_trajs=10): 11 | from algo.trpo_mpi import traj_episode_generator 12 | ob_space = env.observation_space 13 | ac_space = env.action_space 14 | pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy 15 | # placeholder 16 | ob = U.get_placeholder_cached(name="ob") 17 | ac = pi.pdtype.sample_placeholder([None]) 18 | stochastic = U.get_placeholder_cached(name="stochastic") 19 | ep_gen = traj_episode_generator(pi, env, 1024, stochastic=stochastic_policy) 20 | U.load_state(load_model_path) 21 | len_list = [] 22 | ret_list = [] 23 | for _ in tqdm(range(number_trajs)): 24 | traj = ep_gen.__next__() 25 | ep_len, ep_ret = traj['ep_len'], traj['ep_ret'] 26 | len_list.append(ep_len) 27 | ret_list.append(ep_ret) 28 | if stochastic_policy: 29 | print ('stochastic policy:') 30 | else: 31 | print ('deterministic policy:' ) 32 | print ("Average length:", sum(len_list)/len(len_list)) 33 | print ("Average return:", sum(ret_list)/len(ret_list)) 34 | 35 | def learn(env, policy_func, dataset, pretrained, optim_batch_size=128, max_iters=1e4, 36 | adam_epsilon=1e-5, optim_stepsize=3e-4, ckpt_dir=None, log_dir=None, task_name=None): 37 | val_per_iter = int(max_iters/10) 38 | ob_space = env.observation_space 39 | ac_space = env.action_space 40 | pi = policy_func("pi", ob_space, ac_space) # Construct network for new policy 41 | # placeholder 42 | ob = U.get_placeholder_cached(name="ob") 43 | ac = pi.pdtype.sample_placeholder([None]) 44 | stochastic = U.get_placeholder_cached(name="stochastic") 45 | loss = tf.reduce_mean(tf.square(ac-pi.ac)) 46 | var_list = pi.get_trainable_variables() 47 | adam = MpiAdam(var_list,
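                     # MpiAdam (from baselines) behaves like plain Adam but all-reduces and averages
                     # the gradient across MPI workers before each update, so every rank takes the same step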
epsilon=adam_epsilon) 48 | lossandgrad = U.function([ob, ac, stochastic], [loss]+[U.flatgrad(loss, var_list)]) 49 | 50 | if not pretrained: 51 | writer = U.FileWriter(log_dir) 52 | ep_stats = stats(["Loss"]) 53 | U.initialize() 54 | adam.sync() 55 | logger.log("Pretraining with Behavior Cloning...") 56 | for iter_so_far in tqdm(range(int(max_iters))): 57 | ob_expert, ac_expert = dataset.get_next_batch(optim_batch_size, 'train') 58 | loss, g = lossandgrad(ob_expert, ac_expert, True) 59 | adam.update(g, optim_stepsize) 60 | if not pretrained: 61 | ep_stats.add_all_summary(writer, [loss], iter_so_far) 62 | if iter_so_far % val_per_iter == 0: 63 | ob_expert, ac_expert = dataset.get_next_batch(-1, 'val') 64 | loss, g = lossandgrad(ob_expert, ac_expert, False) 65 | logger.log("Validation:") 66 | logger.log("Loss: %f"%loss) 67 | if not pretrained: 68 | U.save_state(os.path.join(ckpt_dir, task_name), counter=iter_so_far) 69 | if pretrained: 70 | savedir_fname = tempfile.TemporaryDirectory().name 71 | U.save_state(savedir_fname, var_list=pi.get_variables()) 72 | return savedir_fname 73 | -------------------------------------------------------------------------------- /gailtf/algo/trpo_mpi.py: -------------------------------------------------------------------------------- 1 | from gailtf.baselines.common import explained_variance, zipsame, dataset, Dataset, fmt_row 2 | from gailtf.baselines import logger 3 | import gailtf.baselines.common.tf_util as U 4 | import tensorflow as tf, numpy as np 5 | import time, os 6 | from gailtf.baselines.common import colorize 7 | from mpi4py import MPI 8 | from collections import deque 9 | from gailtf.baselines.common.mpi_adam import MpiAdam 10 | from gailtf.baselines.common.cg import cg 11 | from contextlib import contextmanager 12 | from gailtf.common.statistics import stats 13 | import ipdb 14 | 15 | def traj_segment_generator(pi, env, discriminator, horizon, stochastic): 16 | # Initialize state variables 17 | t = 0 18 | ac = env.action_space.sample() 19 | new = True 20 | rew = 0.0 21 | true_rew = 0.0 22 | ob = env.reset() 23 | 24 | cur_ep_ret = 0 25 | cur_ep_len = 0 26 | cur_ep_true_ret = 0 27 | ep_true_rets = [] 28 | ep_rets = [] 29 | ep_lens = [] 30 | 31 | # Initialize history arrays 32 | obs = np.array([ob for _ in range(horizon)]) 33 | true_rews = np.zeros(horizon, 'float32') 34 | rews = np.zeros(horizon, 'float32') 35 | vpreds = np.zeros(horizon, 'float32') 36 | news = np.zeros(horizon, 'int32') 37 | acs = np.array([ac for _ in range(horizon)]) 38 | prevacs = acs.copy() 39 | 40 | while True: 41 | prevac = ac 42 | ac, vpred = pi.act(stochastic, ob) 43 | # Slight weirdness here because we need value function at time T 44 | # before returning segment [0, T-1] so we get the correct 45 | # terminal value 46 | if t > 0 and t % horizon == 0: 47 | yield {"ob" : obs, "rew" : rews, "vpred" : vpreds, "new" : news, 48 | "ac" : acs, "prevac" : prevacs, "nextvpred": vpred * (1 - new), 49 | "ep_rets" : ep_rets, "ep_lens" : ep_lens, "ep_true_rets": ep_true_rets} 50 | _, vpred = pi.act(stochastic, ob) 51 | # Be careful!!! 
if you change the downstream algorithm to aggregate 52 | # several of these batches, then be sure to do a deepcopy 53 | ep_rets = [] 54 | ep_true_rets = [] 55 | ep_lens = [] 56 | i = t % horizon 57 | obs[i] = ob 58 | vpreds[i] = vpred 59 | news[i] = new 60 | acs[i] = ac 61 | prevacs[i] = prevac 62 | 63 | rew = discriminator.get_reward(ob, ac) 64 | ob, true_rew, new, _ = env.step(ac) 65 | rews[i] = rew 66 | true_rews[i] = true_rew 67 | 68 | cur_ep_ret += rew 69 | cur_ep_true_ret += true_rew 70 | cur_ep_len += 1 71 | if new: 72 | ep_rets.append(cur_ep_ret) 73 | ep_true_rets.append(cur_ep_true_ret) 74 | ep_lens.append(cur_ep_len) 75 | cur_ep_ret = 0 76 | cur_ep_true_ret = 0 77 | cur_ep_len = 0 78 | ob = env.reset() 79 | t += 1 80 | 81 | def add_vtarg_and_adv(seg, gamma, lam): 82 | new = np.append(seg["new"], 0) # last element is only used for last vtarg, but we already zeroed it if last new = 1 83 | vpred = np.append(seg["vpred"], seg["nextvpred"]) 84 | T = len(seg["rew"]) 85 | seg["adv"] = gaelam = np.empty(T, 'float32') 86 | rew = seg["rew"] 87 | lastgaelam = 0 88 | for t in reversed(range(T)): 89 | nonterminal = 1-new[t+1] 90 | delta = rew[t] + gamma * vpred[t+1] * nonterminal - vpred[t] 91 | gaelam[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam 92 | seg["tdlamret"] = seg["adv"] + seg["vpred"] 93 | 94 | def learn(env, policy_func, discriminator, expert_dataset, 95 | pretrained, pretrained_weight, *, 96 | g_step, d_step, 97 | timesteps_per_batch, # what to train on 98 | max_kl, cg_iters, 99 | gamma, lam, # advantage estimation 100 | entcoeff=0.0, 101 | cg_damping=1e-2, 102 | vf_stepsize=3e-4, d_stepsize=3e-4, 103 | vf_iters =3, 104 | max_timesteps=0, max_episodes=0, max_iters=0, # time constraint 105 | callback=None, 106 | save_per_iter=100, ckpt_dir=None, log_dir=None, 107 | load_model_path=None, task_name=None 108 | ): 109 | nworkers = MPI.COMM_WORLD.Get_size() 110 | rank = MPI.COMM_WORLD.Get_rank() 111 | np.set_printoptions(precision=3) 112 | # Setup losses and stuff 113 | # ---------------------------------------- 114 | ob_space = env.observation_space 115 | ac_space = env.action_space 116 | pi = policy_func("pi", ob_space, ac_space, reuse=(pretrained_weight!=None)) 117 | oldpi = policy_func("oldpi", ob_space, ac_space) 118 | atarg = tf.placeholder(dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) 119 | ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return 120 | 121 | ob = U.get_placeholder_cached(name="ob") 122 | ac = pi.pdtype.sample_placeholder([None]) 123 | 124 | kloldnew = oldpi.pd.kl(pi.pd) 125 | ent = pi.pd.entropy() 126 | meankl = U.mean(kloldnew) 127 | meanent = U.mean(ent) 128 | entbonus = entcoeff * meanent 129 | 130 | vferr = U.mean(tf.square(pi.vpred - ret)) 131 | 132 | ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold 133 | surrgain = U.mean(ratio * atarg) 134 | 135 | optimgain = surrgain + entbonus 136 | losses = [optimgain, meankl, entbonus, surrgain, meanent] 137 | loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] 138 | 139 | dist = meankl 140 | 141 | all_var_list = pi.get_trainable_variables() 142 | var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] 143 | vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] 144 | d_adam = MpiAdam(discriminator.get_trainable_variables()) 145 | vfadam = MpiAdam(vf_var_list) 146 | 147 | get_flat = U.GetFlat(var_list) 148 | set_from_flat = U.SetFromFlat(var_list) 149 | klgrads 
= tf.gradients(dist, var_list) 150 | flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") 151 | shapes = [var.get_shape().as_list() for var in var_list] 152 | start = 0 153 | tangents = [] 154 | for shape in shapes: 155 | sz = U.intprod(shape) 156 | tangents.append(tf.reshape(flat_tangent[start:start+sz], shape)) 157 | start += sz 158 | gvp = tf.add_n([U.sum(g*tangent) for (g, tangent) in zipsame(klgrads, tangents)]) #pylint: disable=E1111 159 | fvp = U.flatgrad(gvp, var_list) 160 | 161 | assign_old_eq_new = U.function([],[], updates=[tf.assign(oldv, newv) 162 | for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())]) 163 | compute_losses = U.function([ob, ac, atarg], losses) 164 | compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) 165 | compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) 166 | compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) 167 | 168 | @contextmanager 169 | def timed(msg): 170 | if rank == 0: 171 | print(colorize(msg, color='magenta')) 172 | tstart = time.time() 173 | yield 174 | print(colorize("done in %.3f seconds"%(time.time() - tstart), color='magenta')) 175 | else: 176 | yield 177 | 178 | def allmean(x): 179 | assert isinstance(x, np.ndarray) 180 | out = np.empty_like(x) 181 | MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) 182 | out /= nworkers 183 | return out 184 | 185 | writer = U.FileWriter(log_dir) 186 | U.initialize() 187 | th_init = get_flat() 188 | MPI.COMM_WORLD.Bcast(th_init, root=0) 189 | set_from_flat(th_init) 190 | d_adam.sync() 191 | vfadam.sync() 192 | print("Init param sum", th_init.sum(), flush=True) 193 | 194 | # Prepare for rollouts 195 | # ---------------------------------------- 196 | seg_gen = traj_segment_generator(pi, env, discriminator, timesteps_per_batch, stochastic=True) 197 | 198 | episodes_so_far = 0 199 | timesteps_so_far = 0 200 | iters_so_far = 0 201 | tstart = time.time() 202 | lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths 203 | rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards 204 | true_rewbuffer = deque(maxlen=40) 205 | 206 | assert sum([max_iters>0, max_timesteps>0, max_episodes>0])==1 207 | 208 | g_loss_stats = stats(loss_names) 209 | d_loss_stats = stats(discriminator.loss_name) 210 | ep_stats = stats(["True_rewards", "Rewards", "Episode_length"]) 211 | # if provide pretrained weight 212 | if pretrained_weight is not None: 213 | U.load_state(pretrained_weight, var_list=pi.get_variables()) 214 | # if provieded model path 215 | if load_model_path is not None: 216 | U.load_state(load_model_path) 217 | 218 | while True: 219 | if callback: callback(locals(), globals()) 220 | if max_timesteps and timesteps_so_far >= max_timesteps: 221 | break 222 | elif max_episodes and episodes_so_far >= max_episodes: 223 | break 224 | elif max_iters and iters_so_far >= max_iters: 225 | break 226 | 227 | # Save model 228 | if iters_so_far % save_per_iter == 0 and ckpt_dir is not None: 229 | U.save_state(os.path.join(ckpt_dir, task_name), counter=iters_so_far) 230 | 231 | logger.log("********** Iteration %i ************"%iters_so_far) 232 | 233 | def fisher_vector_product(p): 234 | return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p 235 | # ------------------ Update G ------------------ 236 | logger.log("Optimizing Policy...") 237 | for _ in range(g_step): 238 | with timed("sampling"): 239 | seg = seg_gen.__next__() 240 | add_vtarg_and_adv(seg, gamma, lam) 241 | # ob, ac, atarg, ret, 
td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) 242 | ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"] 243 | vpredbefore = seg["vpred"] # predicted value function before udpate 244 | atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate 245 | 246 | if hasattr(pi, "ob_rms"): pi.ob_rms.update(ob) # update running mean/std for policy 247 | 248 | args = seg["ob"], seg["ac"], atarg 249 | fvpargs = [arr[::5] for arr in args] 250 | 251 | assign_old_eq_new() # set old parameter values to new parameter values 252 | with timed("computegrad"): 253 | *lossbefore, g = compute_lossandgrad(*args) 254 | lossbefore = allmean(np.array(lossbefore)) 255 | g = allmean(g) 256 | if np.allclose(g, 0): 257 | logger.log("Got zero gradient. not updating") 258 | else: 259 | with timed("cg"): 260 | stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank==0) 261 | assert np.isfinite(stepdir).all() 262 | shs = .5*stepdir.dot(fisher_vector_product(stepdir)) 263 | lm = np.sqrt(shs / max_kl) 264 | # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) 265 | fullstep = stepdir / lm 266 | expectedimprove = g.dot(fullstep) 267 | surrbefore = lossbefore[0] 268 | stepsize = 1.0 269 | thbefore = get_flat() 270 | for _ in range(10): 271 | thnew = thbefore + fullstep * stepsize 272 | set_from_flat(thnew) 273 | meanlosses = surr, kl, *_ = allmean(np.array(compute_losses(*args))) 274 | improve = surr - surrbefore 275 | logger.log("Expected: %.3f Actual: %.3f"%(expectedimprove, improve)) 276 | if not np.isfinite(meanlosses).all(): 277 | logger.log("Got non-finite value of losses -- bad!") 278 | elif kl > max_kl * 1.5: 279 | logger.log("violated KL constraint. shrinking step.") 280 | elif improve < 0: 281 | logger.log("surrogate didn't improve. 
shrinking step.") 282 | else: 283 | logger.log("Stepsize OK!") 284 | break 285 | stepsize *= .5 286 | else: 287 | logger.log("couldn't compute a good step") 288 | set_from_flat(thbefore) 289 | if nworkers > 1 and iters_so_far % 20 == 0: 290 | paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), vfadam.getflat().sum())) # list of tuples 291 | assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) 292 | with timed("vf"): 293 | for _ in range(vf_iters): 294 | for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]), 295 | include_final_partial_batch=False, batch_size=128): 296 | if hasattr(pi, "ob_rms"): pi.ob_rms.update(mbob) # update running mean/std for policy 297 | g = allmean(compute_vflossandgrad(mbob, mbret)) 298 | vfadam.update(g, vf_stepsize) 299 | 300 | g_losses = meanlosses 301 | for (lossname, lossval) in zip(loss_names, meanlosses): 302 | logger.record_tabular(lossname, lossval) 303 | logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) 304 | # ------------------ Update D ------------------ 305 | logger.log("Optimizing Discriminator...") 306 | logger.log(fmt_row(13, discriminator.loss_name)) 307 | ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob)) 308 | batch_size = len(ob) // d_step 309 | d_losses = [] # list of tuples, each of which gives the loss for a minibatch 310 | for ob_batch, ac_batch in dataset.iterbatches((ob, ac), 311 | include_final_partial_batch=False, batch_size=batch_size): 312 | ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch)) 313 | # update running mean/std for discriminator 314 | if hasattr(discriminator, "obs_rms"): discriminator.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0)) 315 | *newlosses, g = discriminator.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert) 316 | d_adam.update(allmean(g), d_stepsize) 317 | d_losses.append(newlosses) 318 | logger.log(fmt_row(13, np.mean(d_losses, axis=0))) 319 | 320 | 321 | lrlocal = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"]) # local values 322 | listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples 323 | lens, rews, true_rets = map(flatten_lists, zip(*listoflrpairs)) 324 | true_rewbuffer.extend(true_rets) 325 | lenbuffer.extend(lens) 326 | rewbuffer.extend(rews) 327 | 328 | logger.record_tabular("EpLenMean", np.mean(lenbuffer)) 329 | logger.record_tabular("EpRewMean", np.mean(rewbuffer)) 330 | logger.record_tabular("EpTrueRewMean", np.mean(true_rewbuffer)) 331 | logger.record_tabular("EpThisIter", len(lens)) 332 | episodes_so_far += len(lens) 333 | timesteps_so_far += sum(lens) 334 | iters_so_far += 1 335 | 336 | logger.record_tabular("EpisodesSoFar", episodes_so_far) 337 | logger.record_tabular("TimestepsSoFar", timesteps_so_far) 338 | logger.record_tabular("TimeElapsed", time.time() - tstart) 339 | 340 | if rank==0: 341 | logger.dump_tabular() 342 | g_loss_stats.add_all_summary(writer, g_losses, iters_so_far) 343 | d_loss_stats.add_all_summary(writer, np.mean(d_losses, axis=0), iters_so_far) 344 | ep_stats.add_all_summary(writer, [np.mean(true_rewbuffer), np.mean(rewbuffer), 345 | np.mean(lenbuffer)], iters_so_far) 346 | 347 | # Sample one trajectory (until trajectory end) 348 | def traj_episode_generator(pi, env, horizon, stochastic): 349 | t = 0 350 | ac = env.action_space.sample() # not used, just so we have the datatype 351 | new = True # marks if we're on first timestep of an episode 352 | 353 | ob = env.reset() 354 | cur_ep_ret = 0 # return in current episode 355 | cur_ep_len = 0 # len of current 
episode 356 | 357 | # Initialize history arrays 358 | obs = []; rews = []; news = []; acs = [] 359 | 360 | while True: 361 | prevac = ac 362 | ac, vpred = pi.act(stochastic, ob) 363 | obs.append(ob) 364 | news.append(new) 365 | acs.append(ac) 366 | 367 | ob, rew, new, _ = env.step(ac) 368 | rews.append(rew) 369 | 370 | cur_ep_ret += rew 371 | cur_ep_len += 1 372 | if t > 0 and (new or t % horizon == 0): 373 | # convert list into numpy array 374 | obs = np.array(obs) 375 | rews = np.array(rews) 376 | news = np.array(news) 377 | acs = np.array(acs) 378 | yield {"ob":obs, "rew":rews, "new":news, "ac":acs, 379 | "ep_ret":cur_ep_ret, "ep_len":cur_ep_len} 380 | ob = env.reset() 381 | cur_ep_ret = 0; cur_ep_len = 0; t = 0 382 | 383 | # Initialize history arrays 384 | obs = []; rews = []; news = []; acs = [] 385 | t += 1 386 | 387 | def evaluate(env, policy_func, load_model_path, timesteps_per_batch, number_trajs=10, 388 | stochastic_policy=False): 389 | 390 | from tqdm import tqdm 391 | # Setup network 392 | # ---------------------------------------- 393 | ob_space = env.observation_space 394 | ac_space = env.action_space 395 | pi = policy_func("pi", ob_space, ac_space, reuse=False) 396 | U.initialize() 397 | # Prepare for rollouts 398 | # ---------------------------------------- 399 | ep_gen = traj_episode_generator(pi, env, timesteps_per_batch, stochastic=stochastic_policy) 400 | U.load_state(load_model_path) 401 | 402 | len_list = [] 403 | ret_list = [] 404 | for _ in tqdm(range(number_trajs)): 405 | traj = ep_gen.__next__() 406 | ep_len, ep_ret = traj['ep_len'], traj['ep_ret'] 407 | len_list.append(ep_len) 408 | ret_list.append(ep_ret) 409 | if stochastic_policy: 410 | print ('stochastic policy:') 411 | else: 412 | print ('deterministic policy:' ) 413 | print ("Average length:", sum(len_list)/len(len_list)) 414 | print ("Average return:", sum(ret_list)/len(ret_list)) 415 | 416 | def flatten_lists(listoflists): 417 | return [el for list_ in listoflists for el in list_] 418 | -------------------------------------------------------------------------------- /gailtf/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/andrewliao11/gail-tf/ad92f41c26c34e8fabc536664fb11b44f25956cf/gailtf/baselines/__init__.py -------------------------------------------------------------------------------- /gailtf/baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from gailtf.baselines.bench.benchmarks import * 2 | from gailtf.baselines.bench.monitor import * 3 | 4 | -------------------------------------------------------------------------------- /gailtf/baselines/bench/benchmarks.py: -------------------------------------------------------------------------------- 1 | _atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] 2 | _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] 3 | 4 | _BENCHMARKS = [] 5 | 6 | def register_benchmark(benchmark): 7 | for b in _BENCHMARKS: 8 | if b['name'] == benchmark['name']: 9 | raise ValueError('Benchmark with name %s already registered!'%b['name']) 10 | _BENCHMARKS.append(benchmark) 11 | 12 | def list_benchmarks(): 13 | return [b['name'] for b in _BENCHMARKS] 14 | 15 | def get_benchmark(benchmark_name): 16 | for b in _BENCHMARKS: 17 | if b['name'] == benchmark_name: 18 | return b 19 | raise ValueError('%s not found! 
Known benchmarks: %s' % (benchmark_name, list_benchmarks())) 20 | 21 | def get_task(benchmark, env_id): 22 | """Get a task by env_id. Return None if the benchmark doesn't have the env""" 23 | return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None) 24 | 25 | def find_task_for_env_id_in_any_benchmark(env_id): 26 | for bm in _BENCHMARKS: 27 | for task in bm["tasks"]: 28 | if task["env_id"]==env_id: 29 | return bm, task 30 | return None, None 31 | 32 | _ATARI_SUFFIX = 'NoFrameskip-v4' 33 | 34 | register_benchmark({ 35 | 'name' : 'Atari200M', 36 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 200M frames', 37 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(200e6)} for _game in _atari7] 38 | }) 39 | 40 | register_benchmark({ 41 | 'name' : 'Atari40M', 42 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames', 43 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atari7] 44 | }) 45 | 46 | register_benchmark({ 47 | 'name' : 'Atari1Hr', 48 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 1 hour of walltime', 49 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_seconds' : 60*60} for _game in _atari7] 50 | }) 51 | 52 | register_benchmark({ 53 | 'name' : 'AtariExploration40M', 54 | 'description' :'7 Atari games emphasizing exploration, with pixel observations, 40M frames', 55 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 2, 'num_timesteps' : int(40e6)} for _game in _atariexpl7] 56 | }) 57 | 58 | 59 | # MuJoCo 60 | 61 | _mujocosmall = [ 62 | 'InvertedDoublePendulum-v1', 'InvertedPendulum-v1', 63 | 'HalfCheetah-v1', 'Hopper-v1', 'Walker2d-v1', 64 | 'Reacher-v1', 'Swimmer-v1'] 65 | register_benchmark({ 66 | 'name' : 'Mujoco1M', 67 | 'description' : 'Some small 2D MuJoCo tasks, run for 1M timesteps', 68 | 'tasks' : [{'env_id' : _envid, 'trials' : 3, 'num_timesteps' : int(1e6)} for _envid in _mujocosmall] 69 | }) 70 | register_benchmark({ 71 | 'name' : 'MujocoWalkers', 72 | 'description' : 'MuJoCo forward walkers, run for 8M, humanoid 100M', 73 | 'tasks' : [ 74 | {'env_id' : "Hopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 75 | {'env_id' : "Walker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 76 | {'env_id' : "Humanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 }, 77 | ] 78 | }) 79 | # To reproduce: 80 | # python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce MujocoWalkers myrun_ppo2_whiteobs1_cpu8 81 | # (observation input filters necessary) 82 | 83 | 84 | # Roboschool 85 | 86 | register_benchmark({ 87 | 'name' : 'Roboschool8M', 88 | 'description' : 'Small 2D tasks, up to 30 minutes to complete on 8 cores', 89 | 'tasks' : [ 90 | {'env_id' : "RoboschoolReacher-v1", 'trials' : 4, 'num_timesteps' : 2*1000000 }, 91 | {'env_id' : "RoboschoolAnt-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 92 | {'env_id' : "RoboschoolHalfCheetah-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 93 | {'env_id' : "RoboschoolHopper-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 94 | {'env_id' : "RoboschoolWalker2d-v1", 'trials' : 4, 'num_timesteps' : 8*1000000 }, 95 | ] 96 | }) 97 | register_benchmark({ 98 | 'name' : 'RoboschoolHarder', 99 | 'description' : 'Test your might!!! 
Up to 12 hours on 32 cores', 100 | 'tasks' : [ 101 | {'env_id' : "RoboschoolHumanoid-v1", 'trials' : 4, 'num_timesteps' : 100*1000000 }, 102 | {'env_id' : "RoboschoolHumanoidFlagrun-v1", 'trials' : 4, 'num_timesteps' : 200*1000000 }, 103 | {'env_id' : "RoboschoolHumanoidFlagrunHarder-v1", 'trials' : 4, 'num_timesteps' : 400*1000000 }, 104 | ] 105 | }) 106 | # To reproduce: 107 | # python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce Roboschool8M myrun_ppo2_cpu8 108 | # python3 baselines/baselines/ppo2/ppo2_run_benchmark.py gce RoboschoolHarder myrun_ppo2_cpu32_large_samples65536 109 | # (Large network, train on 65536 samples each iteration. Also, _large is really necessary only for Harder) 110 | 111 | 112 | # Other 113 | 114 | _atari50 = [ # actually 49 115 | 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 116 | 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', 117 | 'Boxing', 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', 118 | 'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway', 119 | 'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond', 120 | 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', 121 | 'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert', 122 | 'Riverraid', 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 123 | 'StarGunner', 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 124 | 'Venture', 'VideoPinball', 'WizardOfWor', 'Zaxxon', 125 | ] 126 | 127 | register_benchmark({ 128 | 'name' : 'Atari50_40M', 129 | 'description' :'7 Atari games from Mnih et al. (2013), with pixel observations, 40M frames', 130 | 'tasks' : [{'env_id' : _game + _ATARI_SUFFIX, 'trials' : 3, 'num_timesteps' : int(40e6)} for _game in _atari50] 131 | }) 132 | 133 | def env_shortname(s): 134 | "Make typical names above shorter, while keeping recognizable" 135 | s = s.replace("NoFrameskip", "") 136 | if s[:10]=="Roboschool": s = s[10:] 137 | i = s.rfind("-v") 138 | if i!=-1: s = s[:i] 139 | 140 | return s.lower() 141 | -------------------------------------------------------------------------------- /gailtf/baselines/bench/monitor.py: -------------------------------------------------------------------------------- 1 | __all__ = ['Monitor', 'get_monitor_files', 'load_results'] 2 | 3 | import gym 4 | from gym.core import Wrapper 5 | from os import path 6 | import time 7 | from glob import glob 8 | 9 | try: 10 | import ujson as json # Not necessary for monitor writing, but very useful for monitor loading 11 | except ImportError: 12 | import json 13 | 14 | class Monitor(Wrapper): 15 | EXT = "monitor.json" 16 | f = None 17 | 18 | def __init__(self, env, filename, allow_early_resets=False): 19 | Wrapper.__init__(self, env=env) 20 | self.tstart = time.time() 21 | if filename is None: 22 | self.f = None 23 | self.logger = None 24 | else: 25 | if not filename.endswith(Monitor.EXT): 26 | filename = filename + "." 
+ Monitor.EXT 27 | self.f = open(filename, "wt") 28 | self.logger = JSONLogger(self.f) 29 | self.logger.writekvs({"t_start": self.tstart, "gym_version": gym.__version__, 30 | "env_id": env.spec.id if env.spec else 'Unknown'}) 31 | self.allow_early_resets = allow_early_resets 32 | self.rewards = None 33 | self.needs_reset = True 34 | self.episode_rewards = [] 35 | self.episode_lengths = [] 36 | self.total_steps = 0 37 | self.current_metadata = {} # extra info that gets injected into each log entry 38 | # Useful for metalearning where we're modifying the environment externally 39 | # But want our logs to know about these modifications 40 | 41 | def __getstate__(self): # XXX 42 | d = self.__dict__.copy() 43 | if self.f: 44 | del d['f'], d['logger'] 45 | d['_filename'] = self.f.name 46 | d['_num_episodes'] = len(self.episode_rewards) 47 | else: 48 | d['_filename'] = None 49 | return d 50 | def __setstate__(self, d): 51 | filename = d.pop('_filename') 52 | self.__dict__ = d 53 | if filename is not None: 54 | nlines = d.pop('_num_episodes') + 1 55 | self.f = open(filename, "r+t") 56 | for _ in range(nlines): 57 | self.f.readline() 58 | self.f.truncate() 59 | self.logger = JSONLogger(self.f) 60 | 61 | 62 | def reset(self): 63 | if not self.allow_early_resets and not self.needs_reset: 64 | raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") 65 | self.rewards = [] 66 | self.needs_reset = False 67 | return self.env.reset() 68 | 69 | def step(self, action): 70 | if self.needs_reset: 71 | raise RuntimeError("Tried to step environment that needs reset") 72 | ob, rew, done, info = self.env.step(action) 73 | self.rewards.append(rew) 74 | if done: 75 | self.needs_reset = True 76 | eprew = sum(self.rewards) 77 | eplen = len(self.rewards) 78 | epinfo = {"r": eprew, "l": eplen, "t": round(time.time() - self.tstart, 6)} 79 | epinfo.update(self.current_metadata) 80 | if self.logger: 81 | self.logger.writekvs(epinfo) 82 | self.episode_rewards.append(eprew) 83 | self.episode_lengths.append(eplen) 84 | info['episode'] = epinfo 85 | self.total_steps += 1 86 | return (ob, rew, done, info) 87 | 88 | def close(self): 89 | if self.f is not None: 90 | self.f.close() 91 | 92 | def get_total_steps(self): 93 | return self.total_steps 94 | 95 | def get_episode_rewards(self): 96 | return self.episode_rewards 97 | 98 | def get_episode_lengths(self): 99 | return self.episode_lengths 100 | 101 | class JSONLogger(object): 102 | def __init__(self, file): 103 | self.file = file 104 | 105 | def writekvs(self, kvs): 106 | for k,v in kvs.items(): 107 | if hasattr(v, 'dtype'): 108 | v = v.tolist() 109 | kvs[k] = float(v) 110 | self.file.write(json.dumps(kvs) + '\n') 111 | self.file.flush() 112 | 113 | 114 | class LoadMonitorResultsError(Exception): 115 | pass 116 | 117 | def get_monitor_files(dir): 118 | return glob(path.join(dir, "*" + Monitor.EXT)) 119 | 120 | def load_results(dir, raw_episodes=False): 121 | fnames = get_monitor_files(dir) 122 | if not fnames: 123 | raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) 124 | episodes = [] 125 | headers = [] 126 | for fname in fnames: 127 | with open(fname, 'rt') as fh: 128 | lines = fh.readlines() 129 | header = json.loads(lines[0]) 130 | headers.append(header) 131 | for line in lines[1:]: 132 | episode = json.loads(line) 133 | episode['abstime'] = header['t_start'] + episode['t'] 134 | del episode['t'] 135 | 
episodes.append(episode) 136 | header0 = headers[0] 137 | for header in headers[1:]: 138 | assert header['env_id'] == header0['env_id'], "mixing data from two envs" 139 | episodes = sorted(episodes, key=lambda e: e['abstime']) 140 | if raw_episodes: 141 | return episodes 142 | else: 143 | return { 144 | 'env_info': {'env_id': header0['env_id'], 'gym_version': header0['gym_version']}, 145 | 'episode_end_times': [e['abstime'] for e in episodes], 146 | 'episode_lengths': [e['l'] for e in episodes], 147 | 'episode_rewards': [e['r'] for e in episodes], 148 | 'initial_reset_time': min([min(header['t_start'] for header in headers)]) 149 | } 150 | -------------------------------------------------------------------------------- /gailtf/baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | from gailtf.baselines.common.console_util import * 2 | from gailtf.baselines.common.dataset import Dataset 3 | from gailtf.baselines.common.math_util import * 4 | from gailtf.baselines.common.misc_util import * 5 | -------------------------------------------------------------------------------- /gailtf/baselines/common/atari_wrappers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from collections import deque 3 | from PIL import Image 4 | import gym 5 | from gym import spaces 6 | 7 | 8 | class NoopResetEnv(gym.Wrapper): 9 | def __init__(self, env, noop_max=30): 10 | """Sample initial states by taking random number of no-ops on reset. 11 | No-op is assumed to be action 0. 12 | """ 13 | gym.Wrapper.__init__(self, env) 14 | self.noop_max = noop_max 15 | self.override_num_noops = None 16 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 17 | 18 | def _reset(self): 19 | """ Do no-op action for a number of steps in [1, noop_max].""" 20 | self.env.reset() 21 | if self.override_num_noops is not None: 22 | noops = self.override_num_noops 23 | else: 24 | noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) #pylint: disable=E1101 25 | assert noops > 0 26 | obs = None 27 | for _ in range(noops): 28 | obs, _, done, _ = self.env.step(0) 29 | if done: 30 | obs = self.env.reset() 31 | return obs 32 | 33 | class FireResetEnv(gym.Wrapper): 34 | def __init__(self, env): 35 | """Take action on reset for environments that are fixed until firing.""" 36 | gym.Wrapper.__init__(self, env) 37 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 38 | assert len(env.unwrapped.get_action_meanings()) >= 3 39 | 40 | def _reset(self): 41 | self.env.reset() 42 | obs, _, done, _ = self.env.step(1) 43 | if done: 44 | self.env.reset() 45 | obs, _, done, _ = self.env.step(2) 46 | if done: 47 | self.env.reset() 48 | return obs 49 | 50 | class EpisodicLifeEnv(gym.Wrapper): 51 | def __init__(self, env): 52 | """Make end-of-life == end-of-episode, but only reset on true game over. 53 | Done by DeepMind for the DQN and co. since it helps value estimation. 
54 | """ 55 | gym.Wrapper.__init__(self, env) 56 | self.lives = 0 57 | self.was_real_done = True 58 | 59 | def _step(self, action): 60 | obs, reward, done, info = self.env.step(action) 61 | self.was_real_done = done 62 | # check current lives, make loss of life terminal, 63 | # then update lives to handle bonus lives 64 | lives = self.env.unwrapped.ale.lives() 65 | if lives < self.lives and lives > 0: 66 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 67 | # so its important to keep lives > 0, so that we only reset once 68 | # the environment advertises done. 69 | done = True 70 | self.lives = lives 71 | return obs, reward, done, info 72 | 73 | def _reset(self): 74 | """Reset only when lives are exhausted. 75 | This way all states are still reachable even though lives are episodic, 76 | and the learner need not know about any of this behind-the-scenes. 77 | """ 78 | if self.was_real_done: 79 | obs = self.env.reset() 80 | else: 81 | # no-op step to advance from terminal/lost life state 82 | obs, _, _, _ = self.env.step(0) 83 | self.lives = self.env.unwrapped.ale.lives() 84 | return obs 85 | 86 | class MaxAndSkipEnv(gym.Wrapper): 87 | def __init__(self, env, skip=4): 88 | """Return only every `skip`-th frame""" 89 | gym.Wrapper.__init__(self, env) 90 | # most recent raw observations (for max pooling across time steps) 91 | self._obs_buffer = deque(maxlen=2) 92 | self._skip = skip 93 | 94 | def _step(self, action): 95 | """Repeat action, sum reward, and max over last observations.""" 96 | total_reward = 0.0 97 | done = None 98 | for _ in range(self._skip): 99 | obs, reward, done, info = self.env.step(action) 100 | self._obs_buffer.append(obs) 101 | total_reward += reward 102 | if done: 103 | break 104 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 105 | 106 | return max_frame, total_reward, done, info 107 | 108 | def _reset(self): 109 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 110 | self._obs_buffer.clear() 111 | obs = self.env.reset() 112 | self._obs_buffer.append(obs) 113 | return obs 114 | 115 | class ClipRewardEnv(gym.RewardWrapper): 116 | def _reward(self, reward): 117 | """Bin reward to {+1, 0, -1} by its sign.""" 118 | return np.sign(reward) 119 | 120 | class WarpFrame(gym.ObservationWrapper): 121 | def __init__(self, env): 122 | """Warp frames to 84x84 as done in the Nature paper and later work.""" 123 | gym.ObservationWrapper.__init__(self, env) 124 | self.res = 84 125 | self.observation_space = spaces.Box(low=0, high=255, shape=(self.res, self.res, 1)) 126 | 127 | def _observation(self, obs): 128 | frame = np.dot(obs.astype('float32'), np.array([0.299, 0.587, 0.114], 'float32')) 129 | frame = np.array(Image.fromarray(frame).resize((self.res, self.res), 130 | resample=Image.BILINEAR), dtype=np.uint8) 131 | return frame.reshape((self.res, self.res, 1)) 132 | 133 | class FrameStack(gym.Wrapper): 134 | def __init__(self, env, k): 135 | """Buffer observations and stack across channels (last axis).""" 136 | gym.Wrapper.__init__(self, env) 137 | self.k = k 138 | self.frames = deque([], maxlen=k) 139 | shp = env.observation_space.shape 140 | assert shp[2] == 1 # can only stack 1-channel frames 141 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], k)) 142 | 143 | def _reset(self): 144 | """Clear buffer and re-fill by duplicating the first observation.""" 145 | ob = self.env.reset() 146 | for _ in range(self.k): self.frames.append(ob) 147 | return self._observation() 148 | 149 | def _step(self, action): 150 | ob, reward, done, info = self.env.step(action) 151 | self.frames.append(ob) 152 | return self._observation(), reward, done, info 153 | 154 | def _observation(self): 155 | assert len(self.frames) == self.k 156 | return np.concatenate(self.frames, axis=2) 157 | 158 | def wrap_deepmind(env, episode_life=True, clip_rewards=True): 159 | """Configure environment for DeepMind-style Atari. 160 | 161 | Note: this does not include frame stacking!""" 162 | assert 'NoFrameskip' in env.spec.id # required for DeepMind-style skip 163 | if episode_life: 164 | env = EpisodicLifeEnv(env) 165 | env = NoopResetEnv(env, noop_max=30) 166 | env = MaxAndSkipEnv(env, skip=4) 167 | if 'FIRE' in env.unwrapped.get_action_meanings(): 168 | env = FireResetEnv(env) 169 | env = WarpFrame(env) 170 | if clip_rewards: 171 | env = ClipRewardEnv(env) 172 | return env 173 | -------------------------------------------------------------------------------- /gailtf/baselines/common/atari_wrappers_deprecated.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import gym 3 | import numpy as np 4 | 5 | from collections import deque 6 | from gym import spaces 7 | 8 | 9 | class NoopResetEnv(gym.Wrapper): 10 | def __init__(self, env=None, noop_max=30): 11 | """Sample initial states by taking random number of no-ops on reset. 12 | No-op is assumed to be action 0. 
13 | """ 14 | super(NoopResetEnv, self).__init__(env) 15 | self.noop_max = noop_max 16 | self.override_num_noops = None 17 | assert env.unwrapped.get_action_meanings()[0] == 'NOOP' 18 | 19 | def _reset(self): 20 | """ Do no-op action for a number of steps in [1, noop_max].""" 21 | self.env.reset() 22 | if self.override_num_noops is not None: 23 | noops = self.override_num_noops 24 | else: 25 | noops = np.random.randint(1, self.noop_max + 1) 26 | assert noops > 0 27 | obs = None 28 | for _ in range(noops): 29 | obs, _, done, _ = self.env.step(0) 30 | if done: 31 | obs = self.env.reset() 32 | return obs 33 | 34 | 35 | class FireResetEnv(gym.Wrapper): 36 | def __init__(self, env=None): 37 | """For environments where the user need to press FIRE for the game to start.""" 38 | super(FireResetEnv, self).__init__(env) 39 | assert env.unwrapped.get_action_meanings()[1] == 'FIRE' 40 | assert len(env.unwrapped.get_action_meanings()) >= 3 41 | 42 | def _reset(self): 43 | self.env.reset() 44 | obs, _, done, _ = self.env.step(1) 45 | if done: 46 | self.env.reset() 47 | obs, _, done, _ = self.env.step(2) 48 | if done: 49 | self.env.reset() 50 | return obs 51 | 52 | 53 | class EpisodicLifeEnv(gym.Wrapper): 54 | def __init__(self, env=None): 55 | """Make end-of-life == end-of-episode, but only reset on true game over. 56 | Done by DeepMind for the DQN and co. since it helps value estimation. 57 | """ 58 | super(EpisodicLifeEnv, self).__init__(env) 59 | self.lives = 0 60 | self.was_real_done = True 61 | self.was_real_reset = False 62 | 63 | def _step(self, action): 64 | obs, reward, done, info = self.env.step(action) 65 | self.was_real_done = done 66 | # check current lives, make loss of life terminal, 67 | # then update lives to handle bonus lives 68 | lives = self.env.unwrapped.ale.lives() 69 | if lives < self.lives and lives > 0: 70 | # for Qbert somtimes we stay in lives == 0 condtion for a few frames 71 | # so its important to keep lives > 0, so that we only reset once 72 | # the environment advertises done. 73 | done = True 74 | self.lives = lives 75 | return obs, reward, done, info 76 | 77 | def _reset(self): 78 | """Reset only when lives are exhausted. 79 | This way all states are still reachable even though lives are episodic, 80 | and the learner need not know about any of this behind-the-scenes. 81 | """ 82 | if self.was_real_done: 83 | obs = self.env.reset() 84 | self.was_real_reset = True 85 | else: 86 | # no-op step to advance from terminal/lost life state 87 | obs, _, _, _ = self.env.step(0) 88 | self.was_real_reset = False 89 | self.lives = self.env.unwrapped.ale.lives() 90 | return obs 91 | 92 | 93 | class MaxAndSkipEnv(gym.Wrapper): 94 | def __init__(self, env=None, skip=4): 95 | """Return only every `skip`-th frame""" 96 | super(MaxAndSkipEnv, self).__init__(env) 97 | # most recent raw observations (for max pooling across time steps) 98 | self._obs_buffer = deque(maxlen=2) 99 | self._skip = skip 100 | 101 | def _step(self, action): 102 | total_reward = 0.0 103 | done = None 104 | for _ in range(self._skip): 105 | obs, reward, done, info = self.env.step(action) 106 | self._obs_buffer.append(obs) 107 | total_reward += reward 108 | if done: 109 | break 110 | 111 | max_frame = np.max(np.stack(self._obs_buffer), axis=0) 112 | 113 | return max_frame, total_reward, done, info 114 | 115 | def _reset(self): 116 | """Clear past frame buffer and init. to first obs. 
from inner env.""" 117 | self._obs_buffer.clear() 118 | obs = self.env.reset() 119 | self._obs_buffer.append(obs) 120 | return obs 121 | 122 | 123 | class ProcessFrame84(gym.ObservationWrapper): 124 | def __init__(self, env=None): 125 | super(ProcessFrame84, self).__init__(env) 126 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 127 | 128 | def _observation(self, obs): 129 | return ProcessFrame84.process(obs) 130 | 131 | @staticmethod 132 | def process(frame): 133 | if frame.size == 210 * 160 * 3: 134 | img = np.reshape(frame, [210, 160, 3]).astype(np.float32) 135 | elif frame.size == 250 * 160 * 3: 136 | img = np.reshape(frame, [250, 160, 3]).astype(np.float32) 137 | else: 138 | assert False, "Unknown resolution." 139 | img = img[:, :, 0] * 0.299 + img[:, :, 1] * 0.587 + img[:, :, 2] * 0.114 140 | resized_screen = cv2.resize(img, (84, 110), interpolation=cv2.INTER_AREA) 141 | x_t = resized_screen[18:102, :] 142 | x_t = np.reshape(x_t, [84, 84, 1]) 143 | return x_t.astype(np.uint8) 144 | 145 | 146 | class ClippedRewardsWrapper(gym.RewardWrapper): 147 | def _reward(self, reward): 148 | """Change all the positive rewards to 1, negative to -1 and keep zero.""" 149 | return np.sign(reward) 150 | 151 | 152 | class LazyFrames(object): 153 | def __init__(self, frames): 154 | """This object ensures that common frames between the observations are only stored once. 155 | It exists purely to optimize memory usage which can be huge for DQN's 1M frames replay 156 | buffers. 157 | 158 | This object should only be converted to numpy array before being passed to the model. 159 | 160 | You'd not belive how complex the previous solution was.""" 161 | self._frames = frames 162 | 163 | def __array__(self, dtype=None): 164 | out = np.concatenate(self._frames, axis=2) 165 | if dtype is not None: 166 | out = out.astype(dtype) 167 | return out 168 | 169 | 170 | class FrameStack(gym.Wrapper): 171 | def __init__(self, env, k): 172 | """Stack k last frames. 173 | 174 | Returns lazy array, which is much more memory efficient. 175 | 176 | See Also 177 | -------- 178 | baselines.common.atari_wrappers.LazyFrames 179 | """ 180 | gym.Wrapper.__init__(self, env) 181 | self.k = k 182 | self.frames = deque([], maxlen=k) 183 | shp = env.observation_space.shape 184 | self.observation_space = spaces.Box(low=0, high=255, shape=(shp[0], shp[1], shp[2] * k)) 185 | 186 | def _reset(self): 187 | ob = self.env.reset() 188 | for _ in range(self.k): 189 | self.frames.append(ob) 190 | return self._get_ob() 191 | 192 | def _step(self, action): 193 | ob, reward, done, info = self.env.step(action) 194 | self.frames.append(ob) 195 | return self._get_ob(), reward, done, info 196 | 197 | def _get_ob(self): 198 | assert len(self.frames) == self.k 199 | return LazyFrames(list(self.frames)) 200 | 201 | 202 | class ScaledFloatFrame(gym.ObservationWrapper): 203 | def _observation(self, obs): 204 | # careful! This undoes the memory optimization, use 205 | # with smaller replay buffers only. 
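        # convert the (possibly LazyFrames) uint8 observation to float32 scaled to [0, 1]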
206 | return np.array(obs).astype(np.float32) / 255.0 207 | 208 | 209 | def wrap_dqn(env): 210 | """Apply a common set of wrappers for Atari games.""" 211 | assert 'NoFrameskip' in env.spec.id 212 | env = EpisodicLifeEnv(env) 213 | env = NoopResetEnv(env, noop_max=30) 214 | env = MaxAndSkipEnv(env, skip=4) 215 | if 'FIRE' in env.unwrapped.get_action_meanings(): 216 | env = FireResetEnv(env) 217 | env = ProcessFrame84(env) 218 | env = FrameStack(env, 4) 219 | env = ClippedRewardsWrapper(env) 220 | return env 221 | 222 | 223 | class A2cProcessFrame(gym.Wrapper): 224 | def __init__(self, env): 225 | gym.Wrapper.__init__(self, env) 226 | self.observation_space = spaces.Box(low=0, high=255, shape=(84, 84, 1)) 227 | 228 | def _step(self, action): 229 | ob, reward, done, info = self.env.step(action) 230 | return A2cProcessFrame.process(ob), reward, done, info 231 | 232 | def _reset(self): 233 | return A2cProcessFrame.process(self.env.reset()) 234 | 235 | @staticmethod 236 | def process(frame): 237 | frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) 238 | frame = cv2.resize(frame, (84, 84), interpolation=cv2.INTER_AREA) 239 | return frame.reshape(84, 84, 1) 240 | -------------------------------------------------------------------------------- /gailtf/baselines/common/azure_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import zipfile 4 | 5 | from azure.common import AzureMissingResourceHttpError 6 | try: 7 | from azure.storage.blob import BlobService 8 | except ImportError: 9 | from azure.storage.blob import BlockBlobService as BlobService 10 | from shutil import unpack_archive 11 | from threading import Event 12 | 13 | """TODOS: 14 | - use Azure snapshots instead of hacky backups 15 | """ 16 | 17 | 18 | def fixed_list_blobs(service, *args, **kwargs): 19 | """By defualt list_containers only returns a subset of results. 20 | 21 | This function attempts to fix this. 
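    It repeatedly calls list_blobs, feeding the returned next_marker back in as
    the `marker` argument until the service signals that no results remain.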
22 | """ 23 | res = [] 24 | next_marker = None 25 | while next_marker is None or len(next_marker) > 0: 26 | kwargs['marker'] = next_marker 27 | gen = service.list_blobs(*args, **kwargs) 28 | for b in gen: 29 | res.append(b.name) 30 | next_marker = gen.next_marker 31 | return res 32 | 33 | 34 | def make_archive(source_path, dest_path): 35 | if source_path.endswith(os.path.sep): 36 | source_path = source_path.rstrip(os.path.sep) 37 | prefix_path = os.path.dirname(source_path) 38 | with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_STORED) as zf: 39 | if os.path.isdir(source_path): 40 | for dirname, subdirs, files in os.walk(source_path): 41 | zf.write(dirname, os.path.relpath(dirname, prefix_path)) 42 | for filename in files: 43 | filepath = os.path.join(dirname, filename) 44 | zf.write(filepath, os.path.relpath(filepath, prefix_path)) 45 | else: 46 | zf.write(source_path, os.path.relpath(source_path, prefix_path)) 47 | 48 | 49 | class Container(object): 50 | services = {} 51 | 52 | def __init__(self, account_name, account_key, container_name, maybe_create=False): 53 | self._account_name = account_name 54 | self._container_name = container_name 55 | if account_name not in Container.services: 56 | Container.services[account_name] = BlobService(account_name, account_key) 57 | self._service = Container.services[account_name] 58 | if maybe_create: 59 | self._service.create_container(self._container_name, fail_on_exist=False) 60 | 61 | def put(self, source_path, blob_name, callback=None): 62 | """Upload a file or directory from `source_path` to azure blob `blob_name`. 63 | 64 | Upload progress can be traced by an optional callback. 65 | """ 66 | upload_done = Event() 67 | 68 | def progress_callback(current, total): 69 | if callback: 70 | callback(current, total) 71 | if current >= total: 72 | upload_done.set() 73 | 74 | # Attempt to make backup if an existing version is already available 75 | try: 76 | x_ms_copy_source = "https://{}.blob.core.windows.net/{}/{}".format( 77 | self._account_name, 78 | self._container_name, 79 | blob_name 80 | ) 81 | self._service.copy_blob( 82 | container_name=self._container_name, 83 | blob_name=blob_name + ".backup", 84 | x_ms_copy_source=x_ms_copy_source 85 | ) 86 | except AzureMissingResourceHttpError: 87 | pass 88 | 89 | with tempfile.TemporaryDirectory() as td: 90 | arcpath = os.path.join(td, "archive.zip") 91 | make_archive(source_path, arcpath) 92 | self._service.put_block_blob_from_path( 93 | container_name=self._container_name, 94 | blob_name=blob_name, 95 | file_path=arcpath, 96 | max_connections=4, 97 | progress_callback=progress_callback, 98 | max_retries=10) 99 | upload_done.wait() 100 | 101 | def get(self, dest_path, blob_name, callback=None): 102 | """Download a file or directory to `dest_path` to azure blob `blob_name`. 103 | 104 | Warning! If directory is downloaded the `dest_path` is the parent directory. 105 | 106 | Upload progress can be traced by an optional callback. 
107 | """ 108 | download_done = Event() 109 | 110 | def progress_callback(current, total): 111 | if callback: 112 | callback(current, total) 113 | if current >= total: 114 | download_done.set() 115 | 116 | with tempfile.TemporaryDirectory() as td: 117 | arcpath = os.path.join(td, "archive.zip") 118 | for backup_blob_name in [blob_name, blob_name + '.backup']: 119 | try: 120 | properties = self._service.get_blob_properties( 121 | blob_name=backup_blob_name, 122 | container_name=self._container_name 123 | ) 124 | if hasattr(properties, 'properties'): 125 | # Annoyingly, Azure has changed the API and this now returns a blob 126 | # instead of it's properties with up-to-date azure package. 127 | blob_size = properties.properties.content_length 128 | else: 129 | blob_size = properties['content-length'] 130 | if int(blob_size) > 0: 131 | self._service.get_blob_to_path( 132 | container_name=self._container_name, 133 | blob_name=backup_blob_name, 134 | file_path=arcpath, 135 | max_connections=4, 136 | progress_callback=progress_callback) 137 | unpack_archive(arcpath, dest_path) 138 | download_done.wait() 139 | return True 140 | except AzureMissingResourceHttpError: 141 | pass 142 | return False 143 | 144 | def list(self, prefix=None): 145 | """List all blobs in the container.""" 146 | return fixed_list_blobs(self._service, self._container_name, prefix=prefix) 147 | 148 | def exists(self, blob_name): 149 | """Returns true if `blob_name` exists in container.""" 150 | try: 151 | self._service.get_blob_properties( 152 | blob_name=blob_name, 153 | container_name=self._container_name 154 | ) 155 | return True 156 | except AzureMissingResourceHttpError: 157 | return False 158 | -------------------------------------------------------------------------------- /gailtf/baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x -------------------------------------------------------------------------------- /gailtf/baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, float): rep = 
"%g"%x 20 | else: rep = str(x) 21 | return " "*(l - len(rep)) + rep 22 | 23 | color2num = dict( 24 | gray=30, 25 | red=31, 26 | green=32, 27 | yellow=33, 28 | blue=34, 29 | magenta=35, 30 | cyan=36, 31 | white=37, 32 | crimson=38 33 | ) 34 | 35 | def colorize(string, color, bold=False, highlight=False): 36 | attr = [] 37 | num = color2num[color] 38 | if highlight: num += 10 39 | attr.append(str(num)) 40 | if bold: attr.append('1') 41 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 42 | 43 | 44 | MESSAGE_DEPTH = 0 45 | 46 | @contextmanager 47 | def timed(msg): 48 | global MESSAGE_DEPTH #pylint: disable=W0603 49 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 50 | tstart = time.time() 51 | MESSAGE_DEPTH += 1 52 | yield 53 | MESSAGE_DEPTH -= 1 54 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 55 | -------------------------------------------------------------------------------- /gailtf/baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /gailtf/baselines/common/distributions.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gailtf.baselines.common.tf_util as U 4 | from tensorflow.python.ops import math_ops 5 | from tensorflow.python.ops 
import nn 6 | 7 | class Pd(object): 8 | """ 9 | A particular probability distribution 10 | """ 11 | def flatparam(self): 12 | raise NotImplementedError 13 | def mode(self): 14 | raise NotImplementedError 15 | def neglogp(self, x): 16 | # Usually it's easier to define the negative logprob 17 | raise NotImplementedError 18 | def kl(self, other): 19 | raise NotImplementedError 20 | def entropy(self): 21 | raise NotImplementedError 22 | def sample(self): 23 | raise NotImplementedError 24 | def logp(self, x): 25 | return - self.neglogp(x) 26 | 27 | class PdType(object): 28 | """ 29 | Parametrized family of probability distributions 30 | """ 31 | def pdclass(self): 32 | raise NotImplementedError 33 | def pdfromflat(self, flat): 34 | return self.pdclass()(flat) 35 | def param_shape(self): 36 | raise NotImplementedError 37 | def sample_shape(self): 38 | raise NotImplementedError 39 | def sample_dtype(self): 40 | raise NotImplementedError 41 | 42 | def param_placeholder(self, prepend_shape, name=None): 43 | return tf.placeholder(dtype=tf.float32, shape=prepend_shape+self.param_shape(), name=name) 44 | def sample_placeholder(self, prepend_shape, name=None): 45 | return tf.placeholder(dtype=self.sample_dtype(), shape=prepend_shape+self.sample_shape(), name=name) 46 | 47 | class CategoricalPdType(PdType): 48 | def __init__(self, ncat): 49 | self.ncat = ncat 50 | def pdclass(self): 51 | return CategoricalPd 52 | def param_shape(self): 53 | return [self.ncat] 54 | def sample_shape(self): 55 | return [] 56 | def sample_dtype(self): 57 | return tf.int32 58 | 59 | 60 | class MultiCategoricalPdType(PdType): 61 | def __init__(self, low, high): 62 | self.low = low 63 | self.high = high 64 | self.ncats = high - low + 1 65 | def pdclass(self): 66 | return MultiCategoricalPd 67 | def pdfromflat(self, flat): 68 | return MultiCategoricalPd(self.low, self.high, flat) 69 | def param_shape(self): 70 | return [sum(self.ncats)] 71 | def sample_shape(self): 72 | return [len(self.ncats)] 73 | def sample_dtype(self): 74 | return tf.int32 75 | 76 | class DiagGaussianPdType(PdType): 77 | def __init__(self, size): 78 | self.size = size 79 | def pdclass(self): 80 | return DiagGaussianPd 81 | def param_shape(self): 82 | return [2*self.size] 83 | def sample_shape(self): 84 | return [self.size] 85 | def sample_dtype(self): 86 | return tf.float32 87 | 88 | class BernoulliPdType(PdType): 89 | def __init__(self, size): 90 | self.size = size 91 | def pdclass(self): 92 | return BernoulliPd 93 | def param_shape(self): 94 | return [self.size] 95 | def sample_shape(self): 96 | return [self.size] 97 | def sample_dtype(self): 98 | return tf.int32 99 | 100 | # WRONG SECOND DERIVATIVES 101 | # class CategoricalPd(Pd): 102 | # def __init__(self, logits): 103 | # self.logits = logits 104 | # self.ps = tf.nn.softmax(logits) 105 | # @classmethod 106 | # def fromflat(cls, flat): 107 | # return cls(flat) 108 | # def flatparam(self): 109 | # return self.logits 110 | # def mode(self): 111 | # return U.argmax(self.logits, axis=-1) 112 | # def logp(self, x): 113 | # return -tf.nn.sparse_softmax_cross_entropy_with_logits(self.logits, x) 114 | # def kl(self, other): 115 | # return tf.nn.softmax_cross_entropy_with_logits(other.logits, self.ps) \ 116 | # - tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 117 | # def entropy(self): 118 | # return tf.nn.softmax_cross_entropy_with_logits(self.logits, self.ps) 119 | # def sample(self): 120 | # u = tf.random_uniform(tf.shape(self.logits)) 121 | # return U.argmax(self.logits - tf.log(-tf.log(u)), 
axis=-1) 122 | 123 | class CategoricalPd(Pd): 124 | def __init__(self, logits): 125 | self.logits = logits 126 | def flatparam(self): 127 | return self.logits 128 | def mode(self): 129 | return U.argmax(self.logits, axis=-1) 130 | def neglogp(self, x): 131 | # return tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits, labels=x) 132 | # Note: we can't use sparse_softmax_cross_entropy_with_logits because 133 | # the implementation does not allow second-order derivatives... 134 | one_hot_actions = tf.one_hot(x, self.logits.get_shape().as_list()[-1]) 135 | return tf.nn.softmax_cross_entropy_with_logits( 136 | logits=self.logits, 137 | labels=one_hot_actions) 138 | def kl(self, other): 139 | a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True) 140 | a1 = other.logits - U.max(other.logits, axis=-1, keepdims=True) 141 | ea0 = tf.exp(a0) 142 | ea1 = tf.exp(a1) 143 | z0 = U.sum(ea0, axis=-1, keepdims=True) 144 | z1 = U.sum(ea1, axis=-1, keepdims=True) 145 | p0 = ea0 / z0 146 | return U.sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=-1) 147 | def entropy(self): 148 | a0 = self.logits - U.max(self.logits, axis=-1, keepdims=True) 149 | ea0 = tf.exp(a0) 150 | z0 = U.sum(ea0, axis=-1, keepdims=True) 151 | p0 = ea0 / z0 152 | return U.sum(p0 * (tf.log(z0) - a0), axis=-1) 153 | def sample(self): 154 | u = tf.random_uniform(tf.shape(self.logits)) 155 | return tf.argmax(self.logits - tf.log(-tf.log(u)), axis=-1) 156 | @classmethod 157 | def fromflat(cls, flat): 158 | return cls(flat) 159 | 160 | class MultiCategoricalPd(Pd): 161 | def __init__(self, low, high, flat): 162 | self.flat = flat 163 | self.low = tf.constant(low, dtype=tf.int32) 164 | self.categoricals = list(map(CategoricalPd, tf.split(flat, high - low + 1, axis=len(flat.get_shape()) - 1))) 165 | def flatparam(self): 166 | return self.flat 167 | def mode(self): 168 | return self.low + tf.cast(tf.stack([p.mode() for p in self.categoricals], axis=-1), tf.int32) 169 | def neglogp(self, x): 170 | return tf.add_n([p.neglogp(px) for p, px in zip(self.categoricals, tf.unstack(x - self.low, axis=len(x.get_shape()) - 1))]) 171 | def kl(self, other): 172 | return tf.add_n([ 173 | p.kl(q) for p, q in zip(self.categoricals, other.categoricals) 174 | ]) 175 | def entropy(self): 176 | return tf.add_n([p.entropy() for p in self.categoricals]) 177 | def sample(self): 178 | return self.low + tf.cast(tf.stack([p.sample() for p in self.categoricals], axis=-1), tf.int32) 179 | @classmethod 180 | def fromflat(cls, flat): 181 | raise NotImplementedError 182 | 183 | class DiagGaussianPd(Pd): 184 | def __init__(self, flat): 185 | self.flat = flat 186 | mean, logstd = tf.split(axis=len(flat.shape)-1, num_or_size_splits=2, value=flat) 187 | self.mean = mean 188 | self.logstd = logstd 189 | self.std = tf.exp(logstd) 190 | def flatparam(self): 191 | return self.flat 192 | def mode(self): 193 | return self.mean 194 | def neglogp(self, x): 195 | return 0.5 * U.sum(tf.square((x - self.mean) / self.std), axis=-1) \ 196 | + 0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[-1]) \ 197 | + U.sum(self.logstd, axis=-1) 198 | def kl(self, other): 199 | assert isinstance(other, DiagGaussianPd) 200 | return U.sum(other.logstd - self.logstd + (tf.square(self.std) + tf.square(self.mean - other.mean)) / (2.0 * tf.square(other.std)) - 0.5, axis=-1) 201 | def entropy(self): 202 | return U.sum(self.logstd + .5 * np.log(2.0 * np.pi * np.e), axis=-1) 203 | def sample(self): 204 | return self.mean + self.std * tf.random_normal(tf.shape(self.mean)) 205 | 
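    # For reference, the methods above implement the usual factorized-Gaussian
    # identities, with sums taken over the action dimension of size k:
    #   -log p(x) = 0.5 * sum(((x - mean) / std)^2) + 0.5 * k * log(2*pi) + sum(logstd)
    #   H(p)      = sum(logstd + 0.5 * log(2*pi*e))
    #   KL(p||q)  = sum(logstd_q - logstd_p
    #                   + (std_p^2 + (mean_p - mean_q)^2) / (2 * std_q^2) - 0.5)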
@classmethod 206 | def fromflat(cls, flat): 207 | return cls(flat) 208 | 209 | class BernoulliPd(Pd): 210 | def __init__(self, logits): 211 | self.logits = logits 212 | self.ps = tf.sigmoid(logits) 213 | def flatparam(self): 214 | return self.logits 215 | def mode(self): 216 | return tf.round(self.ps) 217 | def neglogp(self, x): 218 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=tf.to_float(x)), axis=-1) 219 | def kl(self, other): 220 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=other.logits, labels=self.ps), axis=-1) - U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 221 | def entropy(self): 222 | return U.sum(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, labels=self.ps), axis=-1) 223 | def sample(self): 224 | u = tf.random_uniform(tf.shape(self.ps)) 225 | return tf.to_float(math_ops.less(u, self.ps)) 226 | @classmethod 227 | def fromflat(cls, flat): 228 | return cls(flat) 229 | 230 | def make_pdtype(ac_space): 231 | from gym import spaces 232 | if isinstance(ac_space, spaces.Box): 233 | assert len(ac_space.shape) == 1 234 | return DiagGaussianPdType(ac_space.shape[0]) 235 | elif isinstance(ac_space, spaces.Discrete): 236 | return CategoricalPdType(ac_space.n) 237 | elif isinstance(ac_space, spaces.MultiDiscrete): 238 | return MultiCategoricalPdType(ac_space.low, ac_space.high) 239 | elif isinstance(ac_space, spaces.MultiBinary): 240 | return BernoulliPdType(ac_space.n) 241 | else: 242 | raise NotImplementedError 243 | 244 | def shape_el(v, i): 245 | maybe = v.get_shape()[i] 246 | if maybe is not None: 247 | return maybe 248 | else: 249 | return tf.shape(v)[i] 250 | 251 | @U.in_session 252 | def test_probtypes(): 253 | np.random.seed(0) 254 | 255 | pdparam_diag_gauss = np.array([-.2, .3, .4, -.5, .1, -.5, .1, 0.8]) 256 | diag_gauss = DiagGaussianPdType(pdparam_diag_gauss.size // 2) #pylint: disable=E1101 257 | validate_probtype(diag_gauss, pdparam_diag_gauss) 258 | 259 | pdparam_categorical = np.array([-.2, .3, .5]) 260 | categorical = CategoricalPdType(pdparam_categorical.size) #pylint: disable=E1101 261 | validate_probtype(categorical, pdparam_categorical) 262 | 263 | pdparam_bernoulli = np.array([-.2, .3, .5]) 264 | bernoulli = BernoulliPdType(pdparam_bernoulli.size) #pylint: disable=E1101 265 | validate_probtype(bernoulli, pdparam_bernoulli) 266 | 267 | 268 | def validate_probtype(probtype, pdparam): 269 | N = 100000 270 | # Check to see if mean negative log likelihood == differential entropy 271 | Mval = np.repeat(pdparam[None, :], N, axis=0) 272 | M = probtype.param_placeholder([N]) 273 | X = probtype.sample_placeholder([N]) 274 | pd = probtype.pdclass()(M) 275 | calcloglik = U.function([X, M], pd.logp(X)) 276 | calcent = U.function([M], pd.entropy()) 277 | Xval = U.eval(pd.sample(), feed_dict={M:Mval}) 278 | logliks = calcloglik(Xval, Mval) 279 | entval_ll = - logliks.mean() #pylint: disable=E1101 280 | entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 281 | entval = calcent(Mval).mean() #pylint: disable=E1101 282 | assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas 283 | 284 | # Check to see if kldiv[p,q] = - ent[p] - E_p[log q] 285 | M2 = probtype.param_placeholder([N]) 286 | pd2 = probtype.pdclass()(M2) 287 | q = pdparam + np.random.randn(pdparam.size) * 0.1 288 | Mval2 = np.repeat(q[None, :], N, axis=0) 289 | calckl = U.function([M, M2], pd.kl(pd2)) 290 | klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101 291 | logliks = 
calcloglik(Xval, Mval2) 292 | klval_ll = - entval - logliks.mean() #pylint: disable=E1101 293 | klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101 294 | assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas 295 | -------------------------------------------------------------------------------- /gailtf/baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) -------------------------------------------------------------------------------- /gailtf/baselines/common/misc_util.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | import os 4 | import pickle 5 | import random 6 | import tempfile 7 | import time 8 | import zipfile 9 | 10 | 11 | def zipsame(*seqs): 12 | L = len(seqs[0]) 13 | assert all(len(seq) == L for seq in seqs[1:]) 14 | return zip(*seqs) 15 | 16 | 17 | def unpack(seq, sizes): 18 | """ 19 | Unpack 'seq' into a sequence of lists, with lengths specified by 'sizes'. 
20 | None = just one bare element, not a list 21 | 22 | Example: 23 | unpack([1,2,3,4,5,6], [3,None,2]) -> ([1,2,3], 4, [5,6]) 24 | """ 25 | seq = list(seq) 26 | it = iter(seq) 27 | assert sum(1 if s is None else s for s in sizes) == len(seq), "Trying to unpack %s into %s" % (seq, sizes) 28 | for size in sizes: 29 | if size is None: 30 | yield it.__next__() 31 | else: 32 | li = [] 33 | for _ in range(size): 34 | li.append(it.__next__()) 35 | yield li 36 | 37 | 38 | class EzPickle(object): 39 | """Objects that are pickled and unpickled via their constructor 40 | arguments. 41 | 42 | Example usage: 43 | 44 | class Dog(Animal, EzPickle): 45 | def __init__(self, furcolor, tailkind="bushy"): 46 | Animal.__init__() 47 | EzPickle.__init__(furcolor, tailkind) 48 | ... 49 | 50 | When this object is unpickled, a new Dog will be constructed by passing the provided 51 | furcolor and tailkind into the constructor. However, philosophers are still not sure 52 | whether it is still the same dog. 53 | 54 | This is generally needed only for environments which wrap C/C++ code, such as MuJoCo 55 | and Atari. 56 | """ 57 | 58 | def __init__(self, *args, **kwargs): 59 | self._ezpickle_args = args 60 | self._ezpickle_kwargs = kwargs 61 | 62 | def __getstate__(self): 63 | return {"_ezpickle_args": self._ezpickle_args, "_ezpickle_kwargs": self._ezpickle_kwargs} 64 | 65 | def __setstate__(self, d): 66 | out = type(self)(*d["_ezpickle_args"], **d["_ezpickle_kwargs"]) 67 | self.__dict__.update(out.__dict__) 68 | 69 | 70 | def set_global_seeds(i): 71 | try: 72 | import tensorflow as tf 73 | except ImportError: 74 | pass 75 | else: 76 | tf.set_random_seed(i) 77 | np.random.seed(i) 78 | random.seed(i) 79 | 80 | 81 | def pretty_eta(seconds_left): 82 | """Print the number of seconds in human readable format. 83 | 84 | Examples: 85 | 2 days 86 | 2 hours and 37 minutes 87 | less than a minute 88 | 89 | Paramters 90 | --------- 91 | seconds_left: int 92 | Number of seconds to be converted to the ETA 93 | Returns 94 | ------- 95 | eta: str 96 | String representing the pretty ETA. 97 | """ 98 | minutes_left = seconds_left // 60 99 | seconds_left %= 60 100 | hours_left = minutes_left // 60 101 | minutes_left %= 60 102 | days_left = hours_left // 24 103 | hours_left %= 24 104 | 105 | def helper(cnt, name): 106 | return "{} {}{}".format(str(cnt), name, ('s' if cnt > 1 else '')) 107 | 108 | if days_left > 0: 109 | msg = helper(days_left, 'day') 110 | if hours_left > 0: 111 | msg += ' and ' + helper(hours_left, 'hour') 112 | return msg 113 | if hours_left > 0: 114 | msg = helper(hours_left, 'hour') 115 | if minutes_left > 0: 116 | msg += ' and ' + helper(minutes_left, 'minute') 117 | return msg 118 | if minutes_left > 0: 119 | return helper(minutes_left, 'minute') 120 | return 'less than a minute' 121 | 122 | 123 | class RunningAvg(object): 124 | def __init__(self, gamma, init_value=None): 125 | """Keep a running estimate of a quantity. This is a bit like mean 126 | but more sensitive to recent changes. 127 | 128 | Parameters 129 | ---------- 130 | gamma: float 131 | Must be between 0 and 1, where 0 is the most sensitive to recent 132 | changes. 133 | init_value: float or None 134 | Initial value of the estimate. If None, it will be set on the first update. 135 | """ 136 | self._value = init_value 137 | self._gamma = gamma 138 | 139 | def update(self, new_val): 140 | """Update the estimate. 141 | 142 | Parameters 143 | ---------- 144 | new_val: float 145 | new observated value of estimated quantity. 
146 | """ 147 | if self._value is None: 148 | self._value = new_val 149 | else: 150 | self._value = self._gamma * self._value + (1.0 - self._gamma) * new_val 151 | 152 | def __float__(self): 153 | """Get the current estimate""" 154 | return self._value 155 | 156 | 157 | class SimpleMonitor(gym.Wrapper): 158 | def __init__(self, env): 159 | """Adds two qunatities to info returned by every step: 160 | 161 | num_steps: int 162 | Number of steps takes so far 163 | rewards: [float] 164 | All the cumulative rewards for the episodes completed so far. 165 | """ 166 | super().__init__(env) 167 | # current episode state 168 | self._current_reward = None 169 | self._num_steps = None 170 | # temporary monitor state that we do not save 171 | self._time_offset = None 172 | self._total_steps = None 173 | # monitor state 174 | self._episode_rewards = [] 175 | self._episode_lengths = [] 176 | self._episode_end_times = [] 177 | 178 | def _reset(self): 179 | obs = self.env.reset() 180 | # recompute temporary state if needed 181 | if self._time_offset is None: 182 | self._time_offset = time.time() 183 | if len(self._episode_end_times) > 0: 184 | self._time_offset -= self._episode_end_times[-1] 185 | if self._total_steps is None: 186 | self._total_steps = sum(self._episode_lengths) 187 | # update monitor state 188 | if self._current_reward is not None: 189 | self._episode_rewards.append(self._current_reward) 190 | self._episode_lengths.append(self._num_steps) 191 | self._episode_end_times.append(time.time() - self._time_offset) 192 | # reset episode state 193 | self._current_reward = 0 194 | self._num_steps = 0 195 | 196 | return obs 197 | 198 | def _step(self, action): 199 | obs, rew, done, info = self.env.step(action) 200 | self._current_reward += rew 201 | self._num_steps += 1 202 | self._total_steps += 1 203 | info['steps'] = self._total_steps 204 | info['rewards'] = self._episode_rewards 205 | return (obs, rew, done, info) 206 | 207 | def get_state(self): 208 | return { 209 | 'env_id': self.env.unwrapped.spec.id, 210 | 'episode_data': { 211 | 'episode_rewards': self._episode_rewards, 212 | 'episode_lengths': self._episode_lengths, 213 | 'episode_end_times': self._episode_end_times, 214 | 'initial_reset_time': 0, 215 | } 216 | } 217 | 218 | def set_state(self, state): 219 | assert state['env_id'] == self.env.unwrapped.spec.id 220 | ed = state['episode_data'] 221 | self._episode_rewards = ed['episode_rewards'] 222 | self._episode_lengths = ed['episode_lengths'] 223 | self._episode_end_times = ed['episode_end_times'] 224 | 225 | 226 | def boolean_flag(parser, name, default=False, help=None): 227 | """Add a boolean flag to argparse parser. 
228 | 229 | Parameters 230 | ---------- 231 | parser: argparse.Parser 232 | parser to add the flag to 233 | name: str 234 | -- will enable the flag, while --no- will disable it 235 | default: bool or None 236 | default value of the flag 237 | help: str 238 | help string for the flag 239 | """ 240 | dest = name.replace('-', '_') 241 | parser.add_argument("--" + name, action="store_true", default=default, dest=dest, help=help) 242 | parser.add_argument("--no-" + name, action="store_false", dest=dest) 243 | 244 | 245 | def get_wrapper_by_name(env, classname): 246 | """Given an a gym environment possibly wrapped multiple times, returns a wrapper 247 | of class named classname or raises ValueError if no such wrapper was applied 248 | 249 | Parameters 250 | ---------- 251 | env: gym.Env of gym.Wrapper 252 | gym environment 253 | classname: str 254 | name of the wrapper 255 | 256 | Returns 257 | ------- 258 | wrapper: gym.Wrapper 259 | wrapper named classname 260 | """ 261 | currentenv = env 262 | while True: 263 | if classname == currentenv.class_name(): 264 | return currentenv 265 | elif isinstance(currentenv, gym.Wrapper): 266 | currentenv = currentenv.env 267 | else: 268 | raise ValueError("Couldn't find wrapper named %s" % classname) 269 | 270 | 271 | def relatively_safe_pickle_dump(obj, path, compression=False): 272 | """This is just like regular pickle dump, except from the fact that failure cases are 273 | different: 274 | 275 | - It's never possible that we end up with a pickle in corrupted state. 276 | - If a there was a different file at the path, that file will remain unchanged in the 277 | even of failure (provided that filesystem rename is atomic). 278 | - it is sometimes possible that we end up with useless temp file which needs to be 279 | deleted manually (it will be removed automatically on the next function call) 280 | 281 | The indended use case is periodic checkpoints of experiment state, such that we never 282 | corrupt previous checkpoints if the current one fails. 283 | 284 | Parameters 285 | ---------- 286 | obj: object 287 | object to pickle 288 | path: str 289 | path to the output file 290 | compression: bool 291 | if true pickle will be compressed 292 | """ 293 | temp_storage = path + ".relatively_safe" 294 | if compression: 295 | # Using gzip here would be simpler, but the size is limited to 2GB 296 | with tempfile.NamedTemporaryFile() as uncompressed_file: 297 | pickle.dump(obj, uncompressed_file) 298 | with zipfile.ZipFile(temp_storage, "w", compression=zipfile.ZIP_DEFLATED) as myzip: 299 | myzip.write(uncompressed_file.name, "data") 300 | else: 301 | with open(temp_storage, "wb") as f: 302 | pickle.dump(obj, f) 303 | os.rename(temp_storage, path) 304 | 305 | 306 | def pickle_load(path, compression=False): 307 | """Unpickle a possible compressed pickle. 308 | 309 | Parameters 310 | ---------- 311 | path: str 312 | path to the output file 313 | compression: bool 314 | if true assumes that pickle was compressed when created and attempts decompression. 
315 | 316 | Returns 317 | ------- 318 | obj: object 319 | the unpickled object 320 | """ 321 | 322 | if compression: 323 | with zipfile.ZipFile(path, "r", compression=zipfile.ZIP_DEFLATED) as myzip: 324 | with myzip.open("data") as f: 325 | return pickle.load(f) 326 | else: 327 | with open(path, "rb") as f: 328 | return pickle.load(f) 329 | -------------------------------------------------------------------------------- /gailtf/baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import gailtf.baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) -------------------------------------------------------------------------------- /gailtf/baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 
4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /gailtf/baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from gailtf.baselines.common import zipsame 4 | 5 | def mpi_moments(x, axis=0): 6 | x = np.asarray(x, dtype='float64') 7 | newshape = list(x.shape) 8 | newshape.pop(axis) 9 | n = np.prod(newshape,dtype=int) 10 | totalvec = np.zeros(n*2+1, 'float64') 11 | addvec = np.concatenate([x.sum(axis=axis).ravel(), 12 | np.square(x).sum(axis=axis).ravel(), 13 | np.array([x.shape[axis]],dtype='float64')]) 14 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 15 | sum = totalvec[:n] 16 | sumsq = totalvec[n:2*n] 17 | count = totalvec[2*n] 18 | if count == 0: 19 | mean = np.empty(newshape); mean[:] = np.nan 20 | std = np.empty(newshape); std[:] = np.nan 21 | else: 22 | mean = sum/count 23 | std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0)) 24 | return mean, std, count 25 | 26 | 27 | def test_runningmeanstd(): 28 | comm = MPI.COMM_WORLD 29 | np.random.seed(0) 30 | for (triple,axis) in [ 31 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 32 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 33 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 34 | ]: 35 | 36 | 37 | x = np.concatenate(triple, axis=axis) 38 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 39 | 40 | 41 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 42 | 43 | for (a1,a2) in zipsame(ms1, ms2): 44 | print(a1, a2) 45 | assert np.allclose(a1, a2) 46 | print("ok!") 47 | 48 | if __name__ == "__main__": 49 | #mpirun -np 3 python