├── .benchmark_pattern ├── baselines ├── __init__.py ├── a2c │ ├── __init__.py │ ├── README.md │ └── runner.py ├── acer │ ├── __init__.py │ ├── defaults.py │ ├── README.md │ ├── runner.py │ ├── policies.py │ └── buffer.py ├── acktr │ ├── __init__.py │ ├── acktr.py │ ├── README.md │ ├── run_mujoco.py │ ├── utils.py │ ├── value_functions.py │ ├── kfac_utils.py │ └── policies.py ├── ddpg │ ├── __init__.py │ ├── README.md │ ├── noise.py │ ├── models.py │ └── memory.py ├── gail │ ├── __init__.py │ ├── dataset │ │ ├── __init__.py │ │ └── mujoco_dset.py │ ├── result │ │ ├── hopper-training.png │ │ ├── humanoid-training.png │ │ ├── walker2d-training.png │ │ ├── halfcheetah-training.png │ │ ├── humanoidstandup-training.png │ │ ├── Hopper-normalized-stochastic-scores.png │ │ ├── Hopper-normalized-deterministic-scores.png │ │ ├── Hopper-unnormalized-stochastic-scores.png │ │ ├── Humanoid-normalized-stochastic-scores.png │ │ ├── Walker2d-normalized-stochastic-scores.png │ │ ├── HalfCheetah-normalized-stochastic-scores.png │ │ ├── Hopper-unnormalized-deterministic-scores.png │ │ ├── Humanoid-normalized-deterministic-scores.png │ │ ├── Humanoid-unnormalized-stochastic-scores.png │ │ ├── Walker2d-normalized-deterministic-scores.png │ │ ├── Walker2d-unnormalized-stochastic-scores.png │ │ ├── HalfCheetah-normalized-deterministic-scores.png │ │ ├── HalfCheetah-unnormalized-stochastic-scores.png │ │ ├── Humanoid-unnormalized-deterministic-scores.png │ │ ├── Walker2d-unnormalized-deterministic-scores.png │ │ ├── HalfCheetah-unnormalized-deterministic-scores.png │ │ ├── HumanoidStandup-normalized-stochastic-scores.png │ │ ├── HumanoidStandup-unnormalized-stochastic-scores.png │ │ ├── HumanoidStandup-normalized-deterministic-scores.png │ │ ├── HumanoidStandup-unnormalized-deterministic-scores.png │ │ └── gail-result.md │ ├── README.md │ ├── statistics.py │ ├── mlp_policy.py │ └── adversary.py ├── her │ ├── __init__.py │ ├── experiment │ │ ├── __init__.py │ │ ├── play.py │ │ └── plot.py │ ├── README.md │ ├── actor_critic.py │ ├── her.py │ ├── replay_buffer.py │ └── util.py ├── ppo1 │ ├── __init__.py │ ├── README.md │ ├── run_mujoco.py │ ├── run_robotics.py │ ├── run_atari.py │ ├── cnn_policy.py │ ├── run_humanoid.py │ └── mlp_policy.py ├── ppo2 │ ├── __init__.py │ ├── README.md │ └── defaults.py ├── trpo_mpi │ ├── __init__.py │ ├── README.md │ └── defaults.py ├── common │ ├── tests │ │ ├── __init__.py │ │ ├── envs │ │ │ ├── __init__.py │ │ │ ├── fixed_sequence_env.py │ │ │ ├── identity_env.py │ │ │ └── mnist_env.py │ │ ├── test_schedules.py │ │ ├── test_tf_util.py │ │ ├── test_cartpole.py │ │ ├── test_doc_examples.py │ │ ├── test_fixed_sequence.py │ │ ├── test_mnist.py │ │ ├── test_identity.py │ │ ├── util.py │ │ ├── test_segment_tree.py │ │ └── test_serialization.py │ ├── __init__.py │ ├── runners.py │ ├── mpi_fork.py │ ├── identity_env.py │ ├── tile_images.py │ ├── vec_env │ │ ├── vec_monitor.py │ │ ├── vec_frame_stack.py │ │ ├── util.py │ │ ├── vec_normalize.py │ │ ├── dummy_vec_env.py │ │ ├── subproc_vec_env.py │ │ ├── test_vec_env.py │ │ └── __init__.py │ ├── cg.py │ ├── mpi_adam_optimizer.py │ ├── running_stat.py │ ├── input.py │ ├── mpi_moments.py │ ├── console_util.py │ ├── dataset.py │ ├── math_util.py │ ├── mpi_adam.py │ ├── filters.py │ ├── mpi_util.py │ ├── mpi_running_mean_std.py │ ├── schedules.py │ └── segment_tree.py ├── deepq │ ├── experiments │ │ ├── __init__.py │ │ ├── enjoy_cartpole.py │ │ ├── enjoy_mountaincar.py │ │ ├── enjoy_pong.py │ │ ├── train_mountaincar.py │ │ ├── train_cartpole.py │ 
│ ├── enjoy_retro.py │ │ ├── run_retro.py │ │ ├── run_atari.py │ │ └── custom_cartpole.py │ ├── __init__.py │ ├── defaults.py │ ├── README.md │ └── utils.py ├── bench │ └── __init__.py └── results_plotter.py ├── data ├── logo.jpg └── cartpole.gif ├── setup.cfg ├── .travis.yml ├── .gitignore ├── Dockerfile ├── LICENSE └── setup.py /.benchmark_pattern: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/acer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/acktr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ddpg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/gail/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/her/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ppo1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ppo2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/common/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/gail/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/her/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/common/tests/envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/acktr/acktr.py: 
-------------------------------------------------------------------------------- 1 | from baselines.acktr.acktr_disc import * 2 | -------------------------------------------------------------------------------- /data/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/data/logo.jpg -------------------------------------------------------------------------------- /data/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/data/cartpole.gif -------------------------------------------------------------------------------- /baselines/acer/defaults.py: -------------------------------------------------------------------------------- 1 | def atari(): 2 | return dict( 3 | lrschedule='constant' 4 | ) 5 | -------------------------------------------------------------------------------- /baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.bench.benchmarks import * 2 | from baselines.bench.monitor import * -------------------------------------------------------------------------------- /baselines/gail/result/hopper-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/hopper-training.png -------------------------------------------------------------------------------- /baselines/gail/result/humanoid-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/humanoid-training.png -------------------------------------------------------------------------------- /baselines/gail/result/walker2d-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/walker2d-training.png -------------------------------------------------------------------------------- /baselines/gail/result/halfcheetah-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/halfcheetah-training.png -------------------------------------------------------------------------------- /baselines/gail/result/humanoidstandup-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/humanoidstandup-training.png -------------------------------------------------------------------------------- /baselines/gail/result/Hopper-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Hopper-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-normalized-deterministic-scores.png -------------------------------------------------------------------------------- 
/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Humanoid-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Walker2d-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Hopper-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Humanoid-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Walker2d-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png -------------------------------------------------------------------------------- 
/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.common.console_util import * 3 | from baselines.common.dataset import Dataset 4 | from baselines.common.math_util import * 5 | from baselines.common.misc_util import * 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = F,E999 3 | exclude = 4 | .git, 5 | __pycache__, 6 | baselines/her, 7 | baselines/ddpg, 8 | 
baselines/ppo1, 9 | baselines/bench, 10 | baselines/acktr, 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | services: 6 | - docker 7 | 8 | install: 9 | - pip install flake8 10 | - docker build . -t baselines-test 11 | 12 | script: 13 | - flake8 . 14 | - docker run baselines-test pytest -v . 15 | -------------------------------------------------------------------------------- /baselines/ddpg/README.md: -------------------------------------------------------------------------------- 1 | # DDPG 2 | 3 | - Original paper: https://arxiv.org/abs/1509.02971 4 | - Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ 5 | - `python -m baselines.ddpg.main` runs the algorithm for 1M frames = 10M timesteps on a Mujoco environment. See help (`-h`) for more options. -------------------------------------------------------------------------------- /baselines/acer/README.md: -------------------------------------------------------------------------------- 1 | # ACER 2 | 3 | - Original paper: https://arxiv.org/abs/1611.01224 4 | - `python -m baselines.run --alg=acer --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 5 | - also refer to the repo-wide [README.md](../../README.md#training-models) 6 | 7 | -------------------------------------------------------------------------------- /baselines/a2c/README.md: -------------------------------------------------------------------------------- 1 | # A2C 2 | 3 | - Original paper: https://arxiv.org/abs/1602.01783 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.run --alg=a2c --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options 6 | - also refer to the repo-wide [README.md](../../README.md#training-models) 7 | -------------------------------------------------------------------------------- /baselines/acktr/README.md: -------------------------------------------------------------------------------- 1 | # ACKTR 2 | 3 | - Original paper: https://arxiv.org/abs/1708.05144 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 
6 | - also refer to the repo-wide [README.md](../../README.md#training-models) 7 | 8 | 9 | -------------------------------------------------------------------------------- /baselines/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.deepq import models # noqa 2 | from baselines.deepq.build_graph import build_act, build_train # noqa 3 | from baselines.deepq.deepq import learn, load_act # noqa 4 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa 5 | 6 | def wrap_atari_dqn(env): 7 | from baselines.common.atari_wrappers import wrap_deepmind 8 | return wrap_deepmind(env, frame_stack=True, scale=True) 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pkl 4 | *.py~ 5 | .pytest_cache 6 | .DS_Store 7 | .idea 8 | 9 | # Setuptools distribution and build folders. 10 | /dist/ 11 | /build 12 | keys/ 13 | 14 | # Virtualenv 15 | /env 16 | 17 | 18 | *.sublime-project 19 | *.sublime-workspace 20 | 21 | .idea 22 | 23 | logs/ 24 | 25 | .ipynb_checkpoints 26 | ghostdriver.log 27 | 28 | htmlcov 29 | 30 | junk 31 | src 32 | 33 | *.egg-info 34 | .cache 35 | 36 | MUJOCO_LOG.TXT 37 | -------------------------------------------------------------------------------- /baselines/ppo2/README.md: -------------------------------------------------------------------------------- 1 | # PPO2 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | 6 | - `python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 7 | - `python -m baselines.run --alg=ppo2 --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M frames on a Mujoco Ant environment. 8 | - also refer to the repo-wide [README.md](../../README.md#training-models) 9 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("CartPole-v0") 8 | act = deepq.load("cartpole_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/README.md: -------------------------------------------------------------------------------- 1 | # trpo_mpi 2 | 3 | - Original paper: https://arxiv.org/abs/1502.05477 4 | - Baselines blog post https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 16 python -m baselines.run --alg=trpo_mpi --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 6 | - `python -m baselines.run --alg=trpo_mpi --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a Mujoco Ant environment. 
7 | - also refer to the repo-wide [README.md](../../README.md#training-models) 8 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("MountainCar-v0") 8 | act = deepq.load("mountaincar_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/deepq/defaults.py: -------------------------------------------------------------------------------- 1 | def atari(): 2 | return dict( 3 | network='conv_only', 4 | lr=1e-4, 5 | buffer_size=10000, 6 | exploration_fraction=0.1, 7 | exploration_final_eps=0.01, 8 | train_freq=4, 9 | learning_starts=10000, 10 | target_network_update_freq=1000, 11 | gamma=0.99, 12 | prioritized_replay=True, 13 | prioritized_replay_alpha=0.6, 14 | checkpoint_freq=10000, 15 | checkpoint_path=None, 16 | dueling=True 17 | ) 18 | 19 | def retro(): 20 | return atari() 21 | 22 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from baselines import deepq 3 | 4 | 5 | def main(): 6 | env = gym.make("PongNoFrameskip-v4") 7 | env = deepq.wrap_atari_dqn(env) 8 | act = deepq.load("pong_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/ppo2/defaults.py: -------------------------------------------------------------------------------- 1 | def mujoco(): 2 | return dict( 3 | nsteps=2048, 4 | nminibatches=32, 5 | lam=0.95, 6 | gamma=0.99, 7 | noptepochs=10, 8 | log_interval=1, 9 | ent_coef=0.0, 10 | lr=lambda f: 3e-4 * f, 11 | cliprange=0.2, 12 | value_network='copy' 13 | ) 14 | 15 | def atari(): 16 | return dict( 17 | nsteps=128, nminibatches=4, 18 | lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, 19 | ent_coef=.01, 20 | lr=lambda f : f * 2.5e-4, 21 | cliprange=lambda f : f * 0.1, 22 | ) 23 | -------------------------------------------------------------------------------- /baselines/ppo1/README.md: -------------------------------------------------------------------------------- 1 | # PPOSGD 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. 
7 | 8 | - Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model` 9 | - Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model` 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv 4 | ENV CODE_DIR /root/code 5 | ENV VENV /root/venv 6 | 7 | RUN \ 8 | pip install virtualenv && \ 9 | virtualenv $VENV --python=python3 && \ 10 | . $VENV/bin/activate && \ 11 | pip install --upgrade pip 12 | 13 | ENV PATH=$VENV/bin:$PATH 14 | 15 | COPY . $CODE_DIR/baselines 16 | WORKDIR $CODE_DIR/baselines 17 | 18 | # Clean up pycache and pyc files 19 | RUN rm -rf __pycache__ && \ 20 | find . -name "*.pyc" -delete && \ 21 | pip install tensorflow && \ 22 | pip install -e .[test] 23 | 24 | 25 | CMD /bin/bash 26 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("MountainCar-v0") 8 | # Enabling layer_norm here is important for parameter space noise! 9 | model = deepq.models.mlp([64], layer_norm=True) 10 | act = deepq.learn( 11 | env, 12 | q_func=model, 13 | lr=1e-3, 14 | max_timesteps=100000, 15 | buffer_size=50000, 16 | exploration_fraction=0.1, 17 | exploration_final_eps=0.1, 18 | print_freq=10, 19 | param_noise=True 20 | ) 21 | print("Saving model to mountaincar_model.pkl") 22 | act.save("mountaincar_model.pkl") 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | 20 | -------------------------------------------------------------------------------- /baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args,
env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/defaults.py: -------------------------------------------------------------------------------- 1 | from baselines.common.models import mlp, cnn_small 2 | 3 | 4 | def atari(): 5 | return dict( 6 | network = cnn_small(), 7 | timesteps_per_batch=512, 8 | max_kl=0.001, 9 | cg_iters=10, 10 | cg_damping=1e-3, 11 | gamma=0.98, 12 | lam=1.0, 13 | vf_iters=3, 14 | vf_stepsize=1e-4, 15 | entcoeff=0.00, 16 | ) 17 | 18 | def mujoco(): 19 | return dict( 20 | network = mlp(num_hidden=32, num_layers=2), 21 | timesteps_per_batch=1024, 22 | max_kl=0.01, 23 | cg_iters=10, 24 | cg_damping=0.1, 25 | gamma=0.99, 26 | lam=0.98, 27 | vf_iters=5, 28 | vf_stepsize=1e-3, 29 | normalize_observations=True, 30 | ) 31 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def callback(lcl, _glb): 7 | # stop training if reward exceeds 199 8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 9 | return is_solved 10 | 11 | 12 | def main(): 13 | env = gym.make("CartPole-v0") 14 | act = deepq.learn( 15 | env, 16 | network='mlp', 17 | lr=1e-3, 18 | total_timesteps=100000, 19 | buffer_size=50000, 20 | exploration_fraction=0.1, 21 | exploration_final_eps=0.02, 22 | print_freq=10, 23 | callback=callback 24 | ) 25 | print("Saving model to cartpole_model.pkl") 26 | act.save("cartpole_model.pkl") 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /baselines/common/identity_env.py: -------------------------------------------------------------------------------- 1 | from gym import Env 2 | from gym.spaces import Discrete 3 | 4 | 5 | class IdentityEnv(Env): 6 | def __init__( 7 | self, 8 | dim, 9 | ep_length=100, 10 | ): 11 | 12 | self.action_space = Discrete(dim) 13 | self.reset() 14 | 15 | def reset(self): 16 | self._choose_next_state() 17 | self.observation_space = self.action_space 18 | 19 | return self.state 20 | 21 | def step(self, actions): 22 | rew = self._get_reward(actions) 23 | self._choose_next_state() 24 | return self.state, rew, False, {} 25 | 26 | def _choose_next_state(self): 27 | self.state = self.action_space.sample() 28 | 29 | def _get_reward(self, actions): 30 | return 1 if self.state == actions else 0 31 | -------------------------------------------------------------------------------- /baselines/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def tile_images(img_nhwc): 4 | """ 5 | Tile N images into one big PxQ image 6 | (P,Q) are chosen to be as close as possible, and if N 7 | is square, then P=Q. 
8 | 9 | input: img_nhwc, list or array of images, ndim=4 once turned into array 10 | n = batch index, h = height, w = width, c = channel 11 | returns: 12 | bigim_HWc, ndarray with ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | N, h, w, c = img_nhwc.shape 16 | H = int(np.ceil(np.sqrt(N))) 17 | W = int(np.ceil(float(N)/H)) 18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 22 | return img_Hh_Ww_c 23 | 24 | -------------------------------------------------------------------------------- /baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_monitor.py: -------------------------------------------------------------------------------- 1 | from . 
import VecEnvWrapper 2 | import numpy as np 3 | 4 | 5 | class VecMonitor(VecEnvWrapper): 6 | def __init__(self, venv): 7 | VecEnvWrapper.__init__(self, venv) 8 | self.eprets = None 9 | self.eplens = None 10 | 11 | def reset(self): 12 | obs = self.venv.reset() 13 | self.eprets = np.zeros(self.num_envs, 'f') 14 | self.eplens = np.zeros(self.num_envs, 'i') 15 | return obs 16 | 17 | def step_wait(self): 18 | obs, rews, dones, infos = self.venv.step_wait() 19 | self.eprets += rews 20 | self.eplens += 1 21 | newinfos = [] 22 | for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)): 23 | info = info.copy() 24 | if done: 25 | info['episode'] = {'r': ret, 'l': eplen} 26 | self.eprets[i] = 0 27 | self.eplens[i] = 0 28 | newinfos.append(info) 29 | return obs, rews, dones, newinfos 30 | -------------------------------------------------------------------------------- /baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /baselines/ppo1/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 4 | from baselines.common import tf_util as U 5 | from baselines import logger 6 | 7 | def train(env_id, num_timesteps, seed): 8 | from baselines.ppo1 import mlp_policy, pposgd_simple 9 | U.make_session(num_cpu=1).__enter__() 10 | def policy_fn(name, ob_space, ac_space): 11 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 12 | hid_size=64, num_hid_layers=2) 13 | env = make_mujoco_env(env_id, seed) 14 | pposgd_simple.learn(env, policy_fn, 15 | max_timesteps=num_timesteps, 16 | timesteps_per_actorbatch=2048, 17 | clip_param=0.2, entcoeff=0.0, 18 | optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, 19 | gamma=0.99, lam=0.95, schedule='linear', 20 | ) 21 | env.close() 22 | 23 | def main(): 24 | args = mujoco_arg_parser().parse_args() 25 | logger.configure() 26 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(2, 2) == 10 22 | 23 | 24 | def test_multikwargs(): 25 | with tf.Graph().as_default(): 26 | x = tf.placeholder(tf.int32, (), name="x") 27 | with tf.variable_scope("other"): 28 | x2 = tf.placeholder(tf.int32, (), name="x") 29 | z = 3 * x + 2 * x2 30 | 31 | lin = function([x, x2], z, givens={x2: 0}) 32 | with single_threaded_session(): 33 | initialize() 34 | assert lin(2) == 6 35 | assert lin(2, 2) == 10 36 | 37 | 38 | if __name__ == '__main__': 39 | test_function() 40 | test_multikwargs() 41 | -------------------------------------------------------------------------------- /baselines/common/tests/test_cartpole.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | 7 | common_kwargs = dict( 8 | total_timesteps=30000, 9 | network='mlp', 10 | gamma=1.0, 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), 16 | 'acktr': dict(nsteps=32, value_network='copy'), 17 | 'deepq': dict(total_timesteps=20000), 18 | 'ppo2': dict(value_network='copy'), 19 | 'trpo_mpi': {} 20 | } 21 | 22 | @pytest.mark.slow 23 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 24 | def test_cartpole(alg): 25 | ''' 26 | Test if the algorithm (with an mlp policy) 27 | can learn to balance the cartpole 28 | ''' 29 | 30 | kwargs = common_kwargs.copy() 31 | kwargs.update(learn_kwargs[alg]) 32 | 33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 34 | def env_fn(): 35 | 36 | env 
= gym.make('CartPole-v0') 37 | env.seed(0) 38 | return env 39 | 40 | reward_per_episode_test(env_fn, learn_fn, 100) 41 | 42 | if __name__ == '__main__': 43 | test_cartpole('deepq') 44 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_retro.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | 5 | from baselines import deepq 6 | from baselines.common import retro_wrappers 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes') 12 | parser.add_argument('--gamestate', help='game state to load', default='Level1-1') 13 | parser.add_argument('--model', help='model pickle file from ActWrapper.save', default='model.pkl') 14 | args = parser.parse_args() 15 | 16 | env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=None) 17 | env = retro_wrappers.wrap_deepmind_retro(env) 18 | act = deepq.load(args.model) 19 | 20 | while True: 21 | obs, done = env.reset(), False 22 | episode_rew = 0 23 | while not done: 24 | env.render() 25 | action = act(obs[None])[0] 26 | env_action = np.zeros(env.action_space.n) 27 | env_action[action] = 1 28 | obs, rew, done, _ = env.step(env_action) 29 | episode_rew += rew 30 | print('Episode reward', episode_rew) 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /baselines/acktr/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import tensorflow as tf 4 | from baselines import logger 5 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 6 | from baselines.acktr.acktr_cont import learn 7 | from baselines.acktr.policies import GaussianMlpPolicy 8 | from baselines.acktr.value_functions import NeuralNetValueFunction 9 | 10 | def train(env_id, num_timesteps, seed): 11 | env = make_mujoco_env(env_id, seed) 12 | 13 | with tf.Session(config=tf.ConfigProto()): 14 | ob_dim = env.observation_space.shape[0] 15 | ac_dim = env.action_space.shape[0] 16 | with tf.variable_scope("vf"): 17 | vf = NeuralNetValueFunction(ob_dim, ac_dim) 18 | with tf.variable_scope("pi"): 19 | policy = GaussianMlpPolicy(ob_dim, ac_dim) 20 | 21 | learn(env, policy=policy, vf=vf, 22 | gamma=0.99, lam=0.97, timesteps_per_batch=2500, 23 | desired_kl=0.002, 24 | num_timesteps=num_timesteps, animate=False) 25 | 26 | env.close() 27 | 28 | def main(): 29 | args = mujoco_arg_parser().parse_args() 30 | logger.configure() 31 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /baselines/common/tests/envs/fixed_sequence_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Env 3 | from gym.spaces import Discrete 4 | 5 | 6 | class FixedSequenceEnv(Env): 7 | def __init__( 8 | self, 9 | n_actions=10, 10 | seed=0, 11 | episode_len=100 12 | ): 13 | self.np_random = np.random.RandomState() 14 | self.np_random.seed(seed) 15 | self.sequence = [self.np_random.randint(0, n_actions) for _ in range(episode_len)] 16 | 17 | self.action_space = Discrete(n_actions) 18 | self.observation_space = Discrete(1) 19 | 20 | self.episode_len = episode_len 21 | 
self.time = 0 22 | self.reset() 23 | 24 | def reset(self): 25 | self.time = 0 26 | return 0 27 | 28 | def step(self, actions): 29 | rew = self._get_reward(actions) 30 | self._choose_next_state() 31 | done = False 32 | if self.episode_len and self.time >= self.episode_len: 33 | rew = 0 34 | done = True 35 | 36 | return 0, rew, done, {} 37 | 38 | def _choose_next_state(self): 39 | self.time += 1 40 | 41 | def _get_reward(self, actions): 42 | return 1 if actions == self.sequence[self.time] else 0 43 | 44 | 45 | -------------------------------------------------------------------------------- /baselines/gail/README.md: -------------------------------------------------------------------------------- 1 | # Generative Adversarial Imitation Learning (GAIL) 2 | 3 | - Original paper: https://arxiv.org/abs/1606.03476 4 | 5 | For benchmark results on MuJoCo, please see [gail-result.md](result/gail-result.md) 6 | 7 | ## If you want to train an imitation learning agent 8 | 9 | ### Step 1: Download expert data 10 | 11 | Download the expert data into `./data`, [download link](https://drive.google.com/drive/folders/1h3H4AY_ZBx08hz-Ct0Nxxus-V1melu1U?usp=sharing) 12 | 13 | ### Step 2: Run GAIL 14 | 15 | Run with a single process: 16 | 17 | ```bash 18 | python -m baselines.gail.run_mujoco 19 | ``` 20 | 21 | Run with multiple MPI processes: 22 | 23 | ```bash 24 | mpirun -np 16 python -m baselines.gail.run_mujoco 25 | ``` 26 | 27 | See help (`-h`) for more options. 28 | 29 | #### In case you want to run Behavior Cloning (BC) 30 | 31 | ```bash 32 | python -m baselines.gail.behavior_clone 33 | ``` 34 | 35 | See help (`-h`) for more options. 36 | 37 | 38 | ## Contributing 39 | 40 | Bug reports and pull requests are welcome on GitHub at https://github.com/openai/baselines/pulls. 41 | 42 | ## Maintainers 43 | 44 | - Yuan-Hong Liao, andrewliao11_at_gmail_dot_com 45 | - Ryan Julian, ryanjulian_at_gmail_dot_com 46 | 47 | ## Others 48 | 49 | Thanks to the following open-source projects: 50 | 51 | - @openai/imitation 52 | - @carpedm20/deep-rl-tensorflow 53 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | 6 | class VecFrameStack(VecEnvWrapper): 7 | def __init__(self, venv, nstack): 8 | self.venv = venv 9 | self.nstack = nstack 10 | wos = venv.observation_space # wrapped ob space 11 | low = np.repeat(wos.low, self.nstack, axis=-1) 12 | high = np.repeat(wos.high, self.nstack, axis=-1) 13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) 14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 16 | 17 | def step_wait(self): 18 | obs, rews, news, infos = self.venv.step_wait() 19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 20 | for (i, new) in enumerate(news): 21 | if new: 22 | self.stackedobs[i] = 0 23 | self.stackedobs[..., -obs.shape[-1]:] = obs 24 | return self.stackedobs, rews, news, infos 25 | 26 | def reset(self): 27 | obs = self.venv.reset() 28 | self.stackedobs[...]
= 0 29 | self.stackedobs[..., -obs.shape[-1]:] = obs 30 | return self.stackedobs 31 | 32 | def close(self): 33 | self.venv.close() 34 | -------------------------------------------------------------------------------- /baselines/acktr/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): 4 | with tf.variable_scope(name, reuse=reuse): 5 | assert (len(tf.get_variable_scope().name.split('/')) == 2) 6 | 7 | w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init) 8 | b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) 9 | weight_decay_fc = 3e-4 10 | 11 | if weight_loss_dict is not None: 12 | weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss') 13 | if weight_loss_dict is not None: 14 | weight_loss_dict[w] = weight_decay_fc 15 | weight_loss_dict[b] = 0.0 16 | 17 | tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay) 18 | 19 | return tf.nn.bias_add(tf.matmul(x, w), b) 20 | 21 | def kl_div(action_dist1, action_dist2, action_size): 22 | mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:] 23 | mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:] 24 | 25 | numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2) 26 | denominator = 2 * tf.square(std2) + 1e-8 27 | return tf.reduce_sum( 28 | numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1) 29 | -------------------------------------------------------------------------------- /baselines/ppo1/run_robotics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from mpi4py import MPI 4 | from baselines.common import set_global_seeds 5 | from baselines import logger 6 | from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser 7 | import mujoco_py 8 | 9 | 10 | def train(env_id, num_timesteps, seed): 11 | from baselines.ppo1 import mlp_policy, pposgd_simple 12 | import baselines.common.tf_util as U 13 | rank = MPI.COMM_WORLD.Get_rank() 14 | sess = U.single_threaded_session() 15 | sess.__enter__() 16 | mujoco_py.ignore_mujoco_warnings().__enter__() 17 | workerseed = seed + 10000 * rank 18 | set_global_seeds(workerseed) 19 | env = make_robotics_env(env_id, workerseed, rank=rank) 20 | def policy_fn(name, ob_space, ac_space): 21 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 22 | hid_size=256, num_hid_layers=3) 23 | 24 | pposgd_simple.learn(env, policy_fn, 25 | max_timesteps=num_timesteps, 26 | timesteps_per_actorbatch=2048, 27 | clip_param=0.2, entcoeff=0.0, 28 | optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, 29 | gamma=0.99, lam=0.95, schedule='linear', 30 | ) 31 | env.close() 32 | 33 | 34 | def main(): 35 | args = robotics_arg_parser().parse_args() 36 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /baselines/common/mpi_adam_optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | 5 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 6 | """Adam optimizer that averages gradients across mpi 
processes.""" 7 | def __init__(self, comm, **kwargs): 8 | self.comm = comm 9 | tf.train.AdamOptimizer.__init__(self, **kwargs) 10 | def compute_gradients(self, loss, var_list, **kwargs): 11 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) 12 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 13 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) 14 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 15 | sizes = [int(np.prod(s)) for s in shapes] 16 | 17 | num_tasks = self.comm.Get_size() 18 | buf = np.zeros(sum(sizes), np.float32) 19 | 20 | def _collect_grads(flat_grad): 21 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 22 | np.divide(buf, float(num_tasks), out=buf) 23 | return buf 24 | 25 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 26 | avg_flat_grad.set_shape(flat_grad.shape) 27 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 28 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 29 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 30 | 31 | return avg_grads_and_vars 32 | -------------------------------------------------------------------------------- /baselines/common/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] = self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /baselines/common/vec_env/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for dealing with vectorized environments. 3 | """ 4 | 5 | from collections import OrderedDict 6 | 7 | import gym 8 | import numpy as np 9 | 10 | 11 | def copy_obs_dict(obs): 12 | """ 13 | Deep-copy an observation dict. 14 | """ 15 | return {k: np.copy(v) for k, v in obs.items()} 16 | 17 | 18 | def dict_to_obs(obs_dict): 19 | """ 20 | Convert an observation dict into a raw array if the 21 | original observation space was not a Dict space. 22 | """ 23 | if set(obs_dict.keys()) == {None}: 24 | return obs_dict[None] 25 | return obs_dict 26 | 27 | 28 | def obs_space_info(obs_space): 29 | """ 30 | Get dict-structured information about a gym.Space. 31 | 32 | Returns: 33 | A tuple (keys, shapes, dtypes): 34 | keys: a list of dict keys. 35 | shapes: a dict mapping keys to shapes. 
36 | dtypes: a dict mapping keys to dtypes. 37 | """ 38 | if isinstance(obs_space, gym.spaces.Dict): 39 | assert isinstance(obs_space.spaces, OrderedDict) 40 | subspaces = obs_space.spaces 41 | else: 42 | subspaces = {None: obs_space} 43 | keys = [] 44 | shapes = {} 45 | dtypes = {} 46 | for key, box in subspaces.items(): 47 | keys.append(key) 48 | shapes[key] = box.shape 49 | dtypes[key] = box.dtype 50 | return keys, shapes, dtypes 51 | 52 | 53 | def obs_to_dict(obs): 54 | """ 55 | Convert an observation into a dict. 56 | """ 57 | if isinstance(obs, dict): 58 | return obs 59 | return {None: obs} 60 | -------------------------------------------------------------------------------- /baselines/common/tests/test_doc_examples.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | try: 3 | import mujoco_py 4 | _mujoco_present = True 5 | except BaseException: 6 | mujoco_py = None 7 | _mujoco_present = False 8 | 9 | 10 | @pytest.mark.skipif( 11 | not _mujoco_present, 12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library' 13 | ) 14 | def test_lstm_example(): 15 | import tensorflow as tf 16 | from baselines.common import policies, models, cmd_util 17 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 18 | 19 | # create vectorized environment 20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)]) 21 | 22 | with tf.Session() as sess: 23 | # build policy based on lstm network with 128 units 24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1) 25 | 26 | # initialize tensorflow variables 27 | sess.run(tf.global_variables_initializer()) 28 | 29 | # prepare environment variables 30 | ob = venv.reset() 31 | state = policy.initial_state 32 | done = [False] 33 | step_counter = 0 34 | 35 | # run a single episode until the end (i.e. until done) 36 | while True: 37 | action, _, state, _ = policy.step(ob, S=state, M=done) 38 | ob, reward, done, _ = venv.step(action) 39 | step_counter += 1 40 | if done: 41 | break 42 | 43 | 44 | assert step_counter > 5 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | if sys.version_info.major != 3: 5 | print('This Python is only compatible with Python 3, but you are running ' 6 | 'Python {}. 
The installation will likely fail.'.format(sys.version_info.major)) 7 | 8 | 9 | extras = { 10 | 'test': [ 11 | 'filelock', 12 | 'pytest' 13 | ] 14 | } 15 | 16 | 17 | all_deps = [] 18 | for group_name in extras: 19 | all_deps += extras[group_name] 20 | 21 | extras['all'] = all_deps 22 | 23 | setup(name='baselines', 24 | packages=[package for package in find_packages() 25 | if package.startswith('baselines')], 26 | install_requires=[ 27 | 'gym[mujoco,atari,classic_control,robotics]', 28 | 'scipy', 29 | 'tqdm', 30 | 'joblib', 31 | 'dill', 32 | 'progressbar2', 33 | 'mpi4py', 34 | 'cloudpickle', 35 | 'click', 36 | 'opencv-python' 37 | ], 38 | extras_require=extras, 39 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms', 40 | author='OpenAI', 41 | url='https://github.com/openai/baselines', 42 | author_email='gym@openai.com', 43 | version='0.1.5') 44 | 45 | 46 | # ensure there is some tensorflow build with version above 1.4 47 | try: 48 | from distutils.version import StrictVersion 49 | import tensorflow 50 | assert StrictVersion(tensorflow.__version__) >= StrictVersion('1.4.0') 51 | except ImportError: 52 | assert False, "TensorFlow needed, of version above 1.4" 53 | -------------------------------------------------------------------------------- /baselines/common/tests/test_fixed_sequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv 3 | 4 | from baselines.common.tests.util import simple_test 5 | from baselines.run import get_learn_function 6 | 7 | common_kwargs = dict( 8 | seed=0, 9 | total_timesteps=50000, 10 | ) 11 | 12 | learn_kwargs = { 13 | 'a2c': {}, 14 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), 15 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) 16 | # github issue: https://github.com/openai/baselines/issues/188 17 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) 18 | } 19 | 20 | 21 | alg_list = learn_kwargs.keys() 22 | rnn_list = ['lstm'] 23 | 24 | @pytest.mark.slow 25 | @pytest.mark.parametrize("alg", alg_list) 26 | @pytest.mark.parametrize("rnn", rnn_list) 27 | def test_fixed_sequence(alg, rnn): 28 | ''' 29 | Test if the algorithm (with a given policy) 30 | can learn an identity transformation (i.e. return observation as an action) 31 | ''' 32 | 33 | kwargs = learn_kwargs[alg] 34 | kwargs.update(common_kwargs) 35 | 36 | episode_len = 5 37 | env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) 38 | learn = lambda e: get_learn_function(alg)( 39 | env=e, 40 | network=rnn, 41 | **kwargs 42 | ) 43 | 44 | simple_test(env_fn, learn, 0.7) 45 | 46 | 47 | if __name__ == '__main__': 48 | test_fixed_sequence('ppo2', 'lstm') 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | from baselines.common.running_mean_std import RunningMeanStd 3 | import numpy as np 4 | 5 | 6 | class VecNormalize(VecEnvWrapper): 7 | """ 8 | A vectorized wrapper that normalizes the observations 9 | and returns from an environment. 
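    Illustrative usage (a sketch added for clarity; `make_env` stands in for any
    zero-argument function returning a gym.Env, and `actions` is a batch of actions,
    one per environment):

        venv = VecNormalize(DummyVecEnv([make_env]))
        obs = venv.reset()                            # observations filtered by a running mean/std
        obs, rews, dones, infos = venv.step(actions)  # rewards scaled by the running std of returns

    Rewards are only rescaled (divided by the standard deviation of the discounted
    return estimate) and clipped; they are not mean-centered.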
10 | """ 11 | 12 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): 13 | VecEnvWrapper.__init__(self, venv) 14 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 15 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 16 | self.clipob = clipob 17 | self.cliprew = cliprew 18 | self.ret = np.zeros(self.num_envs) 19 | self.gamma = gamma 20 | self.epsilon = epsilon 21 | 22 | def step_wait(self): 23 | obs, rews, news, infos = self.venv.step_wait() 24 | self.ret = self.ret * self.gamma + rews 25 | obs = self._obfilt(obs) 26 | if self.ret_rms: 27 | self.ret_rms.update(self.ret) 28 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 29 | return obs, rews, news, infos 30 | 31 | def _obfilt(self, obs): 32 | if self.ob_rms: 33 | self.ob_rms.update(obs) 34 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 35 | return obs 36 | else: 37 | return obs 38 | 39 | def reset(self): 40 | obs = self.venv.reset() 41 | return self._obfilt(obs) 42 | -------------------------------------------------------------------------------- /baselines/her/README.md: -------------------------------------------------------------------------------- 1 | # Hindsight Experience Replay 2 | For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/abs/1707.01495). 3 | 4 | ## How to use Hindsight Experience Replay 5 | 6 | ### Getting started 7 | Training an agent is very simple: 8 | ```bash 9 | python -m baselines.her.experiment.train 10 | ``` 11 | This will train a DDPG+HER agent on the `FetchReach` environment. 12 | You should see the success rate go up quickly to `1.0`, which means that the agent achieves the 13 | desired goal in 100% of the cases. 14 | The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. to its test success rate), 15 | the latest policy, and, if enabled, a history of policies every K epochs. 16 | 17 | To inspect what the agent has learned, use the play script: 18 | ```bash 19 | python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl 20 | ``` 21 | You can try it right now with the results of the training step (the script prints out the path for you). 22 | This should visualize the current policy for 10 episodes and will also print statistics. 23 | 24 | 25 | ### Reproducing results 26 | In order to reproduce the results from [Plappert et al. (2018)](https://arxiv.org/abs/1802.09464), run the following command: 27 | ```bash 28 | python -m baselines.her.experiment.train --num_cpu 19 29 | ``` 30 | This will require a machine with sufficient amount of physical CPU cores. In our experiments, 31 | we used [Azure's D15v2 instances](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes), 32 | which have 20 physical cores. We only scheduled the experiment on 19 of those to leave some head-room on the system. 33 | -------------------------------------------------------------------------------- /baselines/deepq/README.md: -------------------------------------------------------------------------------- 1 | ## If you are curious. 2 | 3 | ##### Train a Cartpole agent and watch it play once it converges! 
4 | 5 | Here's a list of commands to run to quickly get a working example: 6 | 7 | 8 | 9 | 10 | ```bash 11 | # Train model and save the results to cartpole_model.pkl 12 | python -m baselines.run --alg=deepq --env=CartPole-v0 --save_path=./cartpole_model.pkl --num_timesteps=1e5 13 | # Load the model saved in cartpole_model.pkl and visualize the learned policy 14 | python -m baselines.run --alg=deepq --env=CartPole-v0 --load_path=./cartpole_model.pkl --num_timesteps=0 --play 15 | ``` 16 | 17 | ## If you wish to apply DQN to solve a problem. 18 | 19 | Check out our simple agent trained with one stop shop `deepq.learn` function. 20 | 21 | - [baselines/deepq/experiments/train_cartpole.py](experiments/train_cartpole.py) - train a Cartpole agent. 22 | 23 | In particular notice that once `deepq.learn` finishes training it returns `act` function which can be used to select actions in the environment. Once trained you can easily save it and load at later time. Complimentary file `enjoy_cartpole.py` loads and visualizes the learned policy. 24 | 25 | ## If you wish to experiment with the algorithm 26 | 27 | ##### Check out the examples 28 | 29 | - [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine grained control over the internals of DQN algorithm. 30 | - [baselines/deepq/defaults.py](defaults.py) - settings for training on atari. Run 31 | 32 | ```bash 33 | python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 34 | ``` 35 | to train on Atari Pong (see more in repo-wide [README.md](../../README.md#training-models)) 36 | 37 | 38 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/run_retro.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from baselines import deepq 4 | from baselines.common import set_global_seeds 5 | from baselines import bench 6 | from baselines import logger 7 | from baselines.common import retro_wrappers 8 | import retro 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 13 | parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes') 14 | parser.add_argument('--gamestate', help='game state to load', default='Level1-1') 15 | parser.add_argument('--seed', help='seed', type=int, default=0) 16 | parser.add_argument('--num-timesteps', type=int, default=int(10e6)) 17 | args = parser.parse_args() 18 | logger.configure() 19 | set_global_seeds(args.seed) 20 | env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE) 21 | env.seed(args.seed) 22 | env = bench.Monitor(env, logger.get_dir()) 23 | env = retro_wrappers.wrap_deepmind_retro(env) 24 | 25 | model = deepq.models.cnn_to_mlp( 26 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 27 | hiddens=[256], 28 | dueling=True 29 | ) 30 | act = deepq.learn( 31 | env, 32 | q_func=model, 33 | lr=1e-4, 34 | max_timesteps=args.num_timesteps, 35 | buffer_size=10000, 36 | exploration_fraction=0.1, 37 | exploration_final_eps=0.01, 38 | train_freq=4, 39 | learning_starts=10000, 40 | target_network_update_freq=1000, 41 | gamma=0.99, 42 | prioritized_replay=True 43 | ) 44 | act.save() 45 | env.close() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /baselines/ppo1/run_atari.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from mpi4py import MPI 4 | from baselines.common import set_global_seeds 5 | from baselines import bench 6 | import os.path as osp 7 | from baselines import logger 8 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 9 | from baselines.common.cmd_util import atari_arg_parser 10 | 11 | def train(env_id, num_timesteps, seed): 12 | from baselines.ppo1 import pposgd_simple, cnn_policy 13 | import baselines.common.tf_util as U 14 | rank = MPI.COMM_WORLD.Get_rank() 15 | sess = U.single_threaded_session() 16 | sess.__enter__() 17 | if rank == 0: 18 | logger.configure() 19 | else: 20 | logger.configure(format_strs=[]) 21 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None 22 | set_global_seeds(workerseed) 23 | env = make_atari(env_id) 24 | def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 25 | return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) 26 | env = bench.Monitor(env, logger.get_dir() and 27 | osp.join(logger.get_dir(), str(rank))) 28 | env.seed(workerseed) 29 | 30 | env = wrap_deepmind(env) 31 | env.seed(workerseed) 32 | 33 | pposgd_simple.learn(env, policy_fn, 34 | max_timesteps=int(num_timesteps * 1.1), 35 | timesteps_per_actorbatch=256, 36 | clip_param=0.2, entcoeff=0.01, 37 | optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, 38 | gamma=0.99, lam=0.95, 39 | schedule='linear' 40 | ) 41 | env.close() 42 | 43 | def main(): 44 | args = atari_arg_parser().parse_args() 45 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /baselines/common/tests/test_mnist.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # from baselines.acer import acer_simple as acer 4 | from baselines.common.tests.envs.mnist_env import MnistEnv 5 | from baselines.common.tests.util import simple_test 6 | from baselines.run import get_learn_function 7 | 8 | 9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 10 | # GitHub issue https://github.com/openai/baselines/issues/189 11 | common_kwargs = { 12 | 'seed': 0, 13 | 'network':'cnn', 14 | 'gamma':0.9, 15 | 'pad':'SAME' 16 | } 17 | 18 | learn_args = { 19 | 'a2c': dict(total_timesteps=50000), 20 | # TODO need to resolve inference (step) API differences for acer; also slow 21 | # 'acer': dict(seed=0, total_timesteps=1000), 22 | 'deepq': dict(total_timesteps=5000), 23 | 'acktr': dict(total_timesteps=30000), 24 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), 25 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) 26 | } 27 | 28 | 29 | #tests pass, but are too slow on travis. Same algorithms are covered 30 | # by other tests with less compute-hungry nn's and by benchmarks 31 | @pytest.mark.skip 32 | @pytest.mark.slow 33 | @pytest.mark.parametrize("alg", learn_args.keys()) 34 | def test_mnist(alg): 35 | ''' 36 | Test if the algorithm can learn to classify MNIST digits. 37 | Uses CNN policy. 
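    (MnistEnv gives a reward of 1 for a correctly labelled image and 0 otherwise,
    so the 0.6 threshold passed to simple_test below corresponds roughly to 60%
    classification accuracy.)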
38 | ''' 39 | 40 | learn_kwargs = learn_args[alg] 41 | learn_kwargs.update(common_kwargs) 42 | 43 | learn = get_learn_function(alg) 44 | learn_fn = lambda e: learn(env=e, **learn_kwargs) 45 | env_fn = lambda: MnistEnv(seed=0, episode_len=100) 46 | 47 | simple_test(env_fn, learn_fn, 0.6) 48 | 49 | if __name__ == '__main__': 50 | test_mnist('deepq') 51 | -------------------------------------------------------------------------------- /baselines/common/tests/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv 3 | from baselines.run import get_learn_function 4 | from baselines.common.tests.util import simple_test 5 | 6 | common_kwargs = dict( 7 | total_timesteps=30000, 8 | network='mlp', 9 | gamma=0.9, 10 | seed=0, 11 | ) 12 | 13 | learn_kwargs = { 14 | 'a2c' : {}, 15 | 'acktr': {}, 16 | 'deepq': {}, 17 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), 18 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) 19 | } 20 | 21 | 22 | @pytest.mark.slow 23 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 24 | def test_discrete_identity(alg): 25 | ''' 26 | Test if the algorithm (with an mlp policy) 27 | can learn an identity transformation (i.e. return observation as an action) 28 | ''' 29 | 30 | kwargs = learn_kwargs[alg] 31 | kwargs.update(common_kwargs) 32 | 33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 34 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) 35 | simple_test(env_fn, learn_fn, 0.9) 36 | 37 | @pytest.mark.slow 38 | @pytest.mark.parametrize("alg", ['a2c', 'ppo2', 'trpo_mpi']) 39 | def test_continuous_identity(alg): 40 | ''' 41 | Test if the algorithm (with an mlp policy) 42 | can learn an identity transformation (i.e. return observation as an action) 43 | to a required precision 44 | ''' 45 | 46 | kwargs = learn_kwargs[alg] 47 | kwargs.update(common_kwargs) 48 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 49 | 50 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) 51 | simple_test(env_fn, learn_fn, -0.1) 52 | 53 | if __name__ == '__main__': 54 | test_continuous_identity('a2c') 55 | 56 | -------------------------------------------------------------------------------- /baselines/common/input.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from gym.spaces import Discrete, Box 3 | 4 | def observation_placeholder(ob_space, batch_size=None, name='Ob'): 5 | ''' 6 | Create placeholder to feed observations into of the size appropriate to the observation space 7 | 8 | Parameters: 9 | ---------- 10 | 11 | ob_space: gym.Space observation space 12 | 13 | batch_size: int size of the batch to be fed into input. Can be left None in most cases. 14 | 15 | name: str name of the placeholder 16 | 17 | Returns: 18 | ------- 19 | 20 | tensorflow placeholder tensor 21 | ''' 22 | 23 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \ 24 | 'Can only deal with Discrete and Box observation spaces for now' 25 | 26 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) 27 | 28 | 29 | def observation_input(ob_space, batch_size=None, name='Ob'): 30 | ''' 31 | Create placeholder to feed observations into of the size appropriate to the observation space, and add input 32 | encoder of the appropriate type. 
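    Illustrative example (a sketch; shapes assume a gym Discrete space, whose shape
    is the empty tuple):

        ob_ph, ob_enc = observation_input(Discrete(4))
        # ob_ph is an integer placeholder of shape (None,); ob_enc is its float32
        # one-hot encoding of shape (None, 4), ready to be fed into a network.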
33 | ''' 34 | 35 | placeholder = observation_placeholder(ob_space, batch_size, name) 36 | return placeholder, encode_observation(ob_space, placeholder) 37 | 38 | def encode_observation(ob_space, placeholder): 39 | ''' 40 | Encode input in the way that is appropriate to the observation space 41 | 42 | Parameters: 43 | ---------- 44 | 45 | ob_space: gym.Space observation space 46 | 47 | placeholder: tf.placeholder observation input placeholder 48 | ''' 49 | if isinstance(ob_space, Discrete): 50 | return tf.to_float(tf.one_hot(placeholder, ob_space.n)) 51 | 52 | elif isinstance(ob_space, Box): 53 | return tf.to_float(placeholder) 54 | else: 55 | raise NotImplementedError 56 | 57 | -------------------------------------------------------------------------------- /baselines/common/tests/envs/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import abstractmethod 3 | from gym import Env 4 | from gym.spaces import Discrete, Box 5 | 6 | 7 | class IdentityEnv(Env): 8 | def __init__( 9 | self, 10 | episode_len=None 11 | ): 12 | 13 | self.episode_len = episode_len 14 | self.time = 0 15 | self.reset() 16 | 17 | def reset(self): 18 | self._choose_next_state() 19 | self.time = 0 20 | self.observation_space = self.action_space 21 | 22 | return self.state 23 | 24 | def step(self, actions): 25 | rew = self._get_reward(actions) 26 | self._choose_next_state() 27 | done = False 28 | if self.episode_len and self.time >= self.episode_len: 29 | rew = 0 30 | done = True 31 | 32 | return self.state, rew, done, {} 33 | 34 | def _choose_next_state(self): 35 | self.state = self.action_space.sample() 36 | self.time += 1 37 | 38 | @abstractmethod 39 | def _get_reward(self, actions): 40 | raise NotImplementedError 41 | 42 | 43 | class DiscreteIdentityEnv(IdentityEnv): 44 | def __init__( 45 | self, 46 | dim, 47 | episode_len=None, 48 | ): 49 | 50 | self.action_space = Discrete(dim) 51 | super().__init__(episode_len=episode_len) 52 | 53 | def _get_reward(self, actions): 54 | return 1 if self.state == actions else 0 55 | 56 | 57 | class BoxIdentityEnv(IdentityEnv): 58 | def __init__( 59 | self, 60 | shape, 61 | episode_len=None, 62 | ): 63 | 64 | self.action_space = Box(low=-1.0, high=1.0, shape=shape) 65 | super().__init__(episode_len=episode_len) 66 | 67 | def _get_reward(self, actions): 68 | diff = actions - self.state 69 | diff = diff[:] 70 | return -0.5 * np.dot(diff, diff) 71 | -------------------------------------------------------------------------------- /baselines/gail/statistics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is highly based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py 3 | ''' 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | import baselines.common.tf_util as U 9 | 10 | 11 | class stats(): 12 | 13 | def __init__(self, scalar_keys=[], histogram_keys=[]): 14 | self.scalar_keys = scalar_keys 15 | self.histogram_keys = histogram_keys 16 | self.scalar_summaries = [] 17 | self.scalar_summaries_ph = [] 18 | self.histogram_summaries_ph = [] 19 | self.histogram_summaries = [] 20 | with tf.variable_scope('summary'): 21 | for k in scalar_keys: 22 | ph = tf.placeholder('float32', None, name=k+'.scalar.summary') 23 | sm = tf.summary.scalar(k+'.scalar.summary', ph) 24 | self.scalar_summaries_ph.append(ph) 25 | self.scalar_summaries.append(sm) 26 | for k in histogram_keys: 27 | ph = tf.placeholder('float32', None, 
name=k+'.histogram.summary') 28 | sm = tf.summary.scalar(k+'.histogram.summary', ph) 29 | self.histogram_summaries_ph.append(ph) 30 | self.histogram_summaries.append(sm) 31 | 32 | self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries) 33 | 34 | def add_all_summary(self, writer, values, iter): 35 | # Note that the order of the incoming ```values``` should be the same as the that of the 36 | # ```scalar_keys``` given in ```__init__``` 37 | if np.sum(np.isnan(values)+0) != 0: 38 | return 39 | sess = U.get_session() 40 | keys = self.scalar_summaries_ph + self.histogram_summaries_ph 41 | feed_dict = {} 42 | for k, v in zip(keys, values): 43 | feed_dict.update({k: v}) 44 | summaries_str = sess.run(self.summaries, feed_dict) 45 | writer.add_summary(summaries_str, iter) 46 | -------------------------------------------------------------------------------- /baselines/her/experiment/play.py: -------------------------------------------------------------------------------- 1 | import click 2 | import numpy as np 3 | import pickle 4 | 5 | from baselines import logger 6 | from baselines.common import set_global_seeds 7 | import baselines.her.experiment.config as config 8 | from baselines.her.rollout import RolloutWorker 9 | 10 | 11 | @click.command() 12 | @click.argument('policy_file', type=str) 13 | @click.option('--seed', type=int, default=0) 14 | @click.option('--n_test_rollouts', type=int, default=10) 15 | @click.option('--render', type=int, default=1) 16 | def main(policy_file, seed, n_test_rollouts, render): 17 | set_global_seeds(seed) 18 | 19 | # Load policy. 20 | with open(policy_file, 'rb') as f: 21 | policy = pickle.load(f) 22 | env_name = policy.info['env_name'] 23 | 24 | # Prepare params. 25 | params = config.DEFAULT_PARAMS 26 | if env_name in config.DEFAULT_ENV_PARAMS: 27 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 28 | params['env_name'] = env_name 29 | params = config.prepare_params(params) 30 | config.log_params(params, logger=logger) 31 | 32 | dims = config.configure_dims(params) 33 | 34 | eval_params = { 35 | 'exploit': True, 36 | 'use_target_net': params['test_with_polyak'], 37 | 'compute_Q': True, 38 | 'rollout_batch_size': 1, 39 | 'render': bool(render), 40 | } 41 | 42 | for name in ['T', 'gamma', 'noise_eps', 'random_eps']: 43 | eval_params[name] = params[name] 44 | 45 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 46 | evaluator.seed(seed) 47 | 48 | # Run evaluation. 
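    # Clearing the history first means the statistics gathered by evaluator.logs('test')
    # below summarise only the n_test_rollouts episodes generated here.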
49 | evaluator.clear_history() 50 | for _ in range(n_test_rollouts): 51 | evaluator.generate_rollouts() 52 | 53 | # record logs 54 | for key, val in evaluator.logs('test'): 55 | logger.record_tabular(key, np.mean(val)) 56 | logger.dump_tabular() 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/run_atari.py: -------------------------------------------------------------------------------- 1 | from baselines import deepq 2 | from baselines.common import set_global_seeds 3 | from baselines import bench 4 | import argparse 5 | from baselines import logger 6 | from baselines.common.atari_wrappers import make_atari 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 11 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') 12 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 13 | parser.add_argument('--prioritized', type=int, default=1) 14 | parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6) 15 | parser.add_argument('--dueling', type=int, default=1) 16 | parser.add_argument('--num-timesteps', type=int, default=int(10e6)) 17 | parser.add_argument('--checkpoint-freq', type=int, default=10000) 18 | parser.add_argument('--checkpoint-path', type=str, default=None) 19 | 20 | args = parser.parse_args() 21 | logger.configure() 22 | set_global_seeds(args.seed) 23 | env = make_atari(args.env) 24 | env = bench.Monitor(env, logger.get_dir()) 25 | env = deepq.wrap_atari_dqn(env) 26 | 27 | deepq.learn( 28 | env, 29 | "conv_only", 30 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 31 | hiddens=[256], 32 | dueling=bool(args.dueling), 33 | lr=1e-4, 34 | total_timesteps=args.num_timesteps, 35 | buffer_size=10000, 36 | exploration_fraction=0.1, 37 | exploration_final_eps=0.01, 38 | train_freq=4, 39 | learning_starts=10000, 40 | target_network_update_freq=1000, 41 | gamma=0.99, 42 | prioritized_replay=bool(args.prioritized), 43 | prioritized_replay_alpha=args.prioritized_replay_alpha, 44 | checkpoint_freq=args.checkpoint_freq, 45 | checkpoint_path=args.checkpoint_path, 46 | ) 47 | 48 | env.close() 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /baselines/her/actor_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from baselines.her.util import store_args, nn 3 | 4 | 5 | class ActorCritic: 6 | @store_args 7 | def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, 8 | **kwargs): 9 | """The actor-critic network and related training code. 
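        Concretely (see the network construction below), the actor is a deterministic
        policy pi(o, g) = max_u * tanh(MLP([o, g])), and the critic Q(o, g, u) is an MLP
        over the normalized observation, the goal, and the action scaled by 1/max_u.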
10 | 11 | Args: 12 | inputs_tf (dict of tensors): all necessary inputs for the network: the 13 | observation (o), the goal (g), and the action (u) 14 | dimo (int): the dimension of the observations 15 | dimg (int): the dimension of the goals 16 | dimu (int): the dimension of the actions 17 | max_u (float): the maximum magnitude of actions; action outputs will be scaled 18 | accordingly 19 | o_stats (baselines.her.Normalizer): normalizer for observations 20 | g_stats (baselines.her.Normalizer): normalizer for goals 21 | hidden (int): number of hidden units that should be used in hidden layers 22 | layers (int): number of hidden layers 23 | """ 24 | self.o_tf = inputs_tf['o'] 25 | self.g_tf = inputs_tf['g'] 26 | self.u_tf = inputs_tf['u'] 27 | 28 | # Prepare inputs for actor and critic. 29 | o = self.o_stats.normalize(self.o_tf) 30 | g = self.g_stats.normalize(self.g_tf) 31 | input_pi = tf.concat(axis=1, values=[o, g]) # for actor 32 | 33 | # Networks. 34 | with tf.variable_scope('pi'): 35 | self.pi_tf = self.max_u * tf.tanh(nn( 36 | input_pi, [self.hidden] * self.layers + [self.dimu])) 37 | with tf.variable_scope('Q'): 38 | # for policy training 39 | input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u]) 40 | self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) 41 | # for critic training 42 | input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u]) 43 | self._input_Q = input_Q # exposed for tests 44 | self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) 45 | -------------------------------------------------------------------------------- /baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | globalsum = np.zeros_like(localsum) 16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 18 | 19 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 20 | x = np.asarray(x) 21 | assert x.ndim > 0 22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 23 | sqdiffs = np.square(x - mean) 24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 25 | assert count1 == count 26 | std = np.sqrt(meansqdiff) 27 | if not keepdims: 28 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 29 | mean = mean.reshape(newshape) 30 | std = std.reshape(newshape) 31 | return mean, std, count 32 | 33 | 34 | def test_runningmeanstd(): 35 | import subprocess 36 | subprocess.check_call(['mpirun', '-np', '3', 37 | 'python','-c', 38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 39 | 40 | def _helper_runningmeanstd(): 41 | comm = MPI.COMM_WORLD 42 | np.random.seed(0) 43 | for (triple,axis) in [ 44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 47 | ]: 48 | 49 | 50 | x = np.concatenate(triple, axis=axis) 51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 52 | 53 | 54 
| ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 55 | 56 | for (a1,a2) in zipsame(ms1, ms2): 57 | print(a1, a2) 58 | assert np.allclose(a1, a2) 59 | print("ok!") 60 | 61 | -------------------------------------------------------------------------------- /baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | import shlex 6 | import subprocess 7 | 8 | # ================================================================ 9 | # Misc 10 | # ================================================================ 11 | 12 | def fmt_row(width, row, header=False): 13 | out = " | ".join(fmt_item(x, width) for x in row) 14 | if header: out = out + "\n" + "-"*len(out) 15 | return out 16 | 17 | def fmt_item(x, l): 18 | if isinstance(x, np.ndarray): 19 | assert x.ndim==0 20 | x = x.item() 21 | if isinstance(x, (float, np.float32, np.float64)): 22 | v = abs(x) 23 | if (v < 1e-4 or v > 1e+4) and v > 0: 24 | rep = "%7.2e" % x 25 | else: 26 | rep = "%7.5f" % x 27 | else: rep = str(x) 28 | return " "*(l - len(rep)) + rep 29 | 30 | color2num = dict( 31 | gray=30, 32 | red=31, 33 | green=32, 34 | yellow=33, 35 | blue=34, 36 | magenta=35, 37 | cyan=36, 38 | white=37, 39 | crimson=38 40 | ) 41 | 42 | def colorize(string, color='green', bold=False, highlight=False): 43 | attr = [] 44 | num = color2num[color] 45 | if highlight: num += 10 46 | attr.append(str(num)) 47 | if bold: attr.append('1') 48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 49 | 50 | def print_cmd(cmd, dry=False): 51 | if isinstance(cmd, str): # for shell=True 52 | pass 53 | else: 54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd) 55 | print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd)) 56 | 57 | 58 | def get_git_commit(cwd=None): 59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd).decode('utf8') 60 | 61 | def ccap(cmd, dry=False, env=None, **kwargs): 62 | print_cmd(cmd, dry) 63 | if not dry: 64 | subprocess.check_call(cmd, env=env, **kwargs) 65 | 66 | 67 | MESSAGE_DEPTH = 0 68 | 69 | @contextmanager 70 | def timed(msg): 71 | global MESSAGE_DEPTH #pylint: disable=W0603 72 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 73 | tstart = time.time() 74 | MESSAGE_DEPTH += 1 75 | yield 76 | MESSAGE_DEPTH -= 1 77 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 78 | -------------------------------------------------------------------------------- /baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += 
cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /baselines/common/tests/envs/mnist_env.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import numpy as np 3 | import tempfile 4 | import filelock 5 | from gym import Env 6 | from gym.spaces import Discrete, Box 7 | 8 | 9 | 10 | class MnistEnv(Env): 11 | def __init__( 12 | self, 13 | seed=0, 14 | episode_len=None, 15 | no_images=None 16 | ): 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | # we could use temporary directory for this with a context manager and 19 | # TemporaryDirecotry, but then each test that uses mnist would re-download the data 20 | # this way the data is not cleaned up, but we only download it once per machine 21 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') 22 | with filelock.FileLock(mnist_path + '.lock'): 23 | self.mnist = input_data.read_data_sets(mnist_path) 24 | 25 | self.np_random = np.random.RandomState() 26 | self.np_random.seed(seed) 27 | 28 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) 29 | self.action_space = Discrete(10) 30 | self.episode_len = episode_len 31 | self.time = 0 32 | self.no_images = no_images 33 | 34 | self.train_mode() 35 | self.reset() 36 | 37 | def reset(self): 38 | self._choose_next_state() 39 | self.time = 0 40 | 41 | return self.state[0] 42 | 43 | def step(self, actions): 44 | rew = self._get_reward(actions) 45 | self._choose_next_state() 46 | done = False 47 | if self.episode_len and self.time >= self.episode_len: 48 | rew = 0 49 | done = True 50 | 51 | return self.state[0], rew, done, {} 52 | 53 | def train_mode(self): 54 | self.dataset = self.mnist.train 55 | 56 | def test_mode(self): 57 | self.dataset = self.mnist.test 58 | 59 | def _choose_next_state(self): 60 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 61 | index = self.np_random.randint(0, max_index) 62 | image = self.dataset.images[index].reshape(28,28,1)*255 63 | label = self.dataset.labels[index] 64 | self.state = (image, label) 65 | self.time += 1 66 | 67 | def _get_reward(self, actions): 68 | return 1 if self.state[1] == actions else 0 69 | 70 | 71 | 
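# Illustrative smoke test -- a sketch added for clarity, not part of the original file.
# It drives the environment with random actions; running it downloads the MNIST data
# via the TensorFlow tutorial loader used in __init__ above.
if __name__ == '__main__':
    env = MnistEnv(seed=0, episode_len=10)
    ob = env.reset()
    total_rew, done = 0, False
    while not done:
        ob, rew, done, _ = env.step(env.action_space.sample())
        total_rew += rew
    # With random guesses over 10 classes, total_rew is about 1 on average.
    print('episode reward with random actions:', total_rew)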
-------------------------------------------------------------------------------- /baselines/ddpg/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class AdaptiveParamNoiseSpec(object): 5 | def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01): 6 | self.initial_stddev = initial_stddev 7 | self.desired_action_stddev = desired_action_stddev 8 | self.adoption_coefficient = adoption_coefficient 9 | 10 | self.current_stddev = initial_stddev 11 | 12 | def adapt(self, distance): 13 | if distance > self.desired_action_stddev: 14 | # Decrease stddev. 15 | self.current_stddev /= self.adoption_coefficient 16 | else: 17 | # Increase stddev. 18 | self.current_stddev *= self.adoption_coefficient 19 | 20 | def get_stats(self): 21 | stats = { 22 | 'param_noise_stddev': self.current_stddev, 23 | } 24 | return stats 25 | 26 | def __repr__(self): 27 | fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})' 28 | return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient) 29 | 30 | 31 | class ActionNoise(object): 32 | def reset(self): 33 | pass 34 | 35 | 36 | class NormalActionNoise(ActionNoise): 37 | def __init__(self, mu, sigma): 38 | self.mu = mu 39 | self.sigma = sigma 40 | 41 | def __call__(self): 42 | return np.random.normal(self.mu, self.sigma) 43 | 44 | def __repr__(self): 45 | return 'NormalActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) 46 | 47 | 48 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 49 | class OrnsteinUhlenbeckActionNoise(ActionNoise): 50 | def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None): 51 | self.theta = theta 52 | self.mu = mu 53 | self.sigma = sigma 54 | self.dt = dt 55 | self.x0 = x0 56 | self.reset() 57 | 58 | def __call__(self): 59 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) 60 | self.x_prev = x 61 | return x 62 | 63 | def reset(self): 64 | self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) 65 | 66 | def __repr__(self): 67 | return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) 68 | -------------------------------------------------------------------------------- /baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 
28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) -------------------------------------------------------------------------------- /baselines/ppo1/cnn_policy.py: -------------------------------------------------------------------------------- 1 | import baselines.common.tf_util as U 2 | import tensorflow as tf 3 | import gym 4 | from baselines.common.distributions import make_pdtype 5 | 6 | class CnnPolicy(object): 7 | recurrent = False 8 | def __init__(self, name, ob_space, ac_space, kind='large'): 9 | with tf.variable_scope(name): 10 | self._init(ob_space, ac_space, kind) 11 | self.scope = tf.get_variable_scope().name 12 | 13 | def _init(self, ob_space, ac_space, kind): 14 | assert isinstance(ob_space, gym.spaces.Box) 15 | 16 | self.pdtype = pdtype = make_pdtype(ac_space) 17 | sequence_length = None 18 | 19 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 20 | 21 | x = ob / 255.0 22 | if kind == 'small': # from A3C paper 23 | x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) 24 | x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) 25 | x = U.flattenallbut0(x) 26 | x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) 27 | elif kind == 'large': # Nature DQN 28 | x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID")) 29 | x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID")) 30 | x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID")) 31 | x = U.flattenallbut0(x) 32 | x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0))) 33 | else: 34 | raise NotImplementedError 35 | 36 | logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) 37 | self.pd = pdtype.pdfromflat(logits) 38 | self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0] 39 | 40 | self.state_in = [] 41 | self.state_out = [] 42 | 43 | stochastic = 
tf.placeholder(dtype=tf.bool, shape=()) 44 | ac = self.pd.sample() # XXX 45 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 46 | 47 | def act(self, stochastic, ob): 48 | ac1, vpred1 = self._act(stochastic, ob[None]) 49 | return ac1[0], vpred1[0] 50 | def get_variables(self): 51 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 52 | def get_trainable_variables(self): 53 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 54 | def get_initial_state(self): 55 | return [] 56 | 57 | -------------------------------------------------------------------------------- /baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from . import VecEnv 4 | from .util import copy_obs_dict, dict_to_obs, obs_space_info 5 | 6 | class DummyVecEnv(VecEnv): 7 | def __init__(self, env_fns): 8 | self.envs = [fn() for fn in env_fns] 9 | env = self.envs[0] 10 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 11 | obs_space = env.observation_space 12 | 13 | self.keys, shapes, dtypes = obs_space_info(obs_space) 14 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 15 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 16 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 17 | self.buf_infos = [{} for _ in range(self.num_envs)] 18 | self.actions = None 19 | 20 | def step_async(self, actions): 21 | listify = True 22 | try: 23 | if len(actions) == self.num_envs: 24 | listify = False 25 | except TypeError: 26 | pass 27 | 28 | if not listify: 29 | self.actions = actions 30 | else: 31 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) 32 | self.actions = [actions] 33 | 34 | def step_wait(self): 35 | for e in range(self.num_envs): 36 | action = self.actions[e] 37 | if isinstance(self.envs[e].action_space, spaces.Discrete): 38 | action = int(action) 39 | 40 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) 41 | if self.buf_dones[e]: 42 | obs = self.envs[e].reset() 43 | self._save_obs(e, obs) 44 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), 45 | self.buf_infos.copy()) 46 | 47 | def reset(self): 48 | for e in range(self.num_envs): 49 | obs = self.envs[e].reset() 50 | self._save_obs(e, obs) 51 | return self._obs_from_buf() 52 | 53 | def close(self): 54 | return 55 | 56 | def _save_obs(self, e, obs): 57 | for k in self.keys: 58 | if k is None: 59 | self.buf_obs[k][e] = obs 60 | else: 61 | self.buf_obs[k][e] = obs[k] 62 | 63 | def _obs_from_buf(self): 64 | return dict_to_obs(copy_obs_dict(self.buf_obs)) 65 | 66 | def get_images(self): 67 | return [env.render(mode='rgb_array') for env in self.envs] 68 | 69 | -------------------------------------------------------------------------------- /baselines/ppo1/run_humanoid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 4 | from baselines.common import tf_util as U 5 | from baselines import logger 6 | 7 | import gym 8 | 9 | def train(num_timesteps, seed, model_path=None): 10 | env_id = 'Humanoid-v2' 11 | from baselines.ppo1 import mlp_policy, pposgd_simple 12 | 
U.make_session(num_cpu=1).__enter__() 13 | def policy_fn(name, ob_space, ac_space): 14 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 15 | hid_size=64, num_hid_layers=2) 16 | env = make_mujoco_env(env_id, seed) 17 | 18 | # parameters below were the best found in a simple random search 19 | # these are good enough to make humanoid walk, but whether those are 20 | # an absolute best or not is not certain 21 | env = RewScale(env, 0.1) 22 | pi = pposgd_simple.learn(env, policy_fn, 23 | max_timesteps=num_timesteps, 24 | timesteps_per_actorbatch=2048, 25 | clip_param=0.2, entcoeff=0.0, 26 | optim_epochs=10, 27 | optim_stepsize=3e-4, 28 | optim_batchsize=64, 29 | gamma=0.99, 30 | lam=0.95, 31 | schedule='linear', 32 | ) 33 | env.close() 34 | if model_path: 35 | U.save_state(model_path) 36 | 37 | return pi 38 | 39 | class RewScale(gym.RewardWrapper): 40 | def __init__(self, env, scale): 41 | gym.RewardWrapper.__init__(self, env) 42 | self.scale = scale 43 | def reward(self, r): 44 | return r * self.scale 45 | 46 | def main(): 47 | logger.configure() 48 | parser = mujoco_arg_parser() 49 | parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) 50 | parser.set_defaults(num_timesteps=int(2e7)) 51 | 52 | args = parser.parse_args() 53 | 54 | if not args.play: 55 | # train the model 56 | train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) 57 | else: 58 | # construct the model object, load pre-trained model and render 59 | pi = train(num_timesteps=1, seed=args.seed) 60 | U.load_state(args.model_path) 61 | env = make_mujoco_env('Humanoid-v2', seed=0) 62 | 63 | ob = env.reset() 64 | while True: 65 | action = pi.act(stochastic=False, ob=ob)[0] 66 | ob, _, done, _ = env.step(action) 67 | env.render() 68 | if done: 69 | ob = env.reset() 70 | 71 | 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /baselines/ddpg/models.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib as tc 3 | 4 | 5 | class Model(object): 6 | def __init__(self, name): 7 | self.name = name 8 | 9 | @property 10 | def vars(self): 11 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 12 | 13 | @property 14 | def trainable_vars(self): 15 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name) 16 | 17 | @property 18 | def perturbable_vars(self): 19 | return [var for var in self.trainable_vars if 'LayerNorm' not in var.name] 20 | 21 | 22 | class Actor(Model): 23 | def __init__(self, nb_actions, name='actor', layer_norm=True): 24 | super(Actor, self).__init__(name=name) 25 | self.nb_actions = nb_actions 26 | self.layer_norm = layer_norm 27 | 28 | def __call__(self, obs, reuse=False): 29 | with tf.variable_scope(self.name) as scope: 30 | if reuse: 31 | scope.reuse_variables() 32 | 33 | x = obs 34 | x = tf.layers.dense(x, 64) 35 | if self.layer_norm: 36 | x = tc.layers.layer_norm(x, center=True, scale=True) 37 | x = tf.nn.relu(x) 38 | 39 | x = tf.layers.dense(x, 64) 40 | if self.layer_norm: 41 | x = tc.layers.layer_norm(x, center=True, scale=True) 42 | x = tf.nn.relu(x) 43 | 44 | x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) 45 | x = tf.nn.tanh(x) 46 | return x 47 | 48 | 49 | class Critic(Model): 50 | def __init__(self, name='critic', layer_norm=True): 51 | 
super(Critic, self).__init__(name=name) 52 | self.layer_norm = layer_norm 53 | 54 | def __call__(self, obs, action, reuse=False): 55 | with tf.variable_scope(self.name) as scope: 56 | if reuse: 57 | scope.reuse_variables() 58 | 59 | x = obs 60 | x = tf.layers.dense(x, 64) 61 | if self.layer_norm: 62 | x = tc.layers.layer_norm(x, center=True, scale=True) 63 | x = tf.nn.relu(x) 64 | 65 | x = tf.concat([x, action], axis=-1) 66 | x = tf.layers.dense(x, 64) 67 | if self.layer_norm: 68 | x = tc.layers.layer_norm(x, center=True, scale=True) 69 | x = tf.nn.relu(x) 70 | 71 | x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) 72 | return x 73 | 74 | @property 75 | def output_vars(self): 76 | output_vars = [var for var in self.trainable_vars if 'output' in var.name] 77 | return output_vars 78 | -------------------------------------------------------------------------------- /baselines/gail/result/gail-result.md: -------------------------------------------------------------------------------- 1 | # Results of GAIL/BC on Mujoco 2 | 3 | Here's the extensive experimental results of applying GAIL/BC on Mujoco environments, including 4 | Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, HumanoidStandup-v1. Every imitator is evaluated with seed to be 0. 5 | 6 | ## Results 7 | 8 | ### Training through iterations 9 | 10 | - Hoppers-v1 11 | 12 | 13 | - HalfCheetah-v1 14 | 15 | 16 | - Walker2d-v1 17 | 18 | 19 | - Humanoid-v1 20 | 21 | 22 | - HumanoidStandup-v1 23 | 24 | 25 | For details (e.g., adversarial loss, discriminator accuracy, etc.) about GAIL training, please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing) 26 | 27 | ### Determinstic Polciy (Set std=0) 28 | | | Un-normalized | Normalized | 29 | |---|---|---| 30 | | Hopper-v1 | | | 31 | | HalfCheetah-v1 | | | 32 | | Walker2d-v1 | | | 33 | | Humanoid-v1 | | | 34 | | HumanoidStandup-v1 | | | 35 | 36 | ### Stochatic Policy 37 | | | Un-normalized | Normalized | 38 | |---|---|---| 39 | | Hopper-v1 | | | 40 | | HalfCheetah-v1 | | | 41 | | Walker2d-v1 | | | 42 | | Humanoid-v1 | | | 43 | | HumanoidStandup-v1 | | | 44 | 45 | ### details about GAIL imitator 46 | 47 | For all environments, the 48 | imitator is trained with 1, 5, 10, 50 trajectories, where each trajectory contains at most 49 | 1024 transitions, and seed 0, 1, 2, 3, respectively. 50 | 51 | ### details about the BC imitators 52 | 53 | All BC imitators are trained with seed 0. 
54 | -------------------------------------------------------------------------------- /baselines/acer/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from baselines.common.runners import AbstractEnvRunner 3 | 4 | class Runner(AbstractEnvRunner): 5 | 6 | def __init__(self, env, model, nsteps, nstack): 7 | super().__init__(env=env, model=model, nsteps=nsteps) 8 | self.nstack = nstack 9 | nh, nw, nc = env.observation_space.shape 10 | self.nc = nc # nc = 1 for atari, but just in case 11 | self.nact = env.action_space.n 12 | nenv = self.nenv 13 | self.nbatch = nenv * nsteps 14 | self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack) 15 | self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) 16 | obs = env.reset() 17 | self.update_obs(obs) 18 | 19 | def update_obs(self, obs, dones=None): 20 | #self.obs = obs 21 | if dones is not None: 22 | self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None] 23 | self.obs = np.roll(self.obs, shift=-self.nc, axis=3) 24 | self.obs[:, :, :, -self.nc:] = obs[:, :, :, :] 25 | 26 | def run(self): 27 | enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps 28 | mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], [] 29 | for _ in range(self.nsteps): 30 | actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones) 31 | mb_obs.append(np.copy(self.obs)) 32 | mb_actions.append(actions) 33 | mb_mus.append(mus) 34 | mb_dones.append(self.dones) 35 | obs, rewards, dones, _ = self.env.step(actions) 36 | # states information for statefull models like LSTM 37 | self.states = states 38 | self.dones = dones 39 | self.update_obs(obs, dones) 40 | mb_rewards.append(rewards) 41 | enc_obs.append(obs) 42 | mb_obs.append(np.copy(self.obs)) 43 | mb_dones.append(self.dones) 44 | 45 | enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0) 46 | mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0) 47 | mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) 48 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 49 | mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0) 50 | 51 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 52 | 53 | mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done 54 | mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards 55 | 56 | # shapes are now [nenv, nsteps, []] 57 | # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy. 
58 | 59 | return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks 60 | 61 | -------------------------------------------------------------------------------- /baselines/acktr/value_functions.py: -------------------------------------------------------------------------------- 1 | from baselines import logger 2 | import numpy as np 3 | import baselines.common as common 4 | from baselines.common import tf_util as U 5 | import tensorflow as tf 6 | from baselines.acktr import kfac 7 | from baselines.acktr.utils import dense 8 | 9 | class NeuralNetValueFunction(object): 10 | def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613 11 | X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations 12 | vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg') 13 | wd_dict = {} 14 | h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) 15 | h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) 16 | vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0] 17 | sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n)) 18 | wd_loss = tf.get_collection("vf_losses", None) 19 | loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss) 20 | loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n))) 21 | self._predict = U.function([X], vpred_n) 22 | optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \ 23 | clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \ 24 | async_=1, kfac_update=2, cold_iter=50, \ 25 | weight_decay_dict=wd_dict, max_grad_norm=None) 26 | vf_var_list = [] 27 | for var in tf.trainable_variables(): 28 | if "vf" in var.name: 29 | vf_var_list.append(var) 30 | 31 | update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list) 32 | self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101 33 | U.initialize() # Initialize uninitialized TF variables 34 | def _preproc(self, path): 35 | l = pathlength(path) 36 | al = np.arange(l).reshape(-1,1)/10.0 37 | act = path["action_dist"].astype('float32') 38 | X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1) 39 | return X 40 | def predict(self, path): 41 | return self._predict(self._preproc(path)) 42 | def fit(self, paths, targvals): 43 | X = np.concatenate([self._preproc(p) for p in paths]) 44 | y = np.concatenate(targvals) 45 | logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y)) 46 | for _ in range(25): self.do_update(X, y) 47 | logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y)) 48 | 49 | def pathlength(path): 50 | return path["reward"].shape[0] 51 | -------------------------------------------------------------------------------- /baselines/a2c/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from baselines.a2c.utils import discount_with_dones 3 | from baselines.common.runners import AbstractEnvRunner 4 | 5 | class Runner(AbstractEnvRunner): 6 | 7 | def __init__(self, env, model, nsteps=5, gamma=0.99): 8 | super().__init__(env=env, model=model, nsteps=nsteps) 9 | self.gamma = gamma 10 | self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()] 11 | self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype 12 | 13 | 
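    # Note on the return computation in run() below: when gamma > 0 and the rollout does
    # not end in a terminal state, rewards are discounted with a bootstrap from the value
    # function, e.g. for rewards [r0, r1] and bootstrap value v:
    #     R0 = r0 + gamma*r1 + gamma**2 * v
    #     R1 = r1 + gamma*v
    # which is what discount_with_dones(rewards + [value], dones + [0], gamma)[:-1] computes.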
def run(self): 14 | mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] 15 | mb_states = self.states 16 | for n in range(self.nsteps): 17 | actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones) 18 | mb_obs.append(np.copy(self.obs)) 19 | mb_actions.append(actions) 20 | mb_values.append(values) 21 | mb_dones.append(self.dones) 22 | obs, rewards, dones, _ = self.env.step(actions) 23 | self.states = states 24 | self.dones = dones 25 | for n, done in enumerate(dones): 26 | if done: 27 | self.obs[n] = self.obs[n]*0 28 | self.obs = obs 29 | mb_rewards.append(rewards) 30 | mb_dones.append(self.dones) 31 | #batch of steps to batch of rollouts 32 | 33 | mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape) 34 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 35 | mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) 36 | mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) 37 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 38 | mb_masks = mb_dones[:, :-1] 39 | mb_dones = mb_dones[:, 1:] 40 | 41 | 42 | if self.gamma > 0.0: 43 | #discount/bootstrap off value fn 44 | last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist() 45 | for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): 46 | rewards = rewards.tolist() 47 | dones = dones.tolist() 48 | if dones[-1] == 0: 49 | rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] 50 | else: 51 | rewards = discount_with_dones(rewards, dones, self.gamma) 52 | 53 | mb_rewards[n] = rewards 54 | 55 | mb_actions = mb_actions.reshape(self.batch_action_shape) 56 | 57 | mb_rewards = mb_rewards.flatten() 58 | mb_values = mb_values.flatten() 59 | mb_masks = mb_masks.flatten() 60 | return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values 61 | -------------------------------------------------------------------------------- /baselines/deepq/utils.py: -------------------------------------------------------------------------------- 1 | from baselines.common.input import observation_input 2 | from baselines.common.tf_util import adjust_shape 3 | 4 | import tensorflow as tf 5 | 6 | # ================================================================ 7 | # Placeholders 8 | # ================================================================ 9 | 10 | 11 | class TfInput(object): 12 | def __init__(self, name="(unnamed)"): 13 | """Generalized Tensorflow placeholder. The main differences are: 14 | - possibly uses multiple placeholders internally and returns multiple values 15 | - can apply light postprocessing to the value feed to placeholder. 16 | """ 17 | self.name = name 18 | 19 | def get(self): 20 | """Return the tf variable(s) representing the possibly postprocessed value 21 | of placeholder(s). 
22 | """ 23 | raise NotImplemented() 24 | 25 | def make_feed_dict(data): 26 | """Given data input it to the placeholder(s).""" 27 | raise NotImplemented() 28 | 29 | 30 | class PlaceholderTfInput(TfInput): 31 | def __init__(self, placeholder): 32 | """Wrapper for regular tensorflow placeholder.""" 33 | super().__init__(placeholder.name) 34 | self._placeholder = placeholder 35 | 36 | def get(self): 37 | return self._placeholder 38 | 39 | def make_feed_dict(self, data): 40 | return {self._placeholder: adjust_shape(self._placeholder, data)} 41 | 42 | 43 | class Uint8Input(PlaceholderTfInput): 44 | def __init__(self, shape, name=None): 45 | """Takes input in uint8 format which is cast to float32 and divided by 255 46 | before passing it to the model. 47 | 48 | On GPU this ensures lower data transfer times. 49 | 50 | Parameters 51 | ---------- 52 | shape: [int] 53 | shape of the tensor. 54 | name: str 55 | name of the underlying placeholder 56 | """ 57 | 58 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 59 | self._shape = shape 60 | self._output = tf.cast(super().get(), tf.float32) / 255.0 61 | 62 | def get(self): 63 | return self._output 64 | 65 | 66 | class ObservationInput(PlaceholderTfInput): 67 | def __init__(self, observation_space, name=None): 68 | """Creates an input placeholder tailored to a specific observation space 69 | 70 | Parameters 71 | ---------- 72 | 73 | observation_space: 74 | observation space of the environment. Should be one of the gym.spaces types 75 | name: str 76 | tensorflow name of the underlying placeholder 77 | """ 78 | inpt, self.processed_inpt = observation_input(observation_space, name=name) 79 | super().__init__(inpt) 80 | 81 | def get(self): 82 | return self.processed_inpt 83 | 84 | 85 | -------------------------------------------------------------------------------- /baselines/her/her.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): 5 | """Creates a sample function that can be used for HER experience replay. 6 | 7 | Args: 8 | replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none', 9 | regular DDPG experience replay is used 10 | replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times 11 | as many HER replays as regular replays are used) 12 | reward_fun (function): function to re-compute the reward with substituted goals 13 | """ 14 | if replay_strategy == 'future': 15 | future_p = 1 - (1. / (1 + replay_k)) 16 | else: # 'replay_strategy' == 'none' 17 | future_p = 0 18 | 19 | def _sample_her_transitions(episode_batch, batch_size_in_transitions): 20 | """episode_batch is {key: array(buffer_size x T x dim_key)} 21 | """ 22 | T = episode_batch['u'].shape[1] 23 | rollout_batch_size = episode_batch['u'].shape[0] 24 | batch_size = batch_size_in_transitions 25 | 26 | # Select which episodes and time steps to use. 27 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 28 | t_samples = np.random.randint(T, size=batch_size) 29 | transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() 30 | for key in episode_batch.keys()} 31 | 32 | # Select future time indexes proportional with probability future_p. These 33 | # will be used for HER replay by substituting in future goals. 
34 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 35 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples) 36 | future_offset = future_offset.astype(int) 37 | future_t = (t_samples + 1 + future_offset)[her_indexes] 38 | 39 | # Replace goal with achieved goal but only for the previously-selected 40 | # HER transitions (as defined by her_indexes). For the other transitions, 41 | # keep the original goal. 42 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 43 | transitions['g'][her_indexes] = future_ag 44 | 45 | # Reconstruct info dictionary for reward computation. 46 | info = {} 47 | for key, value in transitions.items(): 48 | if key.startswith('info_'): 49 | info[key.replace('info_', '')] = value 50 | 51 | # Re-compute reward since we may have substituted the goal. 52 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 53 | reward_params['info'] = info 54 | transitions['r'] = reward_fun(**reward_params) 55 | 56 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 57 | for k in transitions.keys()} 58 | 59 | assert(transitions['u'].shape[0] == batch_size_in_transitions) 60 | 61 | return transitions 62 | 63 | return _sample_her_transitions 64 | -------------------------------------------------------------------------------- /baselines/common/tests/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from gym.spaces import np_random 4 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 5 | 6 | N_TRIALS = 10000 7 | N_EPISODES = 100 8 | 9 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): 10 | np.random.seed(0) 11 | np_random.seed(0) 12 | 13 | env = DummyVecEnv([env_fn]) 14 | 15 | 16 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 17 | tf.set_random_seed(0) 18 | 19 | model = learn_fn(env) 20 | 21 | sum_rew = 0 22 | done = True 23 | 24 | for i in range(n_trials): 25 | if done: 26 | obs = env.reset() 27 | state = model.initial_state 28 | 29 | if state is not None: 30 | a, v, state, _ = model.step(obs, S=state, M=[False]) 31 | else: 32 | a, v, _, _ = model.step(obs) 33 | 34 | obs, rew, done, _ = env.step(a) 35 | sum_rew += float(rew) 36 | 37 | print("Reward in {} trials is {}".format(n_trials, sum_rew)) 38 | assert sum_rew > min_reward_fraction * n_trials, \ 39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) 40 | 41 | 42 | 43 | def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): 44 | env = DummyVecEnv([env_fn]) 45 | 46 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 47 | model = learn_fn(env) 48 | 49 | N_TRIALS = 100 50 | 51 | observations, actions, rewards = rollout(env, model, N_TRIALS) 52 | rewards = [sum(r) for r in rewards] 53 | 54 | avg_rew = sum(rewards) / N_TRIALS 55 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) 56 | assert avg_rew > min_avg_reward, \ 57 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) 58 | 59 | def rollout(env, model, n_trials): 60 | rewards = [] 61 | actions = [] 62 | observations = [] 63 | 64 | for i in range(n_trials): 65 | obs = env.reset() 66 | state = model.initial_state 67 | episode_rew = [] 68 | episode_actions = [] 69 | episode_obs = [] 70 | 71 | 
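# Step through one full episode, threading the recurrent state (S) and a dummy done mask (M)
# back into model.step for stateful (e.g. LSTM) policies.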
while True: 72 | if state is not None: 73 | a, v, state, _ = model.step(obs, S=state, M=[False]) 74 | else: 75 | a,v, _, _ = model.step(obs) 76 | 77 | obs, rew, done, _ = env.step(a) 78 | 79 | episode_rew.append(rew) 80 | episode_actions.append(a) 81 | episode_obs.append(obs) 82 | 83 | if done: 84 | break 85 | 86 | rewards.append(episode_rew) 87 | actions.append(episode_actions) 88 | observations.append(episode_obs) 89 | 90 | return observations, actions, rewards 91 | 92 | -------------------------------------------------------------------------------- /baselines/ddpg/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RingBuffer(object): 5 | def __init__(self, maxlen, shape, dtype='float32'): 6 | self.maxlen = maxlen 7 | self.start = 0 8 | self.length = 0 9 | self.data = np.zeros((maxlen,) + shape).astype(dtype) 10 | 11 | def __len__(self): 12 | return self.length 13 | 14 | def __getitem__(self, idx): 15 | if idx < 0 or idx >= self.length: 16 | raise KeyError() 17 | return self.data[(self.start + idx) % self.maxlen] 18 | 19 | def get_batch(self, idxs): 20 | return self.data[(self.start + idxs) % self.maxlen] 21 | 22 | def append(self, v): 23 | if self.length < self.maxlen: 24 | # We have space, simply increase the length. 25 | self.length += 1 26 | elif self.length == self.maxlen: 27 | # No space, "remove" the first item. 28 | self.start = (self.start + 1) % self.maxlen 29 | else: 30 | # This should never happen. 31 | raise RuntimeError() 32 | self.data[(self.start + self.length - 1) % self.maxlen] = v 33 | 34 | 35 | def array_min2d(x): 36 | x = np.array(x) 37 | if x.ndim >= 2: 38 | return x 39 | return x.reshape(-1, 1) 40 | 41 | 42 | class Memory(object): 43 | def __init__(self, limit, action_shape, observation_shape): 44 | self.limit = limit 45 | 46 | self.observations0 = RingBuffer(limit, shape=observation_shape) 47 | self.actions = RingBuffer(limit, shape=action_shape) 48 | self.rewards = RingBuffer(limit, shape=(1,)) 49 | self.terminals1 = RingBuffer(limit, shape=(1,)) 50 | self.observations1 = RingBuffer(limit, shape=observation_shape) 51 | 52 | def sample(self, batch_size): 53 | # Draw such that we always have a proceeding element. 
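# Note: np.random.random_integers(n, size=k) samples integers from [1, n] inclusive and is
# deprecated in newer NumPy releases; np.random.randint(1, n + 1, size=k) is the drop-in
# replacement. Here n = self.nb_entries - 2, which keeps idx + 1 a valid index.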
54 | batch_idxs = np.random.random_integers(self.nb_entries - 2, size=batch_size) 55 | 56 | obs0_batch = self.observations0.get_batch(batch_idxs) 57 | obs1_batch = self.observations1.get_batch(batch_idxs) 58 | action_batch = self.actions.get_batch(batch_idxs) 59 | reward_batch = self.rewards.get_batch(batch_idxs) 60 | terminal1_batch = self.terminals1.get_batch(batch_idxs) 61 | 62 | result = { 63 | 'obs0': array_min2d(obs0_batch), 64 | 'obs1': array_min2d(obs1_batch), 65 | 'rewards': array_min2d(reward_batch), 66 | 'actions': array_min2d(action_batch), 67 | 'terminals1': array_min2d(terminal1_batch), 68 | } 69 | return result 70 | 71 | def append(self, obs0, action, reward, obs1, terminal1, training=True): 72 | if not training: 73 | return 74 | 75 | self.observations0.append(obs0) 76 | self.actions.append(action) 77 | self.rewards.append(reward) 78 | self.observations1.append(obs1) 79 | self.terminals1.append(terminal1) 80 | 81 | @property 82 | def nb_entries(self): 83 | return len(self.observations0) 84 | -------------------------------------------------------------------------------- /baselines/ppo1/mlp_policy.py: -------------------------------------------------------------------------------- 1 | from baselines.common.mpi_running_mean_std import RunningMeanStd 2 | import baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import gym 5 | from baselines.common.distributions import make_pdtype 6 | 7 | class MlpPolicy(object): 8 | recurrent = False 9 | def __init__(self, name, *args, **kwargs): 10 | with tf.variable_scope(name): 11 | self._init(*args, **kwargs) 12 | self.scope = tf.get_variable_scope().name 13 | 14 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 15 | assert isinstance(ob_space, gym.spaces.Box) 16 | 17 | self.pdtype = pdtype = make_pdtype(ac_space) 18 | sequence_length = None 19 | 20 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 21 | 22 | with tf.variable_scope("obfilter"): 23 | self.ob_rms = RunningMeanStd(shape=ob_space.shape) 24 | 25 | with tf.variable_scope('vf'): 26 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 27 | last_out = obz 28 | for i in range(num_hid_layers): 29 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))) 30 | self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0] 31 | 32 | with tf.variable_scope('pol'): 33 | last_out = obz 34 | for i in range(num_hid_layers): 35 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))) 36 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 37 | mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01)) 38 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) 39 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 40 | else: 41 | pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) 42 | 43 | self.pd = pdtype.pdfromflat(pdparam) 44 | 45 | self.state_in = [] 46 | self.state_out = [] 47 | 48 | stochastic = tf.placeholder(dtype=tf.bool, shape=()) 49 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) 50 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 51 | 52 | def 
act(self, stochastic, ob): 53 | ac1, vpred1 = self._act(stochastic, ob[None]) 54 | return ac1[0], vpred1[0] 55 | def get_variables(self): 56 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 57 | def get_trainable_variables(self): 58 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 59 | def get_initial_state(self): 60 | return [] 61 | 62 | -------------------------------------------------------------------------------- /baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) -------------------------------------------------------------------------------- /baselines/acer/policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 
from baselines.common.policies import nature_cnn 4 | from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample 5 | 6 | 7 | class AcerCnnPolicy(object): 8 | 9 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False): 10 | nbatch = nenv * nsteps 11 | nh, nw, nc = ob_space.shape 12 | ob_shape = (nbatch, nh, nw, nc * nstack) 13 | nact = ac_space.n 14 | X = tf.placeholder(tf.uint8, ob_shape) # obs 15 | with tf.variable_scope("model", reuse=reuse): 16 | h = nature_cnn(X) 17 | pi_logits = fc(h, 'pi', nact, init_scale=0.01) 18 | pi = tf.nn.softmax(pi_logits) 19 | q = fc(h, 'q', nact) 20 | 21 | a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead 22 | self.initial_state = [] # not stateful 23 | self.X = X 24 | self.pi = pi # actual policy params now 25 | self.pi_logits = pi_logits 26 | self.q = q 27 | self.vf = q 28 | 29 | def step(ob, *args, **kwargs): 30 | # returns actions, mus, states 31 | a0, pi0 = sess.run([a, pi], {X: ob}) 32 | return a0, pi0, [] # dummy state 33 | 34 | def out(ob, *args, **kwargs): 35 | pi0, q0 = sess.run([pi, q], {X: ob}) 36 | return pi0, q0 37 | 38 | def act(ob, *args, **kwargs): 39 | return sess.run(a, {X: ob}) 40 | 41 | self.step = step 42 | self.out = out 43 | self.act = act 44 | 45 | class AcerLstmPolicy(object): 46 | 47 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256): 48 | nbatch = nenv * nsteps 49 | nh, nw, nc = ob_space.shape 50 | ob_shape = (nbatch, nh, nw, nc * nstack) 51 | nact = ac_space.n 52 | X = tf.placeholder(tf.uint8, ob_shape) # obs 53 | M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) 54 | S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states 55 | with tf.variable_scope("model", reuse=reuse): 56 | h = nature_cnn(X) 57 | 58 | # lstm 59 | xs = batch_to_seq(h, nenv, nsteps) 60 | ms = batch_to_seq(M, nenv, nsteps) 61 | h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) 62 | h5 = seq_to_batch(h5) 63 | 64 | pi_logits = fc(h5, 'pi', nact, init_scale=0.01) 65 | pi = tf.nn.softmax(pi_logits) 66 | q = fc(h5, 'q', nact) 67 | 68 | a = sample(pi_logits) # could change this to use self.pi instead 69 | self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) 70 | self.X = X 71 | self.M = M 72 | self.S = S 73 | self.pi = pi # actual policy params now 74 | self.q = q 75 | 76 | def step(ob, state, mask, *args, **kwargs): 77 | # returns actions, mus, states 78 | a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask}) 79 | return a0, pi0, s 80 | 81 | self.step = step 82 | -------------------------------------------------------------------------------- /baselines/common/filters.py: -------------------------------------------------------------------------------- 1 | from .running_stat import RunningStat 2 | from collections import deque 3 | import numpy as np 4 | 5 | class Filter(object): 6 | def __call__(self, x, update=True): 7 | raise NotImplementedError 8 | def reset(self): 9 | pass 10 | 11 | class IdentityFilter(Filter): 12 | def __call__(self, x, update=True): 13 | return x 14 | 15 | class CompositionFilter(Filter): 16 | def __init__(self, fs): 17 | self.fs = fs 18 | def __call__(self, x, update=True): 19 | for f in self.fs: 20 | x = f(x) 21 | return x 22 | def output_shape(self, input_space): 23 | out = input_space.shape 24 | for f in self.fs: 25 | out = f.output_shape(out) 26 | return out 27 | 28 | class ZFilter(Filter): 29 | """ 30 | y = (x-mean)/std 31 | using running estimates of mean,std 32 | """ 33 | 34 | def __init__(self, 
shape, demean=True, destd=True, clip=10.0): 35 | self.demean = demean 36 | self.destd = destd 37 | self.clip = clip 38 | 39 | self.rs = RunningStat(shape) 40 | 41 | def __call__(self, x, update=True): 42 | if update: self.rs.push(x) 43 | if self.demean: 44 | x = x - self.rs.mean 45 | if self.destd: 46 | x = x / (self.rs.std+1e-8) 47 | if self.clip: 48 | x = np.clip(x, -self.clip, self.clip) 49 | return x 50 | def output_shape(self, input_space): 51 | return input_space.shape 52 | 53 | class AddClock(Filter): 54 | def __init__(self): 55 | self.count = 0 56 | def reset(self): 57 | self.count = 0 58 | def __call__(self, x, update=True): 59 | return np.append(x, self.count/100.0) 60 | def output_shape(self, input_space): 61 | return (input_space.shape[0]+1,) 62 | 63 | class FlattenFilter(Filter): 64 | def __call__(self, x, update=True): 65 | return x.ravel() 66 | def output_shape(self, input_space): 67 | return (int(np.prod(input_space.shape)),) 68 | 69 | class Ind2OneHotFilter(Filter): 70 | def __init__(self, n): 71 | self.n = n 72 | def __call__(self, x, update=True): 73 | out = np.zeros(self.n) 74 | out[x] = 1 75 | return out 76 | def output_shape(self, input_space): 77 | return (input_space.n,) 78 | 79 | class DivFilter(Filter): 80 | def __init__(self, divisor): 81 | self.divisor = divisor 82 | def __call__(self, x, update=True): 83 | return x / self.divisor 84 | def output_shape(self, input_space): 85 | return input_space.shape 86 | 87 | class StackFilter(Filter): 88 | def __init__(self, length): 89 | self.stack = deque(maxlen=length) 90 | def reset(self): 91 | self.stack.clear() 92 | def __call__(self, x, update=True): 93 | self.stack.append(x) 94 | while len(self.stack) < self.stack.maxlen: 95 | self.stack.append(x) 96 | return np.concatenate(self.stack, axis=-1) 97 | def output_shape(self, input_space): 98 | return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) 99 | -------------------------------------------------------------------------------- /baselines/gail/mlp_policy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | from baselines/ppo1/mlp_policy.py and add simple modification 3 | (1) add reuse argument 4 | (2) cache the `stochastic` placeholder 5 | ''' 6 | import tensorflow as tf 7 | import gym 8 | 9 | import baselines.common.tf_util as U 10 | from baselines.common.mpi_running_mean_std import RunningMeanStd 11 | from baselines.common.distributions import make_pdtype 12 | from baselines.acktr.utils import dense 13 | 14 | 15 | class MlpPolicy(object): 16 | recurrent = False 17 | 18 | def __init__(self, name, reuse=False, *args, **kwargs): 19 | with tf.variable_scope(name): 20 | if reuse: 21 | tf.get_variable_scope().reuse_variables() 22 | self._init(*args, **kwargs) 23 | self.scope = tf.get_variable_scope().name 24 | 25 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 26 | assert isinstance(ob_space, gym.spaces.Box) 27 | 28 | self.pdtype = pdtype = make_pdtype(ac_space) 29 | sequence_length = None 30 | 31 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 32 | 33 | with tf.variable_scope("obfilter"): 34 | self.ob_rms = RunningMeanStd(shape=ob_space.shape) 35 | 36 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 37 | last_out = obz 38 | for i in range(num_hid_layers): 39 | last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0))) 40 | self.vpred = 
dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] 41 | 42 | last_out = obz 43 | for i in range(num_hid_layers): 44 | last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0))) 45 | 46 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 47 | mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) 48 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) 49 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 50 | else: 51 | pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) 52 | 53 | self.pd = pdtype.pdfromflat(pdparam) 54 | 55 | self.state_in = [] 56 | self.state_out = [] 57 | 58 | # change for BC 59 | stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) 60 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) 61 | self.ac = ac 62 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 63 | 64 | def act(self, stochastic, ob): 65 | ac1, vpred1 = self._act(stochastic, ob[None]) 66 | return ac1[0], vpred1[0] 67 | 68 | def get_variables(self): 69 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 70 | 71 | def get_trainable_variables(self): 72 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 73 | 74 | def get_initial_state(self): 75 | return [] 76 | -------------------------------------------------------------------------------- /baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 
72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /baselines/common/vec_env/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiprocessing import Process, Pipe 3 | from . import VecEnv, CloudpickleWrapper 4 | 5 | def worker(remote, parent_remote, env_fn_wrapper): 6 | parent_remote.close() 7 | env = env_fn_wrapper.x() 8 | try: 9 | while True: 10 | cmd, data = remote.recv() 11 | if cmd == 'step': 12 | ob, reward, done, info = env.step(data) 13 | if done: 14 | ob = env.reset() 15 | remote.send((ob, reward, done, info)) 16 | elif cmd == 'reset': 17 | ob = env.reset() 18 | remote.send(ob) 19 | elif cmd == 'render': 20 | remote.send(env.render(mode='rgb_array')) 21 | elif cmd == 'close': 22 | remote.close() 23 | break 24 | elif cmd == 'get_spaces': 25 | remote.send((env.observation_space, env.action_space)) 26 | else: 27 | raise NotImplementedError 28 | except KeyboardInterrupt: 29 | print('SubprocVecEnv worker: got KeyboardInterrupt') 30 | finally: 31 | env.close() 32 | 33 | 34 | class SubprocVecEnv(VecEnv): 35 | def __init__(self, env_fns, spaces=None): 36 | """ 37 | envs: list of gym environments to run in subprocesses 38 | """ 39 | self.waiting = False 40 | nenvs = len(env_fns) 41 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 42 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 43 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 44 | for p in self.ps: 45 | p.daemon = True # if the main process crashes, we should not cause things to hang 46 | p.start() 47 | for remote in self.work_remotes: 48 | remote.close() 49 | 50 | self.remotes[0].send(('get_spaces', None)) 51 | observation_space, action_space = self.remotes[0].recv() 52 | self.viewer = None 53 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 54 | 55 | def step_async(self, actions): 56 | for remote, action in zip(self.remotes, actions): 57 | remote.send(('step', action)) 58 | self.waiting = True 59 | 60 | def step_wait(self): 61 | results = [remote.recv() for remote in self.remotes] 62 | self.waiting = False 63 | obs, rews, dones, infos = zip(*results) 64 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 65 | 66 | def reset(self): 67 | for remote in self.remotes: 68 | remote.send(('reset', None)) 69 | return np.stack([remote.recv() for remote in self.remotes]) 70 | 71 | def 
close_extras(self): 72 | if self.waiting: 73 | for remote in self.remotes: 74 | remote.recv() 75 | for remote in self.remotes: 76 | remote.send(('close', None)) 77 | for p in self.ps: 78 | p.join() 79 | 80 | def get_images(self): 81 | for pipe in self.remotes: 82 | pipe.send(('render', None)) 83 | imgs = [pipe.recv() for pipe in self.remotes] 84 | return imgs 85 | -------------------------------------------------------------------------------- /baselines/common/tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import pytest 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from baselines.common.tests.envs.mnist_env import MnistEnv 8 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 9 | from baselines.run import get_learn_function 10 | from baselines.common.tf_util import make_session, get_session 11 | 12 | from functools import partial 13 | 14 | 15 | learn_kwargs = { 16 | 'deepq': {}, 17 | 'a2c': {}, 18 | 'acktr': {}, 19 | 'ppo2': {'nminibatches': 1, 'nsteps': 10}, 20 | 'trpo_mpi': {}, 21 | } 22 | 23 | network_kwargs = { 24 | 'mlp': {}, 25 | 'cnn': {'pad': 'SAME'}, 26 | 'lstm': {}, 27 | 'cnn_lnlstm': {'pad': 'SAME'} 28 | } 29 | 30 | 31 | @pytest.mark.parametrize("learn_fn", learn_kwargs.keys()) 32 | @pytest.mark.parametrize("network_fn", network_kwargs.keys()) 33 | def test_serialization(learn_fn, network_fn): 34 | ''' 35 | Test if the trained model can be serialized 36 | ''' 37 | 38 | 39 | if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: 40 | # TODO make acktr work with recurrent policies 41 | # and test 42 | # github issue: https://github.com/openai/baselines/issues/194 43 | return 44 | 45 | env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)]) 46 | ob = env.reset().copy() 47 | learn = get_learn_function(learn_fn) 48 | 49 | kwargs = {} 50 | kwargs.update(network_kwargs[network_fn]) 51 | kwargs.update(learn_kwargs[learn_fn]) 52 | 53 | 54 | learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs) 55 | 56 | with tempfile.TemporaryDirectory() as td: 57 | model_path = os.path.join(td, 'serialization_test_model') 58 | 59 | with tf.Graph().as_default(), make_session().as_default(): 60 | model = learn(total_timesteps=100) 61 | model.save(model_path) 62 | mean1, std1 = _get_action_stats(model, ob) 63 | variables_dict1 = _serialize_variables() 64 | 65 | with tf.Graph().as_default(), make_session().as_default(): 66 | model = learn(total_timesteps=0, load_path=model_path) 67 | mean2, std2 = _get_action_stats(model, ob) 68 | variables_dict2 = _serialize_variables() 69 | 70 | for k, v in variables_dict1.items(): 71 | np.testing.assert_allclose(v, variables_dict2[k], atol=0.01, 72 | err_msg='saved and loaded variable {} value mismatch'.format(k)) 73 | 74 | np.testing.assert_allclose(mean1, mean2, atol=0.5) 75 | np.testing.assert_allclose(std1, std2, atol=0.5) 76 | 77 | 78 | 79 | def _serialize_variables(): 80 | sess = get_session() 81 | variables = tf.trainable_variables() 82 | values = sess.run(variables) 83 | return {var.name: value for var, value in zip(variables, values)} 84 | 85 | 86 | def _get_action_stats(model, ob): 87 | ntrials = 1000 88 | if model.initial_state is None or model.initial_state == []: 89 | actions = np.array([model.step(ob)[0] for _ in range(ntrials)]) 90 | else: 91 | actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)]) 92 | 93 | mean = np.mean(actions, axis=0) 94 | 
std = np.std(actions, axis=0) 95 | 96 | return mean, std 97 | 98 | -------------------------------------------------------------------------------- /baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 4 | 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['svg.fonttype'] = 'none' 7 | 8 | from baselines.bench.monitor import load_results 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 14 | EPISODES_WINDOW = 100 15 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 16 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 17 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 18 | 19 | def rolling_window(a, window): 20 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 21 | strides = a.strides + (a.strides[-1],) 22 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 23 | 24 | def window_func(x, y, window, func): 25 | yw = rolling_window(y, window) 26 | yw_func = func(yw, axis=-1) 27 | return x[window-1:], yw_func 28 | 29 | def ts2xy(ts, xaxis): 30 | if xaxis == X_TIMESTEPS: 31 | x = np.cumsum(ts.l.values) 32 | y = ts.r.values 33 | elif xaxis == X_EPISODES: 34 | x = np.arange(len(ts)) 35 | y = ts.r.values 36 | elif xaxis == X_WALLTIME: 37 | x = ts.t.values / 3600. 38 | y = ts.r.values 39 | else: 40 | raise NotImplementedError 41 | return x, y 42 | 43 | def plot_curves(xy_list, xaxis, title): 44 | plt.figure(figsize=(8,2)) 45 | maxx = max(xy[0][-1] for xy in xy_list) 46 | minx = 0 47 | for (i, (x, y)) in enumerate(xy_list): 48 | color = COLORS[i] 49 | plt.scatter(x, y, s=2) 50 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 51 | plt.plot(x, y_mean, color=color) 52 | plt.xlim(minx, maxx) 53 | plt.title(title) 54 | plt.xlabel(xaxis) 55 | plt.ylabel("Episode Rewards") 56 | plt.tight_layout() 57 | 58 | def plot_results(dirs, num_timesteps, xaxis, task_name): 59 | tslist = [] 60 | for dir in dirs: 61 | ts = load_results(dir) 62 | ts = ts[ts.l.cumsum() <= num_timesteps] 63 | tslist.append(ts) 64 | xy_list = [ts2xy(ts, xaxis) for ts in tslist] 65 | plot_curves(xy_list, xaxis, task_name) 66 | 67 | # Example usage in jupyter-notebook 68 | # from baselines import log_viewer 69 | # %matplotlib inline 70 | # log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") 71 | # Here ./log is a directory containing the monitor.csv files 72 | 73 | def main(): 74 | import argparse 75 | import os 76 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 77 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log']) 78 | parser.add_argument('--num_timesteps', type=int, default=int(10e6)) 79 | parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) 80 | parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout') 81 | args = parser.parse_args() 82 | args.dirs = [os.path.abspath(dir) for dir in args.dirs] 83 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name) 84 | plt.show() 85 | 86 | if __name__ == '__main__': 87 | main() 
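The stride-trick smoothing above is easiest to see on a toy array. The following standalone snippet (an illustrative check, not part of the repository; the two helpers are copied verbatim from the file above) shows how rolling_window and window_func produce a trailing moving average:

import numpy as np

def rolling_window(a, window):
    # view of `a` as overlapping windows of the given length (no copy)
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def window_func(x, y, window, func):
    # apply `func` over each trailing window of y, trimming x to match
    yw = rolling_window(y, window)
    yw_func = func(yw, axis=-1)
    return x[window-1:], yw_func

x = np.arange(6)
y = np.array([0., 1., 2., 3., 4., 5.])
print(rolling_window(y, 3))          # [[0. 1. 2.] [1. 2. 3.] [2. 3. 4.] [3. 4. 5.]]
xs, y_mean = window_func(x, y, 3, np.mean)
print(xs)                            # [2 3 4 5]
print(y_mean)                        # [1. 2. 3. 4.]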
-------------------------------------------------------------------------------- /baselines/common/mpi_util.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from mpi4py import MPI 3 | import os, numpy as np 4 | import platform 5 | import shutil 6 | import subprocess 7 | 8 | def sync_from_root(sess, variables, comm=None): 9 | """ 10 | Send the root node's parameters to every worker. 11 | Arguments: 12 | sess: the TensorFlow session. 13 | variables: all parameter variables including optimizer's 14 | """ 15 | if comm is None: comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | for var in variables: 18 | if rank == 0: 19 | comm.Bcast(sess.run(var)) 20 | else: 21 | import tensorflow as tf 22 | returned_var = np.empty(var.shape, dtype='float32') 23 | comm.Bcast(returned_var) 24 | sess.run(tf.assign(var, returned_var)) 25 | 26 | def gpu_count(): 27 | """ 28 | Count the GPUs on this machine. 29 | """ 30 | if shutil.which('nvidia-smi') is None: 31 | return 0 32 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) 33 | return max(0, len(output.split(b'\n')) - 2) 34 | 35 | def setup_mpi_gpus(): 36 | """ 37 | Set CUDA_VISIBLE_DEVICES using MPI. 38 | """ 39 | num_gpus = gpu_count() 40 | if num_gpus == 0: 41 | return 42 | local_rank, _ = get_local_rank_size(MPI.COMM_WORLD) 43 | os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus) 44 | 45 | def get_local_rank_size(comm): 46 | """ 47 | Returns the rank of each process on its machine 48 | The processes on a given machine will be assigned ranks 49 | 0, 1, 2, ..., N-1, 50 | where N is the number of processes on this machine. 51 | 52 | Useful if you want to assign one gpu per machine 53 | """ 54 | this_node = platform.node() 55 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node)) 56 | node2rankssofar = defaultdict(int) 57 | local_rank = None 58 | for (rank, node) in ranks_nodes: 59 | if rank == comm.Get_rank(): 60 | local_rank = node2rankssofar[node] 61 | node2rankssofar[node] += 1 62 | assert local_rank is not None 63 | return local_rank, node2rankssofar[this_node] 64 | 65 | def share_file(comm, path): 66 | """ 67 | Copies the file from rank 0 to all other ranks 68 | Puts it in the same place on all machines 69 | """ 70 | localrank, _ = get_local_rank_size(comm) 71 | if comm.Get_rank() == 0: 72 | with open(path, 'rb') as fh: 73 | data = fh.read() 74 | comm.bcast(data) 75 | else: 76 | data = comm.bcast(None) 77 | if localrank == 0: 78 | os.makedirs(os.path.dirname(path), exist_ok=True) 79 | with open(path, 'wb') as fh: 80 | fh.write(data) 81 | comm.Barrier() 82 | 83 | def dict_gather(comm, d, op='mean', assert_all_have_data=True): 84 | if comm is None: return d 85 | alldicts = comm.allgather(d) 86 | size = comm.size 87 | k2li = defaultdict(list) 88 | for d in alldicts: 89 | for (k,v) in d.items(): 90 | k2li[k].append(v) 91 | result = {} 92 | for (k,li) in k2li.items(): 93 | if assert_all_have_data: 94 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k) 95 | if op=='mean': 96 | result[k] = np.mean(li, axis=0) 97 | elif op=='sum': 98 | result[k] = np.sum(li, axis=0) 99 | else: 100 | assert 0, op 101 | return result 102 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/custom_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import itertools 3 | import numpy as 
np 4 | import tensorflow as tf 5 | import tensorflow.contrib.layers as layers 6 | 7 | import baselines.common.tf_util as U 8 | 9 | from baselines import logger 10 | from baselines import deepq 11 | from baselines.deepq.replay_buffer import ReplayBuffer 12 | from baselines.deepq.utils import ObservationInput 13 | from baselines.common.schedules import LinearSchedule 14 | 15 | 16 | def model(inpt, num_actions, scope, reuse=False): 17 | """This model takes as input an observation and returns values of all actions.""" 18 | with tf.variable_scope(scope, reuse=reuse): 19 | out = inpt 20 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) 21 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 22 | return out 23 | 24 | 25 | if __name__ == '__main__': 26 | with U.make_session(8): 27 | # Create the environment 28 | env = gym.make("CartPole-v0") 29 | # Create all the functions necessary to train the model 30 | act, train, update_target, debug = deepq.build_train( 31 | make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name), 32 | q_func=model, 33 | num_actions=env.action_space.n, 34 | optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), 35 | ) 36 | # Create the replay buffer 37 | replay_buffer = ReplayBuffer(50000) 38 | # Create the schedule for exploration starting from 1 (every action is random) down to 39 | # 0.02 (98% of actions are selected according to values predicted by the model). 40 | exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) 41 | 42 | # Initialize the parameters and copy them to the target network. 43 | U.initialize() 44 | update_target() 45 | 46 | episode_rewards = [0.0] 47 | obs = env.reset() 48 | for t in itertools.count(): 49 | # Take action and update exploration to the newest value 50 | action = act(obs[None], update_eps=exploration.value(t))[0] 51 | new_obs, rew, done, _ = env.step(action) 52 | # Store transition in the replay buffer. 53 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 54 | obs = new_obs 55 | 56 | episode_rewards[-1] += rew 57 | if done: 58 | obs = env.reset() 59 | episode_rewards.append(0) 60 | 61 | is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 62 | if is_solved: 63 | # Show off the result 64 | env.render() 65 | else: 66 | # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 67 | if t > 1000: 68 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) 69 | train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) 70 | # Update target network periodically. 
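# update_target() below copies the online network's weights into the target network used for the
# max_a' Q_target(s', a') term of the Bellman backup; refreshing it only every 1000 steps keeps
# the regression target stable between updates.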
71 | if t % 1000 == 0: 72 | update_target() 73 | 74 | if done and len(episode_rewards) % 10 == 0: 75 | logger.record_tabular("steps", t) 76 | logger.record_tabular("episodes", len(episode_rewards)) 77 | logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) 78 | logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) 79 | logger.dump_tabular() 80 | -------------------------------------------------------------------------------- /baselines/acktr/kfac_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): 4 | assert reduce_dim is not None 5 | 6 | # weird batch matmul 7 | if len(a.get_shape()) == 2 and len(b.get_shape()) > 2: 8 | # reshape reduce_dim to the left most dim in b 9 | b_shape = b.get_shape() 10 | if reduce_dim != 0: 11 | b_dims = list(range(len(b_shape))) 12 | b_dims.remove(reduce_dim) 13 | b_dims.insert(0, reduce_dim) 14 | b = tf.transpose(b, b_dims) 15 | b_t_shape = b.get_shape() 16 | b = tf.reshape(b, [int(b_shape[reduce_dim]), -1]) 17 | result = tf.matmul(a, b, transpose_a=transpose_a, 18 | transpose_b=transpose_b) 19 | result = tf.reshape(result, b_t_shape) 20 | if reduce_dim != 0: 21 | b_dims = list(range(len(b_shape))) 22 | b_dims.remove(0) 23 | b_dims.insert(reduce_dim, 0) 24 | result = tf.transpose(result, b_dims) 25 | return result 26 | 27 | elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2: 28 | # reshape reduce_dim to the right most dim in a 29 | a_shape = a.get_shape() 30 | outter_dim = len(a_shape) - 1 31 | reduce_dim = len(a_shape) - reduce_dim - 1 32 | if reduce_dim != outter_dim: 33 | a_dims = list(range(len(a_shape))) 34 | a_dims.remove(reduce_dim) 35 | a_dims.insert(outter_dim, reduce_dim) 36 | a = tf.transpose(a, a_dims) 37 | a_t_shape = a.get_shape() 38 | a = tf.reshape(a, [-1, int(a_shape[reduce_dim])]) 39 | result = tf.matmul(a, b, transpose_a=transpose_a, 40 | transpose_b=transpose_b) 41 | result = tf.reshape(result, a_t_shape) 42 | if reduce_dim != outter_dim: 43 | a_dims = list(range(len(a_shape))) 44 | a_dims.remove(outter_dim) 45 | a_dims.insert(reduce_dim, outter_dim) 46 | result = tf.transpose(result, a_dims) 47 | return result 48 | 49 | elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2: 50 | return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) 51 | 52 | assert False, 'something went wrong' 53 | 54 | 55 | def clipoutNeg(vec, threshold=1e-6): 56 | mask = tf.cast(vec > threshold, tf.float32) 57 | return mask * vec 58 | 59 | 60 | def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False): 61 | eigen_min = tf.reduce_min(input_mat) 62 | eigen_max = tf.reduce_max(input_mat) 63 | eigen_ratio = eigen_max / eigen_min 64 | input_mat_clipped = clipoutNeg(input_mat, threshold) 65 | 66 | if debug: 67 | input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print( 68 | input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio])) 69 | 70 | return input_mat_clipped 71 | 72 | 73 | def factorReshape(Q, e, grad, facIndx=0, ftype='act'): 74 | grad_shape = grad.get_shape() 75 | if ftype == 'act': 76 | assert e.get_shape()[0] == grad_shape[facIndx] 77 | expanded_shape = [1, ] * len(grad_shape) 78 | expanded_shape[facIndx] = -1 79 | e = tf.reshape(e, 
expanded_shape) 80 | if ftype == 'grad': 81 | assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1] 82 | expanded_shape = [1, ] * len(grad_shape) 83 | expanded_shape[len(grad_shape) - facIndx - 1] = -1 84 | e = tf.reshape(e, expanded_shape) 85 | 86 | return Q, e 87 | -------------------------------------------------------------------------------- /baselines/acktr/policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from baselines.acktr.utils import dense, kl_div 4 | import baselines.common.tf_util as U 5 | 6 | class GaussianMlpPolicy(object): 7 | def __init__(self, ob_dim, ac_dim): 8 | # Here we'll construct a bunch of expressions, which will be used in two places: 9 | # (1) When sampling actions 10 | # (2) When computing loss functions, for the policy update 11 | # Variables specific to (1) have the word "sampled" in them, 12 | # whereas variables specific to (2) have the word "old" in them 13 | ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations 14 | oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions 15 | oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions 16 | adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate 17 | wd_dict = {} 18 | h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) 19 | h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) 20 | mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output 21 | self.wd_dict = wd_dict 22 | self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs 23 | logstd_1a = tf.expand_dims(logstd_1a, 0) 24 | std_1a = tf.exp(logstd_1a) 25 | std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) 26 | ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1) 27 | sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform. 
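# The next two lines evaluate the log-density of a diagonal Gaussian,
#   log N(a | mu, sigma) = -sum_i log(sigma_i) - (d/2) * log(2*pi) - 0.5 * sum_i ((a_i - mu_i) / sigma_i)^2,
# once for the freshly sampled action (logprobsampled_n) and once for the previously taken
# actions (logprob_n), with mu = ac_dist[:, :ac_dim] and sigma = ac_dist[:, ac_dim:].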
28 | logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action 29 | logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) 30 | kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) 31 | #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n 32 | surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient 33 | surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy 34 | self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob 35 | #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy 36 | self.compute_kl = U.function([ob_no, oldac_dist], kl) 37 | self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss 38 | U.initialize() # Initialize uninitialized TF variables 39 | 40 | def act(self, ob): 41 | ac, ac_dist, logp = self._act(ob[None]) 42 | return ac[0], ac_dist[0], logp[0] 43 | -------------------------------------------------------------------------------- /baselines/common/vec_env/test_vec_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for asynchronous vectorized environments. 3 | """ 4 | 5 | import gym 6 | import numpy as np 7 | import pytest 8 | from .dummy_vec_env import DummyVecEnv 9 | from .shmem_vec_env import ShmemVecEnv 10 | from .subproc_vec_env import SubprocVecEnv 11 | 12 | 13 | def assert_envs_equal(env1, env2, num_steps): 14 | """ 15 | Compare two environments over num_steps steps and make sure 16 | that the observations produced by each are the same when given 17 | the same actions. 
18 | """ 19 | assert env1.num_envs == env2.num_envs 20 | assert env1.action_space.shape == env2.action_space.shape 21 | assert env1.action_space.dtype == env2.action_space.dtype 22 | joint_shape = (env1.num_envs,) + env1.action_space.shape 23 | 24 | try: 25 | obs1, obs2 = env1.reset(), env2.reset() 26 | assert np.array(obs1).shape == np.array(obs2).shape 27 | assert np.array(obs1).shape == joint_shape 28 | assert np.allclose(obs1, obs2) 29 | np.random.seed(1337) 30 | for _ in range(num_steps): 31 | actions = np.array(np.random.randint(0, 0x100, size=joint_shape), 32 | dtype=env1.action_space.dtype) 33 | for env in [env1, env2]: 34 | env.step_async(actions) 35 | outs1 = env1.step_wait() 36 | outs2 = env2.step_wait() 37 | for out1, out2 in zip(outs1[:3], outs2[:3]): 38 | assert np.array(out1).shape == np.array(out2).shape 39 | assert np.allclose(out1, out2) 40 | assert list(outs1[3]) == list(outs2[3]) 41 | finally: 42 | env1.close() 43 | env2.close() 44 | 45 | 46 | @pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv)) 47 | @pytest.mark.parametrize('dtype', ('uint8', 'float32')) 48 | def test_vec_env(klass, dtype): # pylint: disable=R0914 49 | """ 50 | Test that a vectorized environment is equivalent to 51 | DummyVecEnv, since DummyVecEnv is less likely to be 52 | error prone. 53 | """ 54 | num_envs = 3 55 | num_steps = 100 56 | shape = (3, 8) 57 | 58 | def make_fn(seed): 59 | """ 60 | Get an environment constructor with a seed. 61 | """ 62 | return lambda: SimpleEnv(seed, shape, dtype) 63 | fns = [make_fn(i) for i in range(num_envs)] 64 | env1 = DummyVecEnv(fns) 65 | env2 = klass(fns) 66 | assert_envs_equal(env1, env2, num_steps=num_steps) 67 | 68 | 69 | class SimpleEnv(gym.Env): 70 | """ 71 | An environment with a pre-determined observation space 72 | and RNG seed. 
73 | """ 74 | 75 | def __init__(self, seed, shape, dtype): 76 | np.random.seed(seed) 77 | self._dtype = dtype 78 | self._start_obs = np.array(np.random.randint(0, 0x100, size=shape), 79 | dtype=dtype) 80 | self._max_steps = seed + 1 81 | self._cur_obs = None 82 | self._cur_step = 0 83 | # this is 0xFF instead of 0x100 because the Box space includes 84 | # the high end, while randint does not 85 | self.action_space = gym.spaces.Box(low=0, high=0xFF, shape=shape, dtype=dtype) 86 | self.observation_space = self.action_space 87 | 88 | def step(self, action): 89 | self._cur_obs += np.array(action, dtype=self._dtype) 90 | self._cur_step += 1 91 | done = self._cur_step >= self._max_steps 92 | reward = self._cur_step / self._max_steps 93 | return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)} 94 | 95 | def reset(self): 96 | self._cur_obs = self._start_obs 97 | self._cur_step = 0 98 | return self._cur_obs 99 | 100 | def render(self, mode=None): 101 | raise NotImplementedError 102 | -------------------------------------------------------------------------------- /baselines/her/experiment/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import json 5 | import seaborn as sns; sns.set() 6 | import glob2 7 | import argparse 8 | 9 | 10 | def smooth_reward_curve(x, y): 11 | halfwidth = int(np.ceil(len(x) / 60)) # Halfwidth of our smoothing convolution 12 | k = halfwidth 13 | xsmoo = x 14 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1), 15 | mode='same') 16 | return xsmoo, ysmoo 17 | 18 | 19 | def load_results(file): 20 | if not os.path.exists(file): 21 | return None 22 | with open(file, 'r') as f: 23 | lines = [line for line in f] 24 | if len(lines) < 2: 25 | return None 26 | keys = [name.strip() for name in lines[0].split(',')] 27 | data = np.genfromtxt(file, delimiter=',', skip_header=1, filling_values=0.) 28 | if data.ndim == 1: 29 | data = data.reshape(1, -1) 30 | assert data.ndim == 2 31 | assert data.shape[-1] == len(keys) 32 | result = {} 33 | for idx, key in enumerate(keys): 34 | result[key] = data[:, idx] 35 | return result 36 | 37 | 38 | def pad(xs, value=np.nan): 39 | maxlen = np.max([len(x) for x in xs]) 40 | 41 | padded_xs = [] 42 | for x in xs: 43 | if x.shape[0] >= maxlen: 44 | padded_xs.append(x) 45 | 46 | padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value 47 | x_padded = np.concatenate([x, padding], axis=0) 48 | assert x_padded.shape[1:] == x.shape[1:] 49 | assert x_padded.shape[0] == maxlen 50 | padded_xs.append(x_padded) 51 | return np.array(padded_xs) 52 | 53 | 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('dir', type=str) 56 | parser.add_argument('--smooth', type=int, default=1) 57 | args = parser.parse_args() 58 | 59 | # Load all data. 
60 | data = {} 61 | paths = [os.path.abspath(os.path.join(path, '..')) for path in glob2.glob(os.path.join(args.dir, '**', 'progress.csv'))] 62 | for curr_path in paths: 63 | if not os.path.isdir(curr_path): 64 | continue 65 | results = load_results(os.path.join(curr_path, 'progress.csv')) 66 | if not results: 67 | print('skipping {}'.format(curr_path)) 68 | continue 69 | print('loading {} ({})'.format(curr_path, len(results['epoch']))) 70 | with open(os.path.join(curr_path, 'params.json'), 'r') as f: 71 | params = json.load(f) 72 | 73 | success_rate = np.array(results['test/success_rate']) 74 | epoch = np.array(results['epoch']) + 1 75 | env_id = params['env_name'] 76 | replay_strategy = params['replay_strategy'] 77 | 78 | if replay_strategy == 'future': 79 | config = 'her' 80 | else: 81 | config = 'ddpg' 82 | if 'Dense' in env_id: 83 | config += '-dense' 84 | else: 85 | config += '-sparse' 86 | env_id = env_id.replace('Dense', '') 87 | 88 | # Process and smooth data. 89 | assert success_rate.shape == epoch.shape 90 | x = epoch 91 | y = success_rate 92 | if args.smooth: 93 | x, y = smooth_reward_curve(epoch, success_rate) 94 | assert x.shape == y.shape 95 | 96 | if env_id not in data: 97 | data[env_id] = {} 98 | if config not in data[env_id]: 99 | data[env_id][config] = [] 100 | data[env_id][config].append((x, y)) 101 | 102 | # Plot data. 103 | for env_id in sorted(data.keys()): 104 | print('exporting {}'.format(env_id)) 105 | plt.clf() 106 | 107 | for config in sorted(data[env_id].keys()): 108 | xs, ys = zip(*data[env_id][config]) 109 | xs, ys = pad(xs), pad(ys) 110 | assert xs.shape == ys.shape 111 | 112 | plt.plot(xs[0], np.nanmedian(ys, axis=0), label=config) 113 | plt.fill_between(xs[0], np.nanpercentile(ys, 25, axis=0), np.nanpercentile(ys, 75, axis=0), alpha=0.25) 114 | plt.title(env_id) 115 | plt.xlabel('Epoch') 116 | plt.ylabel('Median Success Rate') 117 | plt.legend() 118 | plt.savefig(os.path.join(args.dir, 'fig_{}.png'.format(env_id))) 119 | -------------------------------------------------------------------------------- /baselines/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import tensorflow as tf, baselines.common.tf_util as U, numpy as np 3 | 4 | class RunningMeanStd(object): 5 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 6 | def __init__(self, epsilon=1e-2, shape=()): 7 | 8 | self._sum = tf.get_variable( 9 | dtype=tf.float64, 10 | shape=shape, 11 | initializer=tf.constant_initializer(0.0), 12 | name="runningsum", trainable=False) 13 | self._sumsq = tf.get_variable( 14 | dtype=tf.float64, 15 | shape=shape, 16 | initializer=tf.constant_initializer(epsilon), 17 | name="runningsumsq", trainable=False) 18 | self._count = tf.get_variable( 19 | dtype=tf.float64, 20 | shape=(), 21 | initializer=tf.constant_initializer(epsilon), 22 | name="count", trainable=False) 23 | self.shape = shape 24 | 25 | self.mean = tf.to_float(self._sum / self._count) 26 | self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) 27 | 28 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 29 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 30 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 31 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [], 32 | updates=[tf.assign_add(self._sum, newsum), 33 | tf.assign_add(self._sumsq, newsumsq), 
34 | tf.assign_add(self._count, newcount)]) 35 | 36 | 37 | def update(self, x): 38 | x = x.astype('float64') 39 | n = int(np.prod(self.shape)) 40 | totalvec = np.zeros(n*2+1, 'float64') 41 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) 42 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 43 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) 44 | 45 | @U.in_session 46 | def test_runningmeanstd(): 47 | for (x1, x2, x3) in [ 48 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 49 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 50 | ]: 51 | 52 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 53 | U.initialize() 54 | 55 | x = np.concatenate([x1, x2, x3], axis=0) 56 | ms1 = [x.mean(axis=0), x.std(axis=0)] 57 | rms.update(x1) 58 | rms.update(x2) 59 | rms.update(x3) 60 | ms2 = [rms.mean.eval(), rms.std.eval()] 61 | 62 | assert np.allclose(ms1, ms2) 63 | 64 | @U.in_session 65 | def test_dist(): 66 | np.random.seed(0) 67 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) 68 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) 69 | 70 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) 71 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) 72 | 73 | comm = MPI.COMM_WORLD 74 | assert comm.Get_size()==2 75 | if comm.Get_rank()==0: 76 | x1,x2,x3 = p1,p2,p3 77 | elif comm.Get_rank()==1: 78 | x1,x2,x3 = q1,q2,q3 79 | else: 80 | assert False 81 | 82 | rms = RunningMeanStd(epsilon=0.0, shape=(1,)) 83 | U.initialize() 84 | 85 | rms.update(x1) 86 | rms.update(x2) 87 | rms.update(x3) 88 | 89 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) 90 | 91 | def checkallclose(x,y): 92 | print(x,y) 93 | return np.allclose(x,y) 94 | 95 | assert checkallclose( 96 | bigvec.mean(axis=0), 97 | rms.mean.eval(), 98 | ) 99 | assert checkallclose( 100 | bigvec.std(axis=0), 101 | rms.std.eval(), 102 | ) 103 | 104 | 105 | if __name__ == "__main__": 106 | # Run with mpirun -np 2 python 107 | test_dist() 108 | -------------------------------------------------------------------------------- /baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 
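        For example, PiecewiseSchedule([(0, 1.0), (100, 0.1)]) with the default
        linear interpolation gives value(0) == 1.0, value(50) == 0.55 and, for
        t >= 100, returns `outside_value` (which must then be provided).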
41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /baselines/her/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | 5 | 6 | class ReplayBuffer: 7 | def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions): 8 | """Creates a replay buffer. 
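        Episodes are stored as complete T-step rollouts, so the buffer holds at
        most size_in_transitions // T episodes and overwrites old episodes at
        random once it is full.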
9 | 10 | Args: 11 | buffer_shapes (dict of ints): the shape for all buffers that are used in the replay 12 | buffer 13 | size_in_transitions (int): the size of the buffer, measured in transitions 14 | T (int): the time horizon for episodes 15 | sample_transitions (function): a function that samples from the replay buffer 16 | """ 17 | self.buffer_shapes = buffer_shapes 18 | self.size = size_in_transitions // T 19 | self.T = T 20 | self.sample_transitions = sample_transitions 21 | 22 | # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} 23 | self.buffers = {key: np.empty([self.size, *shape]) 24 | for key, shape in buffer_shapes.items()} 25 | 26 | # memory management 27 | self.current_size = 0 28 | self.n_transitions_stored = 0 29 | 30 | self.lock = threading.Lock() 31 | 32 | @property 33 | def full(self): 34 | with self.lock: 35 | return self.current_size == self.size 36 | 37 | def sample(self, batch_size): 38 | """Returns a dict {key: array(batch_size x shapes[key])} 39 | """ 40 | buffers = {} 41 | 42 | with self.lock: 43 | assert self.current_size > 0 44 | for key in self.buffers.keys(): 45 | buffers[key] = self.buffers[key][:self.current_size] 46 | 47 | buffers['o_2'] = buffers['o'][:, 1:, :] 48 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 49 | 50 | transitions = self.sample_transitions(buffers, batch_size) 51 | 52 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 53 | assert key in transitions, "key %s missing from transitions" % key 54 | 55 | return transitions 56 | 57 | def store_episode(self, episode_batch): 58 | """episode_batch: array(batch_size x (T or T+1) x dim_key) 59 | """ 60 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 61 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 62 | batch_size = batch_sizes[0] 63 | 64 | with self.lock: 65 | idxs = self._get_storage_idx(batch_size) 66 | 67 | # load inputs into buffers 68 | for key in self.buffers.keys(): 69 | self.buffers[key][idxs] = episode_batch[key] 70 | 71 | self.n_transitions_stored += batch_size * self.T 72 | 73 | def get_current_episode_size(self): 74 | with self.lock: 75 | return self.current_size 76 | 77 | def get_current_size(self): 78 | with self.lock: 79 | return self.current_size * self.T 80 | 81 | def get_transitions_stored(self): 82 | with self.lock: 83 | return self.n_transitions_stored 84 | 85 | def clear_buffer(self): 86 | with self.lock: 87 | self.current_size = 0 88 | 89 | def _get_storage_idx(self, inc=None): 90 | inc = inc or 1 # size increment 91 | assert inc <= self.size, "Batch committed to replay is too large!" 92 | # go consecutively until you hit the end, and then go randomly. 
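        # e.g. with size=10, current_size=8 and inc=4: the first 2 episodes fill
        # the free slots 8 and 9, and the remaining 2 overwrite randomly chosen
        # slots among the 8 already stored episodes.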
93 | if self.current_size+inc <= self.size: 94 | idx = np.arange(self.current_size, self.current_size+inc) 95 | elif self.current_size < self.size: 96 | overflow = inc - (self.size - self.current_size) 97 | idx_a = np.arange(self.current_size, self.size) 98 | idx_b = np.random.randint(0, self.current_size, overflow) 99 | idx = np.concatenate([idx_a, idx_b]) 100 | else: 101 | idx = np.random.randint(0, self.size, inc) 102 | 103 | # update replay size 104 | self.current_size = min(self.size, self.current_size+inc) 105 | 106 | if inc == 1: 107 | idx = idx[0] 108 | return idx 109 | -------------------------------------------------------------------------------- /baselines/her/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import importlib 5 | import inspect 6 | import functools 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | from baselines.common import tf_util as U 12 | 13 | 14 | def store_args(method): 15 | """Stores provided method args as instance attributes. 16 | """ 17 | argspec = inspect.getfullargspec(method) 18 | defaults = {} 19 | if argspec.defaults is not None: 20 | defaults = dict( 21 | zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) 22 | if argspec.kwonlydefaults is not None: 23 | defaults.update(argspec.kwonlydefaults) 24 | arg_names = argspec.args[1:] 25 | 26 | @functools.wraps(method) 27 | def wrapper(*positional_args, **keyword_args): 28 | self = positional_args[0] 29 | # Get default arg values 30 | args = defaults.copy() 31 | # Add provided arg values 32 | for name, value in zip(arg_names, positional_args[1:]): 33 | args[name] = value 34 | args.update(keyword_args) 35 | self.__dict__.update(args) 36 | return method(*positional_args, **keyword_args) 37 | 38 | return wrapper 39 | 40 | 41 | def import_function(spec): 42 | """Import a function identified by a string like "pkg.module:fn_name". 43 | """ 44 | mod_name, fn_name = spec.split(':') 45 | module = importlib.import_module(mod_name) 46 | fn = getattr(module, fn_name) 47 | return fn 48 | 49 | 50 | def flatten_grads(var_list, grads): 51 | """Flattens a variables and their gradients. 
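    Each gradient is reshaped to the flat size of its variable and the results
    are concatenated into a single 1-D tensor.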
52 | """ 53 | return tf.concat([tf.reshape(grad, [U.numel(v)]) 54 | for (v, grad) in zip(var_list, grads)], 0) 55 | 56 | 57 | def nn(input, layers_sizes, reuse=None, flatten=False, name=""): 58 | """Creates a simple neural network 59 | """ 60 | for i, size in enumerate(layers_sizes): 61 | activation = tf.nn.relu if i < len(layers_sizes) - 1 else None 62 | input = tf.layers.dense(inputs=input, 63 | units=size, 64 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 65 | reuse=reuse, 66 | name=name + '_' + str(i)) 67 | if activation: 68 | input = activation(input) 69 | if flatten: 70 | assert layers_sizes[-1] == 1 71 | input = tf.reshape(input, [-1]) 72 | return input 73 | 74 | 75 | def install_mpi_excepthook(): 76 | import sys 77 | from mpi4py import MPI 78 | old_hook = sys.excepthook 79 | 80 | def new_hook(a, b, c): 81 | old_hook(a, b, c) 82 | sys.stdout.flush() 83 | sys.stderr.flush() 84 | MPI.COMM_WORLD.Abort() 85 | sys.excepthook = new_hook 86 | 87 | 88 | def mpi_fork(n, extra_mpi_args=[]): 89 | """Re-launches the current script with workers 90 | Returns "parent" for original parent, "child" for MPI children 91 | """ 92 | if n <= 1: 93 | return "child" 94 | if os.getenv("IN_MPI") is None: 95 | env = os.environ.copy() 96 | env.update( 97 | MKL_NUM_THREADS="1", 98 | OMP_NUM_THREADS="1", 99 | IN_MPI="1" 100 | ) 101 | # "-bind-to core" is crucial for good performance 102 | args = ["mpirun", "-np", str(n)] + \ 103 | extra_mpi_args + \ 104 | [sys.executable] 105 | 106 | args += sys.argv 107 | subprocess.check_call(args, env=env) 108 | return "parent" 109 | else: 110 | install_mpi_excepthook() 111 | return "child" 112 | 113 | 114 | def convert_episode_to_batch_major(episode): 115 | """Converts an episode to have the batch dimension in the major (first) 116 | dimension. 117 | """ 118 | episode_batch = {} 119 | for key in episode.keys(): 120 | val = np.array(episode[key]).copy() 121 | # make inputs batch-major instead of time-major 122 | episode_batch[key] = val.swapaxes(0, 1) 123 | 124 | return episode_batch 125 | 126 | 127 | def transitions_in_episode_batch(episode_batch): 128 | """Number of transitions in a given episode batch. 129 | """ 130 | shape = episode_batch['u'].shape 131 | return shape[0] * shape[1] 132 | 133 | 134 | def reshape_for_broadcasting(source, target): 135 | """Reshapes a tensor (source) to have the correct shape and dtype of the target 136 | before broadcasting it with MPI. 
137 | """ 138 | dim = len(target.get_shape()) 139 | shape = ([1] * (dim - 1)) + [-1] 140 | return tf.reshape(tf.cast(source, target.dtype), shape) 141 | -------------------------------------------------------------------------------- /baselines/gail/dataset/mujoco_dset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Data structure of the input .npz: 3 | the data is save in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs' 4 | the values of each item is a list storing the expert trajectory sequentially 5 | a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and get reward data['rews'][t] 6 | ''' 7 | 8 | from baselines import logger 9 | import numpy as np 10 | 11 | 12 | class Dset(object): 13 | def __init__(self, inputs, labels, randomize): 14 | self.inputs = inputs 15 | self.labels = labels 16 | assert len(self.inputs) == len(self.labels) 17 | self.randomize = randomize 18 | self.num_pairs = len(inputs) 19 | self.init_pointer() 20 | 21 | def init_pointer(self): 22 | self.pointer = 0 23 | if self.randomize: 24 | idx = np.arange(self.num_pairs) 25 | np.random.shuffle(idx) 26 | self.inputs = self.inputs[idx, :] 27 | self.labels = self.labels[idx, :] 28 | 29 | def get_next_batch(self, batch_size): 30 | # if batch_size is negative -> return all 31 | if batch_size < 0: 32 | return self.inputs, self.labels 33 | if self.pointer + batch_size >= self.num_pairs: 34 | self.init_pointer() 35 | end = self.pointer + batch_size 36 | inputs = self.inputs[self.pointer:end, :] 37 | labels = self.labels[self.pointer:end, :] 38 | self.pointer = end 39 | return inputs, labels 40 | 41 | 42 | class Mujoco_Dset(object): 43 | def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True): 44 | traj_data = np.load(expert_path) 45 | if traj_limitation < 0: 46 | traj_limitation = len(traj_data['obs']) 47 | obs = traj_data['obs'][:traj_limitation] 48 | acs = traj_data['acs'][:traj_limitation] 49 | 50 | # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length 51 | # and S is the environment observation/action space. 
52 | # Flatten to (N * L, prod(S)) 53 | self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])]) 54 | self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])]) 55 | 56 | self.rets = traj_data['ep_rets'][:traj_limitation] 57 | self.avg_ret = sum(self.rets)/len(self.rets) 58 | self.std_ret = np.std(np.array(self.rets)) 59 | if len(self.acs) > 2: 60 | self.acs = np.squeeze(self.acs) 61 | assert len(self.obs) == len(self.acs) 62 | self.num_traj = min(traj_limitation, len(traj_data['obs'])) 63 | self.num_transition = len(self.obs) 64 | self.randomize = randomize 65 | self.dset = Dset(self.obs, self.acs, self.randomize) 66 | # for behavior cloning 67 | self.train_set = Dset(self.obs[:int(self.num_transition*train_fraction), :], 68 | self.acs[:int(self.num_transition*train_fraction), :], 69 | self.randomize) 70 | self.val_set = Dset(self.obs[int(self.num_transition*train_fraction):, :], 71 | self.acs[int(self.num_transition*train_fraction):, :], 72 | self.randomize) 73 | self.log_info() 74 | 75 | def log_info(self): 76 | logger.log("Total trajectorues: %d" % self.num_traj) 77 | logger.log("Total transitions: %d" % self.num_transition) 78 | logger.log("Average returns: %f" % self.avg_ret) 79 | logger.log("Std for returns: %f" % self.std_ret) 80 | 81 | def get_next_batch(self, batch_size, split=None): 82 | if split is None: 83 | return self.dset.get_next_batch(batch_size) 84 | elif split == 'train': 85 | return self.train_set.get_next_batch(batch_size) 86 | elif split == 'val': 87 | return self.val_set.get_next_batch(batch_size) 88 | else: 89 | raise NotImplementedError 90 | 91 | def plot(self): 92 | import matplotlib.pyplot as plt 93 | plt.hist(self.rets) 94 | plt.savefig("histogram_rets.png") 95 | plt.close() 96 | 97 | 98 | def test(expert_path, traj_limitation, plot): 99 | dset = Mujoco_Dset(expert_path, traj_limitation=traj_limitation) 100 | if plot: 101 | dset.plot() 102 | 103 | if __name__ == '__main__': 104 | import argparse 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument("--expert_path", type=str, default="../data/deterministic.trpo.Hopper.0.00.npz") 107 | parser.add_argument("--traj_limitation", type=int, default=None) 108 | parser.add_argument("--plot", type=bool, default=False) 109 | args = parser.parse_args() 110 | test(args.expert_path, args.traj_limitation, args.plot) 111 | -------------------------------------------------------------------------------- /baselines/acer/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Buffer(object): 4 | # gets obs, actions, rewards, mu's, (states, masks), dones 5 | def __init__(self, env, nsteps, nstack, size=50000): 6 | self.nenv = env.num_envs 7 | self.nsteps = nsteps 8 | self.nh, self.nw, self.nc = env.observation_space.shape 9 | self.nstack = nstack 10 | self.nbatch = self.nenv * self.nsteps 11 | self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames 12 | 13 | # Memory 14 | self.enc_obs = None 15 | self.actions = None 16 | self.rewards = None 17 | self.mus = None 18 | self.dones = None 19 | self.masks = None 20 | 21 | # Size indexes 22 | self.next_idx = 0 23 | self.num_in_buffer = 0 24 | 25 | def has_atleast(self, frames): 26 | # Frames per env, so total (nenv * frames) Frames needed 27 | # Each buffer loc has nenv * nsteps frames 28 | return self.num_in_buffer >= (frames // self.nsteps) 29 | 30 | def can_sample(self): 31 | return self.num_in_buffer > 0 32 | 33 | # Generate stacked frames 34 
| def decode(self, enc_obs, dones): 35 | # enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc] 36 | # dones has shape [nenvs, nsteps, nh, nw, nc] 37 | # returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc] 38 | nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc 39 | y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32) 40 | obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8) 41 | x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1, 42 | 0) # [nsteps + nstack, nenv, nh, nw, nc] 43 | y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep 44 | y[:3] = 1.0 45 | # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1]) 46 | for i in range(nstack): 47 | obs[-(i + 1), i:] = x 48 | # obs[:,i:,:,:,-(i+1),:] = x 49 | x = x[:-1] * y 50 | y = y[1:] 51 | return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc]) 52 | 53 | def put(self, enc_obs, actions, rewards, mus, dones, masks): 54 | # enc_obs [nenv, (nsteps + nstack), nh, nw, nc] 55 | # actions, rewards, dones [nenv, nsteps] 56 | # mus [nenv, nsteps, nact] 57 | 58 | if self.enc_obs is None: 59 | self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8) 60 | self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32) 61 | self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32) 62 | self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32) 63 | self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool) 64 | self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool) 65 | 66 | self.enc_obs[self.next_idx] = enc_obs 67 | self.actions[self.next_idx] = actions 68 | self.rewards[self.next_idx] = rewards 69 | self.mus[self.next_idx] = mus 70 | self.dones[self.next_idx] = dones 71 | self.masks[self.next_idx] = masks 72 | 73 | self.next_idx = (self.next_idx + 1) % self.size 74 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 75 | 76 | def take(self, x, idx, envx): 77 | nenv = self.nenv 78 | out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype) 79 | for i in range(nenv): 80 | out[i] = x[idx[i], envx[i]] 81 | return out 82 | 83 | def get(self): 84 | # returns 85 | # obs [nenv, (nsteps + 1), nh, nw, nstack*nc] 86 | # actions, rewards, dones [nenv, nsteps] 87 | # mus [nenv, nsteps, nact] 88 | nenv = self.nenv 89 | assert self.can_sample() 90 | 91 | # Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env. 
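        # e.g. with nenv=4 and num_in_buffer=50, idx might be [12, 3, 47, 30] and
        # envx is [0, 1, 2, 3], so env i replays the trajectory stored at buffer
        # slot idx[i] for that same env.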
92 | idx = np.random.randint(0, self.num_in_buffer, nenv) 93 | envx = np.arange(nenv) 94 | 95 | take = lambda x: self.take(x, idx, envx) # for i in range(nenv)], axis = 0) 96 | dones = take(self.dones) 97 | enc_obs = take(self.enc_obs) 98 | obs = self.decode(enc_obs, dones) 99 | actions = take(self.actions) 100 | rewards = take(self.rewards) 101 | mus = take(self.mus) 102 | masks = take(self.masks) 103 | return obs, actions, rewards, mus, dones, masks 104 | -------------------------------------------------------------------------------- /baselines/gail/adversary.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reference: https://github.com/openai/imitation 3 | I follow the architecture from the official repository 4 | ''' 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | from baselines.common.mpi_running_mean_std import RunningMeanStd 9 | from baselines.common import tf_util as U 10 | 11 | def logsigmoid(a): 12 | '''Equivalent to tf.log(tf.sigmoid(a))''' 13 | return -tf.nn.softplus(-a) 14 | 15 | """ Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51""" 16 | def logit_bernoulli_entropy(logits): 17 | ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits) 18 | return ent 19 | 20 | class TransitionClassifier(object): 21 | def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"): 22 | self.scope = scope 23 | self.observation_shape = env.observation_space.shape 24 | self.actions_shape = env.action_space.shape 25 | self.input_shape = tuple([o+a for o, a in zip(self.observation_shape, self.actions_shape)]) 26 | self.num_actions = env.action_space.shape[0] 27 | self.hidden_size = hidden_size 28 | self.build_ph() 29 | # Build grpah 30 | generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False) 31 | expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True) 32 | # Build accuracy 33 | generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5)) 34 | expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5)) 35 | # Build regression loss 36 | # let x = logits, z = targets. 
37 | # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) 38 | generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits)) 39 | generator_loss = tf.reduce_mean(generator_loss) 40 | expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits)) 41 | expert_loss = tf.reduce_mean(expert_loss) 42 | # Build entropy loss 43 | logits = tf.concat([generator_logits, expert_logits], 0) 44 | entropy = tf.reduce_mean(logit_bernoulli_entropy(logits)) 45 | entropy_loss = -entcoeff*entropy 46 | # Loss + Accuracy terms 47 | self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc] 48 | self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"] 49 | self.total_loss = generator_loss + expert_loss + entropy_loss 50 | # Build Reward for policy 51 | self.reward_op = -tf.log(1-tf.nn.sigmoid(generator_logits)+1e-8) 52 | var_list = self.get_trainable_variables() 53 | self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph], 54 | self.losses + [U.flatgrad(self.total_loss, var_list)]) 55 | 56 | def build_ph(self): 57 | self.generator_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="observations_ph") 58 | self.generator_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="actions_ph") 59 | self.expert_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="expert_observations_ph") 60 | self.expert_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="expert_actions_ph") 61 | 62 | def build_graph(self, obs_ph, acs_ph, reuse=False): 63 | with tf.variable_scope(self.scope): 64 | if reuse: 65 | tf.get_variable_scope().reuse_variables() 66 | 67 | with tf.variable_scope("obfilter"): 68 | self.obs_rms = RunningMeanStd(shape=self.observation_shape) 69 | obs = (obs_ph - self.obs_rms.mean / self.obs_rms.std) 70 | _input = tf.concat([obs, acs_ph], axis=1) # concatenate the two input -> form a transition 71 | p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh) 72 | p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh) 73 | logits = tf.contrib.layers.fully_connected(p_h2, 1, activation_fn=tf.identity) 74 | return logits 75 | 76 | def get_trainable_variables(self): 77 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 78 | 79 | def get_reward(self, obs, acs): 80 | sess = tf.get_default_session() 81 | if len(obs.shape) == 1: 82 | obs = np.expand_dims(obs, 0) 83 | if len(acs.shape) == 1: 84 | acs = np.expand_dims(acs, 0) 85 | feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: acs} 86 | reward = sess.run(self.reward_op, feed_dict) 87 | return reward 88 | -------------------------------------------------------------------------------- /baselines/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from baselines.common.tile_images import tile_images 3 | 4 | class AlreadySteppingError(Exception): 5 | """ 6 | Raised when an asynchronous step is running while 7 | step_async() is called again. 
8 | """ 9 | 10 | def __init__(self): 11 | msg = 'already running an async step' 12 | Exception.__init__(self, msg) 13 | 14 | 15 | class NotSteppingError(Exception): 16 | """ 17 | Raised when an asynchronous step is not running but 18 | step_wait() is called. 19 | """ 20 | 21 | def __init__(self): 22 | msg = 'not running an async step' 23 | Exception.__init__(self, msg) 24 | 25 | 26 | class VecEnv(ABC): 27 | """ 28 | An abstract asynchronous, vectorized environment. 29 | """ 30 | 31 | def __init__(self, num_envs, observation_space, action_space): 32 | self.num_envs = num_envs 33 | self.observation_space = observation_space 34 | self.action_space = action_space 35 | self.closed = False 36 | self.viewer = None # For rendering 37 | 38 | @abstractmethod 39 | def reset(self): 40 | """ 41 | Reset all the environments and return an array of 42 | observations, or a dict of observation arrays. 43 | 44 | If step_async is still doing work, that work will 45 | be cancelled and step_wait() should not be called 46 | until step_async() is invoked again. 47 | """ 48 | pass 49 | 50 | @abstractmethod 51 | def step_async(self, actions): 52 | """ 53 | Tell all the environments to start taking a step 54 | with the given actions. 55 | Call step_wait() to get the results of the step. 56 | 57 | You should not call this if a step_async run is 58 | already pending. 59 | """ 60 | pass 61 | 62 | @abstractmethod 63 | def step_wait(self): 64 | """ 65 | Wait for the step taken with step_async(). 66 | 67 | Returns (obs, rews, dones, infos): 68 | - obs: an array of observations, or a dict of 69 | arrays of observations. 70 | - rews: an array of rewards 71 | - dones: an array of "episode done" booleans 72 | - infos: a sequence of info objects 73 | """ 74 | pass 75 | 76 | def close_extras(self): 77 | """ 78 | Clean up the extra resources, beyond what's in this base class. 79 | Only runs when not self.closed. 80 | """ 81 | pass 82 | 83 | def close(self): 84 | if self.closed: 85 | return 86 | if self.viewer is not None: 87 | self.viewer.close() 88 | self.close_extras() 89 | self.closed = True 90 | 91 | def step(self, actions): 92 | """ 93 | Step the environments synchronously. 94 | 95 | This is available for backwards compatibility. 96 | """ 97 | self.step_async(actions) 98 | return self.step_wait() 99 | 100 | def render(self, mode='human'): 101 | imgs = self.get_images() 102 | bigimg = tile_images(imgs) 103 | if mode == 'human': 104 | self.get_viewer().imshow(bigimg) 105 | elif mode == 'rgb_array': 106 | return bigimg 107 | else: 108 | raise NotImplementedError 109 | 110 | def get_images(self): 111 | """ 112 | Return RGB images from each environment 113 | """ 114 | raise NotImplementedError 115 | 116 | @property 117 | def unwrapped(self): 118 | if isinstance(self, VecEnvWrapper): 119 | return self.venv.unwrapped 120 | else: 121 | return self 122 | 123 | def get_viewer(self): 124 | if self.viewer is None: 125 | from gym.envs.classic_control import rendering 126 | self.viewer = rendering.SimpleImageViewer() 127 | return self.viewer 128 | 129 | 130 | class VecEnvWrapper(VecEnv): 131 | """ 132 | An environment wrapper that applies to an entire batch 133 | of environments at once. 
134 | """ 135 | 136 | def __init__(self, venv, observation_space=None, action_space=None): 137 | self.venv = venv 138 | VecEnv.__init__(self, 139 | num_envs=venv.num_envs, 140 | observation_space=observation_space or venv.observation_space, 141 | action_space=action_space or venv.action_space) 142 | 143 | def step_async(self, actions): 144 | self.venv.step_async(actions) 145 | 146 | @abstractmethod 147 | def reset(self): 148 | pass 149 | 150 | @abstractmethod 151 | def step_wait(self): 152 | pass 153 | 154 | def close(self): 155 | return self.venv.close() 156 | 157 | def render(self, mode='human'): 158 | return self.venv.render(mode=mode) 159 | 160 | def get_images(self): 161 | return self.venv.get_images() 162 | 163 | class CloudpickleWrapper(object): 164 | """ 165 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 166 | """ 167 | 168 | def __init__(self, x): 169 | self.x = x 170 | 171 | def __getstate__(self): 172 | import cloudpickle 173 | return cloudpickle.dumps(self.x) 174 | 175 | def __setstate__(self, ob): 176 | import pickle 177 | self.x = pickle.loads(ob) 178 | -------------------------------------------------------------------------------- /baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as regular array, but with two 11 | important differences: 12 | 13 | a) setting item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 15 | b) user has access to an efficient ( O(log segment size) ) 16 | `reduce` operation which reduces `operation` over 17 | a contiguous subsequence of items in the array. 18 | 19 | Paramters 20 | --------- 21 | capacity: int 22 | Total size of the array - must be a power of two. 23 | operation: lambda obj, obj -> obj 24 | and operation for combining elements (eg. sum, max) 25 | must form a mathematical group together with the set of 26 | possible values for array elements (i.e. be associative) 27 | neutral_element: obj 28 | neutral element for the operation above. eg. float('-inf') 29 | for max and 0 for sum. 30 | """ 31 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 32 | self._capacity = capacity 33 | self._value = [neutral_element for _ in range(2 * capacity)] 34 | self._operation = operation 35 | 36 | def _reduce_helper(self, start, end, node, node_start, node_end): 37 | if start == node_start and end == node_end: 38 | return self._value[node] 39 | mid = (node_start + node_end) // 2 40 | if end <= mid: 41 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 42 | else: 43 | if mid + 1 <= start: 44 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 45 | else: 46 | return self._operation( 47 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 48 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 49 | ) 50 | 51 | def reduce(self, start=0, end=None): 52 | """Returns result of applying `self.operation` 53 | to a contiguous subsequence of the array. 54 | 55 | self.operation(arr[start], operation(arr[start+1], operation(... 
arr[end]))) 56 | 57 | Parameters 58 | ---------- 59 | start: int 60 | beginning of the subsequence 61 | end: int 62 | end of the subsequences 63 | 64 | Returns 65 | ------- 66 | reduced: obj 67 | result of reducing self.operation over the specified range of array elements. 68 | """ 69 | if end is None: 70 | end = self._capacity 71 | if end < 0: 72 | end += self._capacity 73 | end -= 1 74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 75 | 76 | def __setitem__(self, idx, val): 77 | # index of the leaf 78 | idx += self._capacity 79 | self._value[idx] = val 80 | idx //= 2 81 | while idx >= 1: 82 | self._value[idx] = self._operation( 83 | self._value[2 * idx], 84 | self._value[2 * idx + 1] 85 | ) 86 | idx //= 2 87 | 88 | def __getitem__(self, idx): 89 | assert 0 <= idx < self._capacity 90 | return self._value[self._capacity + idx] 91 | 92 | 93 | class SumSegmentTree(SegmentTree): 94 | def __init__(self, capacity): 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, 97 | operation=operator.add, 98 | neutral_element=0.0 99 | ) 100 | 101 | def sum(self, start=0, end=None): 102 | """Returns arr[start] + ... + arr[end]""" 103 | return super(SumSegmentTree, self).reduce(start, end) 104 | 105 | def find_prefixsum_idx(self, prefixsum): 106 | """Find the highest index `i` in the array such that 107 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 108 | 109 | if array values are probabilities, this function 110 | allows to sample indexes according to the discrete 111 | probability efficiently. 112 | 113 | Parameters 114 | ---------- 115 | perfixsum: float 116 | upperbound on the sum of array prefix 117 | 118 | Returns 119 | ------- 120 | idx: int 121 | highest index satisfying the prefixsum constraint 122 | """ 123 | assert 0 <= prefixsum <= self.sum() + 1e-5 124 | idx = 1 125 | while idx < self._capacity: # while non-leaf 126 | if self._value[2 * idx] > prefixsum: 127 | idx = 2 * idx 128 | else: 129 | prefixsum -= self._value[2 * idx] 130 | idx = 2 * idx + 1 131 | return idx - self._capacity 132 | 133 | 134 | class MinSegmentTree(SegmentTree): 135 | def __init__(self, capacity): 136 | super(MinSegmentTree, self).__init__( 137 | capacity=capacity, 138 | operation=min, 139 | neutral_element=float('inf') 140 | ) 141 | 142 | def min(self, start=0, end=None): 143 | """Returns min(arr[start], ..., arr[end])""" 144 | 145 | return super(MinSegmentTree, self).reduce(start, end) 146 | --------------------------------------------------------------------------------
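A minimal usage sketch for the segment trees above, in the way a prioritized replay buffer typically combines them (the capacity and priority values are made up for illustration; the sum tree supports sampling proportional to priority, the min tree tracks the lowest stored priority):

import numpy as np
from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree

capacity = 8                               # must be a power of two
sum_tree = SumSegmentTree(capacity)
min_tree = MinSegmentTree(capacity)

# store hypothetical priorities for four transitions
for i, priority in enumerate([0.5, 1.0, 0.1, 2.0]):
    sum_tree[i] = priority                 # O(log capacity) per assignment
    min_tree[i] = priority

total = sum_tree.sum()                     # 3.6 == 0.5 + 1.0 + 0.1 + 2.0
lowest = min_tree.min(0, 4)                # 0.1, taken over the four stored slots

# sample an index with probability proportional to its priority: draw a
# prefix mass uniformly in [0, total) and descend the tree to the matching leaf
mass = np.random.uniform(0.0, total)
idx = sum_tree.find_prefixsum_idx(mass)
print(idx, sum_tree[idx], total, lowest)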