├── .benchmark_pattern
├── baselines
│   ├── __init__.py
│   ├── a2c
│   │   ├── __init__.py
│   │   ├── README.md
│   │   └── runner.py
│   ├── acer
│   │   ├── __init__.py
│   │   ├── defaults.py
│   │   ├── README.md
│   │   ├── runner.py
│   │   ├── policies.py
│   │   └── buffer.py
│   ├── acktr
│   │   ├── __init__.py
│   │   ├── acktr.py
│   │   ├── README.md
│   │   ├── run_mujoco.py
│   │   ├── utils.py
│   │   ├── value_functions.py
│   │   ├── kfac_utils.py
│   │   └── policies.py
│   ├── ddpg
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── noise.py
│   │   ├── models.py
│   │   └── memory.py
│   ├── gail
│   │   ├── __init__.py
│   │   ├── dataset
│   │   │   ├── __init__.py
│   │   │   └── mujoco_dset.py
│   │   ├── result
│   │   │   ├── hopper-training.png
│   │   │   ├── humanoid-training.png
│   │   │   ├── walker2d-training.png
│   │   │   ├── halfcheetah-training.png
│   │   │   ├── humanoidstandup-training.png
│   │   │   ├── Hopper-normalized-stochastic-scores.png
│   │   │   ├── Hopper-normalized-deterministic-scores.png
│   │   │   ├── Hopper-unnormalized-stochastic-scores.png
│   │   │   ├── Humanoid-normalized-stochastic-scores.png
│   │   │   ├── Walker2d-normalized-stochastic-scores.png
│   │   │   ├── HalfCheetah-normalized-stochastic-scores.png
│   │   │   ├── Hopper-unnormalized-deterministic-scores.png
│   │   │   ├── Humanoid-normalized-deterministic-scores.png
│   │   │   ├── Humanoid-unnormalized-stochastic-scores.png
│   │   │   ├── Walker2d-normalized-deterministic-scores.png
│   │   │   ├── Walker2d-unnormalized-stochastic-scores.png
│   │   │   ├── HalfCheetah-normalized-deterministic-scores.png
│   │   │   ├── HalfCheetah-unnormalized-stochastic-scores.png
│   │   │   ├── Humanoid-unnormalized-deterministic-scores.png
│   │   │   ├── Walker2d-unnormalized-deterministic-scores.png
│   │   │   ├── HalfCheetah-unnormalized-deterministic-scores.png
│   │   │   ├── HumanoidStandup-normalized-stochastic-scores.png
│   │   │   ├── HumanoidStandup-unnormalized-stochastic-scores.png
│   │   │   ├── HumanoidStandup-normalized-deterministic-scores.png
│   │   │   ├── HumanoidStandup-unnormalized-deterministic-scores.png
│   │   │   └── gail-result.md
│   │   ├── README.md
│   │   ├── statistics.py
│   │   ├── mlp_policy.py
│   │   └── adversary.py
│   ├── her
│   │   ├── __init__.py
│   │   ├── experiment
│   │   │   ├── __init__.py
│   │   │   ├── play.py
│   │   │   └── plot.py
│   │   ├── README.md
│   │   ├── actor_critic.py
│   │   ├── her.py
│   │   ├── replay_buffer.py
│   │   └── util.py
│   ├── ppo1
│   │   ├── __init__.py
│   │   ├── README.md
│   │   ├── run_mujoco.py
│   │   ├── run_robotics.py
│   │   ├── run_atari.py
│   │   ├── cnn_policy.py
│   │   ├── run_humanoid.py
│   │   └── mlp_policy.py
│   ├── ppo2
│   │   ├── __init__.py
│   │   ├── README.md
│   │   └── defaults.py
│   ├── trpo_mpi
│   │   ├── __init__.py
│   │   ├── README.md
│   │   └── defaults.py
│   ├── common
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── envs
│   │   │   │   ├── __init__.py
│   │   │   │   ├── fixed_sequence_env.py
│   │   │   │   ├── identity_env.py
│   │   │   │   └── mnist_env.py
│   │   │   ├── test_schedules.py
│   │   │   ├── test_tf_util.py
│   │   │   ├── test_cartpole.py
│   │   │   ├── test_doc_examples.py
│   │   │   ├── test_fixed_sequence.py
│   │   │   ├── test_mnist.py
│   │   │   ├── test_identity.py
│   │   │   ├── util.py
│   │   │   ├── test_segment_tree.py
│   │   │   └── test_serialization.py
│   │   ├── __init__.py
│   │   ├── runners.py
│   │   ├── mpi_fork.py
│   │   ├── identity_env.py
│   │   ├── tile_images.py
│   │   ├── vec_env
│   │   │   ├── vec_monitor.py
│   │   │   ├── vec_frame_stack.py
│   │   │   ├── util.py
│   │   │   ├── vec_normalize.py
│   │   │   ├── dummy_vec_env.py
│   │   │   ├── subproc_vec_env.py
│   │   │   ├── test_vec_env.py
│   │   │   └── __init__.py
│   │   ├── cg.py
│   │   ├── mpi_adam_optimizer.py
│   │   ├── running_stat.py
│   │   ├── input.py
│   │   ├── mpi_moments.py
│   │   ├── console_util.py
│   │   ├── dataset.py
│   │   ├── math_util.py
│   │   ├── mpi_adam.py
│   │   ├── filters.py
│   │   ├── mpi_util.py
│   │   ├── mpi_running_mean_std.py
│   │   ├── schedules.py
│   │   └── segment_tree.py
│   ├── deepq
│   │   ├── experiments
│   │   │   ├── __init__.py
│   │   │   ├── enjoy_cartpole.py
│   │   │   ├── enjoy_mountaincar.py
│   │   │   ├── enjoy_pong.py
│   │   │   ├── train_mountaincar.py
│   │   │   ├── train_cartpole.py
│   │   │   ├── enjoy_retro.py
│   │   │   ├── run_retro.py
│   │   │   ├── run_atari.py
│   │   │   └── custom_cartpole.py
│   │   ├── __init__.py
│   │   ├── defaults.py
│   │   ├── README.md
│   │   └── utils.py
│   ├── bench
│   │   └── __init__.py
│   └── results_plotter.py
├── data
│   ├── logo.jpg
│   └── cartpole.gif
├── setup.cfg
├── .travis.yml
├── .gitignore
├── Dockerfile
├── LICENSE
└── setup.py
/.benchmark_pattern:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/baselines/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/a2c/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/acer/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/acktr/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/ddpg/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/gail/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/her/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/ppo1/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/ppo2/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/trpo_mpi/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/common/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/gail/dataset/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/her/experiment/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/common/tests/envs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/baselines/acktr/acktr.py:
--------------------------------------------------------------------------------
1 | from baselines.acktr.acktr_disc import *
2 |
--------------------------------------------------------------------------------
/data/logo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/data/logo.jpg
--------------------------------------------------------------------------------
/data/cartpole.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/data/cartpole.gif
--------------------------------------------------------------------------------
/baselines/acer/defaults.py:
--------------------------------------------------------------------------------
1 | def atari():
2 | return dict(
3 | lrschedule='constant'
4 | )
5 |
--------------------------------------------------------------------------------
/baselines/bench/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.bench.benchmarks import *
2 | from baselines.bench.monitor import *
--------------------------------------------------------------------------------
/baselines/gail/result/hopper-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/hopper-training.png
--------------------------------------------------------------------------------
/baselines/gail/result/humanoid-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/humanoid-training.png
--------------------------------------------------------------------------------
/baselines/gail/result/walker2d-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/walker2d-training.png
--------------------------------------------------------------------------------
/baselines/gail/result/halfcheetah-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/halfcheetah-training.png
--------------------------------------------------------------------------------
/baselines/gail/result/humanoidstandup-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/humanoidstandup-training.png
--------------------------------------------------------------------------------
/baselines/gail/result/Hopper-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Hopper-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Humanoid-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Walker2d-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Hopper-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Humanoid-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Walker2d-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/baselines/common/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa F403
2 | from baselines.common.console_util import *
3 | from baselines.common.dataset import Dataset
4 | from baselines.common.math_util import *
5 | from baselines.common.misc_util import *
6 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | select = F,E999
3 | exclude =
4 | .git,
5 | __pycache__,
6 | baselines/her,
7 | baselines/ddpg,
8 | baselines/ppo1,
9 | baselines/bench,
10 | baselines/acktr,
11 |
12 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 |
5 | services:
6 | - docker
7 |
8 | install:
9 | - pip install flake8
10 | - docker build . -t baselines-test
11 |
12 | script:
13 | - flake8 .
14 | - docker run baselines-test pytest -v .
15 |
--------------------------------------------------------------------------------
/baselines/ddpg/README.md:
--------------------------------------------------------------------------------
1 | # DDPG
2 |
3 | - Original paper: https://arxiv.org/abs/1509.02971
4 | - Baselines blog post: https://blog.openai.com/better-exploration-with-parameter-noise/
5 | - `python -m baselines.ddpg.main` runs the algorithm for 1M timesteps on a Mujoco environment. See help (`-h`) for more options.
--------------------------------------------------------------------------------
/baselines/acer/README.md:
--------------------------------------------------------------------------------
1 | # ACER
2 |
3 | - Original paper: https://arxiv.org/abs/1611.01224
4 | - `python -m baselines.run --alg=acer --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
5 | - also refer to the repo-wide [README.md](../../README.md#training-models)
6 |
7 |
--------------------------------------------------------------------------------
/baselines/a2c/README.md:
--------------------------------------------------------------------------------
1 | # A2C
2 |
3 | - Original paper: https://arxiv.org/abs/1602.01783
4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
5 | - `python -m baselines.run --alg=a2c --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
6 | - also refer to the repo-wide [README.md](../../README.md#training-models)
7 |
--------------------------------------------------------------------------------
/baselines/acktr/README.md:
--------------------------------------------------------------------------------
1 | # ACKTR
2 |
3 | - Original paper: https://arxiv.org/abs/1708.05144
4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
5 | - `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
6 | - also refer to the repo-wide [README.md](../../README.md#training-models)
7 |
8 |
9 |
--------------------------------------------------------------------------------
/baselines/deepq/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.deepq import models # noqa
2 | from baselines.deepq.build_graph import build_act, build_train # noqa
3 | from baselines.deepq.deepq import learn, load_act # noqa
4 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa
5 |
6 | def wrap_atari_dqn(env):
7 | from baselines.common.atari_wrappers import wrap_deepmind
8 | return wrap_deepmind(env, frame_stack=True, scale=True)
9 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.pyc
3 | *.pkl
4 | *.py~
5 | .pytest_cache
6 | .DS_Store
7 | .idea
8 |
9 | # Setuptools distribution and build folders.
10 | /dist/
11 | /build
12 | keys/
13 |
14 | # Virtualenv
15 | /env
16 |
17 |
18 | *.sublime-project
19 | *.sublime-workspace
20 |
21 | .idea
22 |
23 | logs/
24 |
25 | .ipynb_checkpoints
26 | ghostdriver.log
27 |
28 | htmlcov
29 |
30 | junk
31 | src
32 |
33 | *.egg-info
34 | .cache
35 |
36 | MUJOCO_LOG.TXT
37 |
--------------------------------------------------------------------------------
/baselines/ppo2/README.md:
--------------------------------------------------------------------------------
1 | # PPO2
2 |
3 | - Original paper: https://arxiv.org/abs/1707.06347
4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
5 |
6 | - `python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
7 | - `python -m baselines.run --alg=ppo2 --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a Mujoco Ant environment.
8 | - also refer to the repo-wide [README.md](../../README.md#training-models)
9 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/enjoy_cartpole.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def main():
7 | env = gym.make("CartPole-v0")
8 | act = deepq.load("cartpole_model.pkl")
9 |
10 | while True:
11 | obs, done = env.reset(), False
12 | episode_rew = 0
13 | while not done:
14 | env.render()
15 | obs, rew, done, _ = env.step(act(obs[None])[0])
16 | episode_rew += rew
17 | print("Episode reward", episode_rew)
18 |
19 |
20 | if __name__ == '__main__':
21 | main()
22 |
--------------------------------------------------------------------------------
/baselines/trpo_mpi/README.md:
--------------------------------------------------------------------------------
1 | # trpo_mpi
2 |
3 | - Original paper: https://arxiv.org/abs/1502.05477
4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
5 | - `mpirun -np 16 python -m baselines.run --alg=trpo_mpi --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on Atari Pong. See help (`-h`) for more options.
6 | - `python -m baselines.run --alg=trpo_mpi --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a Mujoco Ant environment.
7 | - also refer to the repo-wide [README.md](../../README.md#training-models)
8 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/enjoy_mountaincar.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def main():
7 | env = gym.make("MountainCar-v0")
8 | act = deepq.load("mountaincar_model.pkl")
9 |
10 | while True:
11 | obs, done = env.reset(), False
12 | episode_rew = 0
13 | while not done:
14 | env.render()
15 | obs, rew, done, _ = env.step(act(obs[None])[0])
16 | episode_rew += rew
17 | print("Episode reward", episode_rew)
18 |
19 |
20 | if __name__ == '__main__':
21 | main()
22 |
--------------------------------------------------------------------------------
/baselines/deepq/defaults.py:
--------------------------------------------------------------------------------
1 | def atari():
2 | return dict(
3 | network='conv_only',
4 | lr=1e-4,
5 | buffer_size=10000,
6 | exploration_fraction=0.1,
7 | exploration_final_eps=0.01,
8 | train_freq=4,
9 | learning_starts=10000,
10 | target_network_update_freq=1000,
11 | gamma=0.99,
12 | prioritized_replay=True,
13 | prioritized_replay_alpha=0.6,
14 | checkpoint_freq=10000,
15 | checkpoint_path=None,
16 | dueling=True
17 | )
18 |
19 | def retro():
20 | return atari()
21 |
22 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/enjoy_pong.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from baselines import deepq
3 |
4 |
5 | def main():
6 | env = gym.make("PongNoFrameskip-v4")
7 | env = deepq.wrap_atari_dqn(env)
8 | act = deepq.load("pong_model.pkl")
9 |
10 | while True:
11 | obs, done = env.reset(), False
12 | episode_rew = 0
13 | while not done:
14 | env.render()
15 | obs, rew, done, _ = env.step(act(obs[None])[0])
16 | episode_rew += rew
17 | print("Episode reward", episode_rew)
18 |
19 |
20 | if __name__ == '__main__':
21 | main()
22 |
--------------------------------------------------------------------------------
/baselines/ppo2/defaults.py:
--------------------------------------------------------------------------------
1 | def mujoco():
2 | return dict(
3 | nsteps=2048,
4 | nminibatches=32,
5 | lam=0.95,
6 | gamma=0.99,
7 | noptepochs=10,
8 | log_interval=1,
9 | ent_coef=0.0,
10 | lr=lambda f: 3e-4 * f,
11 | cliprange=0.2,
12 | value_network='copy'
13 | )
14 |
15 | def atari():
16 | return dict(
17 | nsteps=128, nminibatches=4,
18 | lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
19 | ent_coef=.01,
20 | lr=lambda f : f * 2.5e-4,
21 | cliprange=lambda f : f * 0.1,
22 | )
23 |
--------------------------------------------------------------------------------
/baselines/ppo1/README.md:
--------------------------------------------------------------------------------
1 | # PPOSGD
2 |
3 | - Original paper: https://arxiv.org/abs/1707.06347
4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M timesteps on a Mujoco environment.
7 |
8 | - Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model`
9 | - Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model`
10 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM ubuntu:16.04
2 |
3 | RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
4 | ENV CODE_DIR /root/code
5 | ENV VENV /root/venv
6 |
7 | RUN \
8 | pip install virtualenv && \
9 | virtualenv $VENV --python=python3 && \
10 | . $VENV/bin/activate && \
11 | pip install --upgrade pip
12 |
13 | ENV PATH=$VENV/bin:$PATH
14 |
15 | COPY . $CODE_DIR/baselines
16 | WORKDIR $CODE_DIR/baselines
17 |
18 | # Clean up pycache and pyc files
19 | RUN rm -rf __pycache__ && \
20 | find . -name "*.pyc" -delete && \
21 | pip install tensorflow && \
22 | pip install -e .[test]
23 |
24 |
25 | CMD /bin/bash
26 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/train_mountaincar.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def main():
7 | env = gym.make("MountainCar-v0")
8 | # Enabling layer_norm here is important for parameter space noise!
9 | model = deepq.models.mlp([64], layer_norm=True)
10 | act = deepq.learn(
11 | env,
12 | q_func=model,
13 | lr=1e-3,
14 | max_timesteps=100000,
15 | buffer_size=50000,
16 | exploration_fraction=0.1,
17 | exploration_final_eps=0.1,
18 | print_freq=10,
19 | param_noise=True
20 | )
21 | print("Saving model to mountaincar_model.pkl")
22 | act.save("mountaincar_model.pkl")
23 |
24 |
25 | if __name__ == '__main__':
26 | main()
27 |
--------------------------------------------------------------------------------
/baselines/common/runners.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from abc import ABC, abstractmethod
3 |
4 | class AbstractEnvRunner(ABC):
5 | def __init__(self, *, env, model, nsteps):
6 | self.env = env
7 | self.model = model
8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1
9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
11 | self.obs[:] = env.reset()
12 | self.nsteps = nsteps
13 | self.states = model.initial_state
14 | self.dones = [False for _ in range(nenv)]
15 |
16 | @abstractmethod
17 | def run(self):
18 | raise NotImplementedError
19 |
20 |
--------------------------------------------------------------------------------
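A minimal sketch of how the `AbstractEnvRunner` above is meant to be subclassed (not part of the repo; it assumes the model follows the usual baselines policy convention, where `model.step(obs, S=states, M=dones)` returns actions, values, the recurrent state, and negative log-probabilities):

```python
import numpy as np
from baselines.common.runners import AbstractEnvRunner

class CollectRunner(AbstractEnvRunner):
    """Hypothetical runner: roll the policy forward for nsteps and return the batch."""
    def run(self):
        mb_obs, mb_actions, mb_rewards, mb_dones = [], [], [], []
        for _ in range(self.nsteps):
            # model.step signature assumed from the baselines policy convention
            actions, values, self.states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
            mb_obs.append(self.obs.copy())
            mb_actions.append(actions)
            self.obs[:], rewards, self.dones, _ = self.env.step(actions)
            mb_rewards.append(rewards)
            mb_dones.append(self.dones)
        return (np.asarray(mb_obs), np.asarray(mb_actions),
                np.asarray(mb_rewards), np.asarray(mb_dones))
```
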
/baselines/common/mpi_fork.py:
--------------------------------------------------------------------------------
1 | import os, subprocess, sys
2 |
3 | def mpi_fork(n, bind_to_core=False):
4 | """Re-launches the current script with workers
5 | Returns "parent" for the original process and "child" for the MPI workers.
6 | """
7 | if n<=1:
8 | return "child"
9 | if os.getenv("IN_MPI") is None:
10 | env = os.environ.copy()
11 | env.update(
12 | MKL_NUM_THREADS="1",
13 | OMP_NUM_THREADS="1",
14 | IN_MPI="1"
15 | )
16 | args = ["mpirun", "-np", str(n)]
17 | if bind_to_core:
18 | args += ["-bind-to", "core"]
19 | args += [sys.executable] + sys.argv
20 | subprocess.check_call(args, env=env)
21 | return "parent"
22 | else:
23 | return "child"
24 |
--------------------------------------------------------------------------------
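A small usage sketch for the `mpi_fork` above (hypothetical script, not part of the repo): the first invocation re-launches itself under `mpirun` and returns "parent", while each spawned worker falls through to the actual work.

```python
from baselines.common.mpi_fork import mpi_fork

def main():
    # Re-exec this script as `mpirun -np 4 python <script>`; the original
    # process gets "parent" back and should simply exit.
    if mpi_fork(4) == "parent":
        return
    # From here on, each of the 4 MPI workers runs the real training code.
    print("worker running")

if __name__ == "__main__":
    main()
```
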
/baselines/trpo_mpi/defaults.py:
--------------------------------------------------------------------------------
1 | from baselines.common.models import mlp, cnn_small
2 |
3 |
4 | def atari():
5 | return dict(
6 | network = cnn_small(),
7 | timesteps_per_batch=512,
8 | max_kl=0.001,
9 | cg_iters=10,
10 | cg_damping=1e-3,
11 | gamma=0.98,
12 | lam=1.0,
13 | vf_iters=3,
14 | vf_stepsize=1e-4,
15 | entcoeff=0.00,
16 | )
17 |
18 | def mujoco():
19 | return dict(
20 | network = mlp(num_hidden=32, num_layers=2),
21 | timesteps_per_batch=1024,
22 | max_kl=0.01,
23 | cg_iters=10,
24 | cg_damping=0.1,
25 | gamma=0.99,
26 | lam=0.98,
27 | vf_iters=5,
28 | vf_stepsize=1e-3,
29 | normalize_observations=True,
30 | )
31 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/train_cartpole.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def callback(lcl, _glb):
7 | # stop training once the mean reward over the last 100 episodes reaches 199
8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
9 | return is_solved
10 |
11 |
12 | def main():
13 | env = gym.make("CartPole-v0")
14 | act = deepq.learn(
15 | env,
16 | network='mlp',
17 | lr=1e-3,
18 | total_timesteps=100000,
19 | buffer_size=50000,
20 | exploration_fraction=0.1,
21 | exploration_final_eps=0.02,
22 | print_freq=10,
23 | callback=callback
24 | )
25 | print("Saving model to cartpole_model.pkl")
26 | act.save("cartpole_model.pkl")
27 |
28 |
29 | if __name__ == '__main__':
30 | main()
31 |
--------------------------------------------------------------------------------
/baselines/common/identity_env.py:
--------------------------------------------------------------------------------
1 | from gym import Env
2 | from gym.spaces import Discrete
3 |
4 |
5 | class IdentityEnv(Env):
6 | def __init__(
7 | self,
8 | dim,
9 | ep_length=100,
10 | ):
11 |
12 | self.action_space = Discrete(dim)
13 | self.reset()
14 |
15 | def reset(self):
16 | self._choose_next_state()
17 | self.observation_space = self.action_space
18 |
19 | return self.state
20 |
21 | def step(self, actions):
22 | rew = self._get_reward(actions)
23 | self._choose_next_state()
24 | return self.state, rew, False, {}
25 |
26 | def _choose_next_state(self):
27 | self.state = self.action_space.sample()
28 |
29 | def _get_reward(self, actions):
30 | return 1 if self.state == actions else 0
31 |
--------------------------------------------------------------------------------
/baselines/common/tile_images.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def tile_images(img_nhwc):
4 | """
5 | Tile N images into one big PxQ image
6 | (P,Q) are chosen to be as close as possible, and if N
7 | is square, then P=Q.
8 |
9 | input: img_nhwc, list or array of images, ndim=4 once turned into array
10 | n = batch index, h = height, w = width, c = channel
11 | returns:
12 | bigim_HWc, ndarray with ndim=3
13 | """
14 | img_nhwc = np.asarray(img_nhwc)
15 | N, h, w, c = img_nhwc.shape
16 | H = int(np.ceil(np.sqrt(N)))
17 | W = int(np.ceil(float(N)/H))
18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)])
19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c)
20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4)
21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c)
22 | return img_Hh_Ww_c
23 |
24 |
--------------------------------------------------------------------------------
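For illustration, a short usage sketch of the `tile_images` above (the frame count and size here are made up):

```python
import numpy as np
from baselines.common.tile_images import tile_images

# Seven 16x16 RGB frames -> a 3x3 grid (ceil(sqrt(7)) = 3); blank tiles pad the remainder.
frames = np.random.randint(0, 255, size=(7, 16, 16, 3), dtype=np.uint8)
big_image = tile_images(frames)
print(big_image.shape)  # (48, 48, 3)
```
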
/baselines/common/tests/test_schedules.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule
4 |
5 |
6 | def test_piecewise_schedule():
7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500)
8 |
9 | assert np.isclose(ps.value(-10), 500)
10 | assert np.isclose(ps.value(0), 150)
11 | assert np.isclose(ps.value(5), 200)
12 | assert np.isclose(ps.value(9), 80)
13 | assert np.isclose(ps.value(50), 50)
14 | assert np.isclose(ps.value(80), 50)
15 | assert np.isclose(ps.value(150), 0)
16 | assert np.isclose(ps.value(175), -25)
17 | assert np.isclose(ps.value(201), 500)
18 | assert np.isclose(ps.value(500), 500)
19 |
20 | assert np.isclose(ps.value(200 - 1e-10), -50)
21 |
22 |
23 | def test_constant_schedule():
24 | cs = ConstantSchedule(5)
25 | for i in range(-100, 100):
26 | assert np.isclose(cs.value(i), 5)
27 |
--------------------------------------------------------------------------------
/baselines/common/vec_env/vec_monitor.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | import numpy as np
3 |
4 |
5 | class VecMonitor(VecEnvWrapper):
6 | def __init__(self, venv):
7 | VecEnvWrapper.__init__(self, venv)
8 | self.eprets = None
9 | self.eplens = None
10 |
11 | def reset(self):
12 | obs = self.venv.reset()
13 | self.eprets = np.zeros(self.num_envs, 'f')
14 | self.eplens = np.zeros(self.num_envs, 'i')
15 | return obs
16 |
17 | def step_wait(self):
18 | obs, rews, dones, infos = self.venv.step_wait()
19 | self.eprets += rews
20 | self.eplens += 1
21 | newinfos = []
22 | for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)):
23 | info = info.copy()
24 | if done:
25 | info['episode'] = {'r': ret, 'l': eplen}
26 | self.eprets[i] = 0
27 | self.eplens[i] = 0
28 | newinfos.append(info)
29 | return obs, rews, dones, newinfos
30 |
--------------------------------------------------------------------------------
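A usage sketch for the `VecMonitor` above (hypothetical example, assuming `DummyVecEnv` auto-resets finished episodes as it does in this repo): completed-episode returns and lengths show up in the step infos under the `'episode'` key.

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_monitor import VecMonitor

venv = VecMonitor(DummyVecEnv([lambda: gym.make('CartPole-v0')]))
venv.reset()
for _ in range(500):
    obs, rews, dones, infos = venv.step([venv.action_space.sample()])
    for info in infos:
        if 'episode' in info:
            print('return:', info['episode']['r'], 'length:', info['episode']['l'])
```
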
/baselines/common/cg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
3 | """
4 | Demmel p 312
5 | """
6 | p = b.copy()
7 | r = b.copy()
8 | x = np.zeros_like(b)
9 | rdotr = r.dot(r)
10 |
11 | fmtstr = "%10i %10.3g %10.3g"
12 | titlestr = "%10s %10s %10s"
13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm"))
14 |
15 | for i in range(cg_iters):
16 | if callback is not None:
17 | callback(x)
18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x)))
19 | z = f_Ax(p)
20 | v = rdotr / p.dot(z)
21 | x += v*p
22 | r -= v*z
23 | newrdotr = r.dot(r)
24 | mu = newrdotr/rdotr
25 | p = r + mu*p
26 |
27 | rdotr = newrdotr
28 | if rdotr < residual_tol:
29 | break
30 |
31 | if callback is not None:
32 | callback(x)
33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631
34 | return x
--------------------------------------------------------------------------------
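A quick usage sketch of the `cg` solver above (toy system chosen for illustration): the caller supplies only a matrix-vector product, so the matrix never has to be formed explicitly.

```python
import numpy as np
from baselines.common.cg import cg

# Solve A x = b for a small symmetric positive-definite A.
A = np.array([[4.0, 1.0],
              [1.0, 3.0]])
b = np.array([1.0, 2.0])
x = cg(lambda p: A.dot(p), b, cg_iters=10)
print(np.allclose(A.dot(x), b))  # True
```
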
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2017 OpenAI (http://openai.com)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/baselines/ppo1/run_mujoco.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
4 | from baselines.common import tf_util as U
5 | from baselines import logger
6 |
7 | def train(env_id, num_timesteps, seed):
8 | from baselines.ppo1 import mlp_policy, pposgd_simple
9 | U.make_session(num_cpu=1).__enter__()
10 | def policy_fn(name, ob_space, ac_space):
11 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
12 | hid_size=64, num_hid_layers=2)
13 | env = make_mujoco_env(env_id, seed)
14 | pposgd_simple.learn(env, policy_fn,
15 | max_timesteps=num_timesteps,
16 | timesteps_per_actorbatch=2048,
17 | clip_param=0.2, entcoeff=0.0,
18 | optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
19 | gamma=0.99, lam=0.95, schedule='linear',
20 | )
21 | env.close()
22 |
23 | def main():
24 | args = mujoco_arg_parser().parse_args()
25 | logger.configure()
26 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
27 |
28 | if __name__ == '__main__':
29 | main()
30 |
--------------------------------------------------------------------------------
/baselines/common/tests/test_tf_util.py:
--------------------------------------------------------------------------------
1 | # tests for tf_util
2 | import tensorflow as tf
3 | from baselines.common.tf_util import (
4 | function,
5 | initialize,
6 | single_threaded_session
7 | )
8 |
9 |
10 | def test_function():
11 | with tf.Graph().as_default():
12 | x = tf.placeholder(tf.int32, (), name="x")
13 | y = tf.placeholder(tf.int32, (), name="y")
14 | z = 3 * x + 2 * y
15 | lin = function([x, y], z, givens={y: 0})
16 |
17 | with single_threaded_session():
18 | initialize()
19 |
20 | assert lin(2) == 6
21 | assert lin(2, 2) == 10
22 |
23 |
24 | def test_multikwargs():
25 | with tf.Graph().as_default():
26 | x = tf.placeholder(tf.int32, (), name="x")
27 | with tf.variable_scope("other"):
28 | x2 = tf.placeholder(tf.int32, (), name="x")
29 | z = 3 * x + 2 * x2
30 |
31 | lin = function([x, x2], z, givens={x2: 0})
32 | with single_threaded_session():
33 | initialize()
34 | assert lin(2) == 6
35 | assert lin(2, 2) == 10
36 |
37 |
38 | if __name__ == '__main__':
39 | test_function()
40 | test_multikwargs()
41 |
--------------------------------------------------------------------------------
/baselines/common/tests/test_cartpole.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 |
4 | from baselines.run import get_learn_function
5 | from baselines.common.tests.util import reward_per_episode_test
6 |
7 | common_kwargs = dict(
8 | total_timesteps=30000,
9 | network='mlp',
10 | gamma=1.0,
11 | seed=0,
12 | )
13 |
14 | learn_kwargs = {
15 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05),
16 | 'acktr': dict(nsteps=32, value_network='copy'),
17 | 'deepq': dict(total_timesteps=20000),
18 | 'ppo2': dict(value_network='copy'),
19 | 'trpo_mpi': {}
20 | }
21 |
22 | @pytest.mark.slow
23 | @pytest.mark.parametrize("alg", learn_kwargs.keys())
24 | def test_cartpole(alg):
25 | '''
26 | Test if the algorithm (with an mlp policy)
27 | can learn to balance the cartpole
28 | '''
29 |
30 | kwargs = common_kwargs.copy()
31 | kwargs.update(learn_kwargs[alg])
32 |
33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
34 | def env_fn():
35 |
36 | env = gym.make('CartPole-v0')
37 | env.seed(0)
38 | return env
39 |
40 | reward_per_episode_test(env_fn, learn_fn, 100)
41 |
42 | if __name__ == '__main__':
43 | test_cartpole('deepq')
44 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/enjoy_retro.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import numpy as np
4 |
5 | from baselines import deepq
6 | from baselines.common import retro_wrappers
7 |
8 |
9 | def main():
10 | parser = argparse.ArgumentParser()
11 | parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes')
12 | parser.add_argument('--gamestate', help='game state to load', default='Level1-1')
13 | parser.add_argument('--model', help='model pickle file from ActWrapper.save', default='model.pkl')
14 | args = parser.parse_args()
15 |
16 | env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=None)
17 | env = retro_wrappers.wrap_deepmind_retro(env)
18 | act = deepq.load(args.model)
19 |
20 | while True:
21 | obs, done = env.reset(), False
22 | episode_rew = 0
23 | while not done:
24 | env.render()
25 | action = act(obs[None])[0]
26 | env_action = np.zeros(env.action_space.n)
27 | env_action[action] = 1
28 | obs, rew, done, _ = env.step(env_action)
29 | episode_rew += rew
30 | print('Episode reward', episode_rew)
31 |
32 |
33 | if __name__ == '__main__':
34 | main()
35 |
--------------------------------------------------------------------------------
/baselines/acktr/run_mujoco.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import tensorflow as tf
4 | from baselines import logger
5 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
6 | from baselines.acktr.acktr_cont import learn
7 | from baselines.acktr.policies import GaussianMlpPolicy
8 | from baselines.acktr.value_functions import NeuralNetValueFunction
9 |
10 | def train(env_id, num_timesteps, seed):
11 | env = make_mujoco_env(env_id, seed)
12 |
13 | with tf.Session(config=tf.ConfigProto()):
14 | ob_dim = env.observation_space.shape[0]
15 | ac_dim = env.action_space.shape[0]
16 | with tf.variable_scope("vf"):
17 | vf = NeuralNetValueFunction(ob_dim, ac_dim)
18 | with tf.variable_scope("pi"):
19 | policy = GaussianMlpPolicy(ob_dim, ac_dim)
20 |
21 | learn(env, policy=policy, vf=vf,
22 | gamma=0.99, lam=0.97, timesteps_per_batch=2500,
23 | desired_kl=0.002,
24 | num_timesteps=num_timesteps, animate=False)
25 |
26 | env.close()
27 |
28 | def main():
29 | args = mujoco_arg_parser().parse_args()
30 | logger.configure()
31 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
32 |
33 | if __name__ == "__main__":
34 | main()
35 |
--------------------------------------------------------------------------------
/baselines/common/tests/envs/fixed_sequence_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import Env
3 | from gym.spaces import Discrete
4 |
5 |
6 | class FixedSequenceEnv(Env):
7 | def __init__(
8 | self,
9 | n_actions=10,
10 | seed=0,
11 | episode_len=100
12 | ):
13 | self.np_random = np.random.RandomState()
14 | self.np_random.seed(seed)
15 | self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)]
16 |
17 | self.action_space = Discrete(n_actions)
18 | self.observation_space = Discrete(1)
19 |
20 | self.episode_len = episode_len
21 | self.time = 0
22 | self.reset()
23 |
24 | def reset(self):
25 | self.time = 0
26 | return 0
27 |
28 | def step(self, actions):
29 | rew = self._get_reward(actions)
30 | self._choose_next_state()
31 | done = False
32 | if self.episode_len and self.time >= self.episode_len:
33 | rew = 0
34 | done = True
35 |
36 | return 0, rew, done, {}
37 |
38 | def _choose_next_state(self):
39 | self.time += 1
40 |
41 | def _get_reward(self, actions):
42 | return 1 if actions == self.sequence[self.time] else 0
43 |
44 |
45 |
--------------------------------------------------------------------------------
/baselines/gail/README.md:
--------------------------------------------------------------------------------
1 | # Generative Adversarial Imitation Learning (GAIL)
2 |
3 | - Original paper: https://arxiv.org/abs/1606.03476
4 |
5 | For MuJoCo benchmark results, see [here](result/gail-result.md).
6 |
7 | ## If you want to train an imitation learning agent
8 |
9 | ### Step 1: Download expert data
10 |
11 | Download the expert data into `./data`, [download link](https://drive.google.com/drive/folders/1h3H4AY_ZBx08hz-Ct0Nxxus-V1melu1U?usp=sharing)
12 |
13 | ### Step 2: Run GAIL
14 |
15 | Run with single thread:
16 |
17 | ```bash
18 | python -m baselines.gail.run_mujoco
19 | ```
20 |
21 | Run with multiple threads:
22 |
23 | ```bash
24 | mpirun -np 16 python -m baselines.gail.run_mujoco
25 | ```
26 |
27 | See help (`-h`) for more options.
28 |
29 | #### In case you want to run Behavior Cloning (BC)
30 |
31 | ```bash
32 | python -m baselines.gail.behavior_clone
33 | ```
34 |
35 | See help (`-h`) for more options.
36 |
37 |
38 | ## Contributing
39 |
40 | Bug reports and pull requests are welcome on GitHub at https://github.com/openai/baselines/pulls.
41 |
42 | ## Maintainers
43 |
44 | - Yuan-Hong Liao, andrewliao11_at_gmail_dot_com
45 | - Ryan Julian, ryanjulian_at_gmail_dot_com
46 |
47 | ## Others
48 |
49 | Thanks to the following open-source projects:
50 |
51 | - @openai/imitation
52 | - @carpedm20/deep-rl-tensorflow
53 |
--------------------------------------------------------------------------------
/baselines/common/vec_env/vec_frame_stack.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | import numpy as np
3 | from gym import spaces
4 |
5 |
6 | class VecFrameStack(VecEnvWrapper):
7 | def __init__(self, venv, nstack):
8 | self.venv = venv
9 | self.nstack = nstack
10 | wos = venv.observation_space # wrapped ob space
11 | low = np.repeat(wos.low, self.nstack, axis=-1)
12 | high = np.repeat(wos.high, self.nstack, axis=-1)
13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype)
14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space)
16 |
17 | def step_wait(self):
18 | obs, rews, news, infos = self.venv.step_wait()
19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
20 | for (i, new) in enumerate(news):
21 | if new:
22 | self.stackedobs[i] = 0
23 | self.stackedobs[..., -obs.shape[-1]:] = obs
24 | return self.stackedobs, rews, news, infos
25 |
26 | def reset(self):
27 | obs = self.venv.reset()
28 | self.stackedobs[...] = 0
29 | self.stackedobs[..., -obs.shape[-1]:] = obs
30 | return self.stackedobs
31 |
32 | def close(self):
33 | self.venv.close()
34 |
--------------------------------------------------------------------------------
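A usage sketch for the `VecFrameStack` above (hypothetical example; assumes the Atari dependencies are installed): stacking 4 frames widens Pong's channel axis from 3 to 12.

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_frame_stack import VecFrameStack

venv = DummyVecEnv([lambda: gym.make('PongNoFrameskip-v4')])
venv = VecFrameStack(venv, nstack=4)
obs = venv.reset()
print(obs.shape)  # (1, 210, 160, 12): 4 stacked RGB frames along the channel axis
```
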
/baselines/acktr/utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
4 | with tf.variable_scope(name, reuse=reuse):
5 | assert (len(tf.get_variable_scope().name.split('/')) == 2)
6 |
7 | w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
8 | b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
9 | weight_decay_fc = 3e-4
10 |
11 | if weight_loss_dict is not None:
12 | weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss')
13 | if weight_loss_dict is not None:
14 | weight_loss_dict[w] = weight_decay_fc
15 | weight_loss_dict[b] = 0.0
16 |
17 | tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay)
18 |
19 | return tf.nn.bias_add(tf.matmul(x, w), b)
20 |
21 | def kl_div(action_dist1, action_dist2, action_size):
22 | mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
23 | mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
24 |
25 | numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2)
26 | denominator = 2 * tf.square(std2) + 1e-8
27 | return tf.reduce_sum(
28 | numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
29 |
--------------------------------------------------------------------------------
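For reference, `kl_div` above computes the KL divergence between two diagonal-Gaussian action distributions, where each row packs the means followed by the standard deviations; written out (with the small constant kept in the denominator for numerical stability), the per-row sum it evaluates is:

```latex
D_{\mathrm{KL}}\big(\mathcal{N}(\mu_1,\sigma_1^2)\,\|\,\mathcal{N}(\mu_2,\sigma_2^2)\big)
  = \sum_i \left[ \frac{(\mu_{1,i}-\mu_{2,i})^2 + \sigma_{1,i}^2 - \sigma_{2,i}^2}{2\sigma_{2,i}^2 + \epsilon}
  + \log\frac{\sigma_{2,i}}{\sigma_{1,i}} \right]
```
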
/baselines/ppo1/run_robotics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from mpi4py import MPI
4 | from baselines.common import set_global_seeds
5 | from baselines import logger
6 | from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser
7 | import mujoco_py
8 |
9 |
10 | def train(env_id, num_timesteps, seed):
11 | from baselines.ppo1 import mlp_policy, pposgd_simple
12 | import baselines.common.tf_util as U
13 | rank = MPI.COMM_WORLD.Get_rank()
14 | sess = U.single_threaded_session()
15 | sess.__enter__()
16 | mujoco_py.ignore_mujoco_warnings().__enter__()
17 | workerseed = seed + 10000 * rank
18 | set_global_seeds(workerseed)
19 | env = make_robotics_env(env_id, workerseed, rank=rank)
20 | def policy_fn(name, ob_space, ac_space):
21 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
22 | hid_size=256, num_hid_layers=3)
23 |
24 | pposgd_simple.learn(env, policy_fn,
25 | max_timesteps=num_timesteps,
26 | timesteps_per_actorbatch=2048,
27 | clip_param=0.2, entcoeff=0.0,
28 | optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
29 | gamma=0.99, lam=0.95, schedule='linear',
30 | )
31 | env.close()
32 |
33 |
34 | def main():
35 | args = robotics_arg_parser().parse_args()
36 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
37 |
38 |
39 | if __name__ == '__main__':
40 | main()
41 |
--------------------------------------------------------------------------------
/baselines/common/mpi_adam_optimizer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from mpi4py import MPI
4 |
5 | class MpiAdamOptimizer(tf.train.AdamOptimizer):
6 | """Adam optimizer that averages gradients across mpi processes."""
7 | def __init__(self, comm, **kwargs):
8 | self.comm = comm
9 | tf.train.AdamOptimizer.__init__(self, **kwargs)
10 | def compute_gradients(self, loss, var_list, **kwargs):
11 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs)
12 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None]
13 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0)
14 | shapes = [v.shape.as_list() for g, v in grads_and_vars]
15 | sizes = [int(np.prod(s)) for s in shapes]
16 |
17 | num_tasks = self.comm.Get_size()
18 | buf = np.zeros(sum(sizes), np.float32)
19 |
20 | def _collect_grads(flat_grad):
21 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM)
22 | np.divide(buf, float(num_tasks), out=buf)
23 | return buf
24 |
25 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32)
26 | avg_flat_grad.set_shape(flat_grad.shape)
27 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0)
28 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v)
29 | for g, (_, v) in zip(avg_grads, grads_and_vars)]
30 |
31 | return avg_grads_and_vars
32 |
--------------------------------------------------------------------------------
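A minimal usage sketch of the `MpiAdamOptimizer` above (toy loss chosen here; run it under `mpirun` to actually average gradients across ranks, though a single process also works):

```python
import tensorflow as tf
from mpi4py import MPI
from baselines.common.mpi_adam_optimizer import MpiAdamOptimizer

# Each rank computes its own gradient; compute_gradients averages them over MPI.COMM_WORLD.
x = tf.Variable(1.0)
loss = tf.square(x - 3.0)
train_op = MpiAdamOptimizer(MPI.COMM_WORLD, learning_rate=1e-2).minimize(loss, var_list=[x])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(500):
        sess.run(train_op)
    print(sess.run(x))  # approximately 3.0
```
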
/baselines/common/running_stat.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | # http://www.johndcook.com/blog/standard_deviation/
4 | class RunningStat(object):
5 | def __init__(self, shape):
6 | self._n = 0
7 | self._M = np.zeros(shape)
8 | self._S = np.zeros(shape)
9 | def push(self, x):
10 | x = np.asarray(x)
11 | assert x.shape == self._M.shape
12 | self._n += 1
13 | if self._n == 1:
14 | self._M[...] = x
15 | else:
16 | oldM = self._M.copy()
17 | self._M[...] = oldM + (x - oldM)/self._n
18 | self._S[...] = self._S + (x - oldM)*(x - self._M)
19 | @property
20 | def n(self):
21 | return self._n
22 | @property
23 | def mean(self):
24 | return self._M
25 | @property
26 | def var(self):
27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M)
28 | @property
29 | def std(self):
30 | return np.sqrt(self.var)
31 | @property
32 | def shape(self):
33 | return self._M.shape
34 |
35 | def test_running_stat():
36 | for shp in ((), (3,), (3,4)):
37 | li = []
38 | rs = RunningStat(shp)
39 | for _ in range(5):
40 | val = np.random.randn(*shp)
41 | rs.push(val)
42 | li.append(val)
43 | m = np.mean(li, axis=0)
44 | assert np.allclose(rs.mean, m)
45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0)
46 | assert np.allclose(rs.var, v)
47 |
--------------------------------------------------------------------------------
/baselines/common/vec_env/util.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers for dealing with vectorized environments.
3 | """
4 |
5 | from collections import OrderedDict
6 |
7 | import gym
8 | import numpy as np
9 |
10 |
11 | def copy_obs_dict(obs):
12 | """
13 | Deep-copy an observation dict.
14 | """
15 | return {k: np.copy(v) for k, v in obs.items()}
16 |
17 |
18 | def dict_to_obs(obs_dict):
19 | """
20 | Convert an observation dict into a raw array if the
21 | original observation space was not a Dict space.
22 | """
23 | if set(obs_dict.keys()) == {None}:
24 | return obs_dict[None]
25 | return obs_dict
26 |
27 |
28 | def obs_space_info(obs_space):
29 | """
30 | Get dict-structured information about a gym.Space.
31 |
32 | Returns:
33 | A tuple (keys, shapes, dtypes):
34 | keys: a list of dict keys.
35 | shapes: a dict mapping keys to shapes.
36 | dtypes: a dict mapping keys to dtypes.
37 | """
38 | if isinstance(obs_space, gym.spaces.Dict):
39 | assert isinstance(obs_space.spaces, OrderedDict)
40 | subspaces = obs_space.spaces
41 | else:
42 | subspaces = {None: obs_space}
43 | keys = []
44 | shapes = {}
45 | dtypes = {}
46 | for key, box in subspaces.items():
47 | keys.append(key)
48 | shapes[key] = box.shape
49 | dtypes[key] = box.dtype
50 | return keys, shapes, dtypes
51 |
52 |
53 | def obs_to_dict(obs):
54 | """
55 | Convert an observation into a dict.
56 | """
57 | if isinstance(obs, dict):
58 | return obs
59 | return {None: obs}
60 |
--------------------------------------------------------------------------------
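A short example of the `obs_space_info` helper above on a Dict space (the space itself is made up for illustration):

```python
import numpy as np
import gym
from baselines.common.vec_env.util import obs_space_info

space = gym.spaces.Dict({
    'observation': gym.spaces.Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32),
    'desired_goal': gym.spaces.Box(low=-1.0, high=1.0, shape=(2,), dtype=np.float32),
})
keys, shapes, dtypes = obs_space_info(space)
print(keys)    # the subspace names, e.g. ['desired_goal', 'observation']
print(shapes)  # {'desired_goal': (2,), 'observation': (3,)}
```
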
/baselines/common/tests/test_doc_examples.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | try:
3 | import mujoco_py
4 | _mujoco_present = True
5 | except BaseException:
6 | mujoco_py = None
7 | _mujoco_present = False
8 |
9 |
10 | @pytest.mark.skipif(
11 | not _mujoco_present,
12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library'
13 | )
14 | def test_lstm_example():
15 | import tensorflow as tf
16 | from baselines.common import policies, models, cmd_util
17 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
18 |
19 | # create vectorized environment
20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])
21 |
22 | with tf.Session() as sess:
23 | # build policy based on lstm network with 128 units
24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)
25 |
26 | # initialize tensorflow variables
27 | sess.run(tf.global_variables_initializer())
28 |
29 | # prepare environment variables
30 | ob = venv.reset()
31 | state = policy.initial_state
32 | done = [False]
33 | step_counter = 0
34 |
35 | # run a single episode until the end (i.e. until done)
36 | while True:
37 | action, _, state, _ = policy.step(ob, S=state, M=done)
38 | ob, reward, done, _ = venv.step(action)
39 | step_counter += 1
40 | if done:
41 | break
42 |
43 |
44 | assert step_counter > 5
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | import sys
3 |
4 | if sys.version_info.major != 3:
5 | print('This Python is only compatible with Python 3, but you are running '
6 | 'Python {}. The installation will likely fail.'.format(sys.version_info.major))
7 |
8 |
9 | extras = {
10 | 'test': [
11 | 'filelock',
12 | 'pytest'
13 | ]
14 | }
15 |
16 |
17 | all_deps = []
18 | for group_name in extras:
19 | all_deps += extras[group_name]
20 |
21 | extras['all'] = all_deps
22 |
23 | setup(name='baselines',
24 | packages=[package for package in find_packages()
25 | if package.startswith('baselines')],
26 | install_requires=[
27 | 'gym[mujoco,atari,classic_control,robotics]',
28 | 'scipy',
29 | 'tqdm',
30 | 'joblib',
31 | 'dill',
32 | 'progressbar2',
33 | 'mpi4py',
34 | 'cloudpickle',
35 | 'click',
36 | 'opencv-python'
37 | ],
38 | extras_require=extras,
39 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms',
40 | author='OpenAI',
41 | url='https://github.com/openai/baselines',
42 | author_email='gym@openai.com',
43 | version='0.1.5')
44 |
45 |
46 | # ensure there is some tensorflow build with version above 1.4
47 | try:
48 | from distutils.version import StrictVersion
49 | import tensorflow
50 |     assert StrictVersion(tensorflow.__version__) >= StrictVersion('1.4.0'), 'TensorFlow 1.4.0 or newer is required'
51 | except ImportError:
52 |     assert False, "TensorFlow is required (version 1.4.0 or newer)"
53 |
--------------------------------------------------------------------------------
/baselines/common/tests/test_fixed_sequence.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv
3 |
4 | from baselines.common.tests.util import simple_test
5 | from baselines.run import get_learn_function
6 |
7 | common_kwargs = dict(
8 | seed=0,
9 | total_timesteps=50000,
10 | )
11 |
12 | learn_kwargs = {
13 | 'a2c': {},
14 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),
15 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps)
16 | # github issue: https://github.com/openai/baselines/issues/188
17 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001)
18 | }
19 |
20 |
21 | alg_list = learn_kwargs.keys()
22 | rnn_list = ['lstm']
23 |
24 | @pytest.mark.slow
25 | @pytest.mark.parametrize("alg", alg_list)
26 | @pytest.mark.parametrize("rnn", rnn_list)
27 | def test_fixed_sequence(alg, rnn):
28 | '''
29 |     Test if the algorithm (with a recurrent policy)
30 |     can learn a fixed sequence (i.e. a memorization task that requires the policy to carry state across steps)
31 | '''
32 |
33 | kwargs = learn_kwargs[alg]
34 | kwargs.update(common_kwargs)
35 |
36 | episode_len = 5
37 | env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len)
38 | learn = lambda e: get_learn_function(alg)(
39 | env=e,
40 | network=rnn,
41 | **kwargs
42 | )
43 |
44 | simple_test(env_fn, learn, 0.7)
45 |
46 |
47 | if __name__ == '__main__':
48 | test_fixed_sequence('ppo2', 'lstm')
49 |
50 |
51 |
52 |
--------------------------------------------------------------------------------
/baselines/common/vec_env/vec_normalize.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | from baselines.common.running_mean_std import RunningMeanStd
3 | import numpy as np
4 |
5 |
6 | class VecNormalize(VecEnvWrapper):
7 | """
8 |     A vectorized environment wrapper that normalizes the observations
9 |     and the rewards (using a running estimate of the discounted returns).
10 | """
11 |
12 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8):
13 | VecEnvWrapper.__init__(self, venv)
14 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
15 | self.ret_rms = RunningMeanStd(shape=()) if ret else None
16 | self.clipob = clipob
17 | self.cliprew = cliprew
18 | self.ret = np.zeros(self.num_envs)
19 | self.gamma = gamma
20 | self.epsilon = epsilon
21 |
22 | def step_wait(self):
23 | obs, rews, news, infos = self.venv.step_wait()
24 | self.ret = self.ret * self.gamma + rews
25 | obs = self._obfilt(obs)
26 | if self.ret_rms:
27 | self.ret_rms.update(self.ret)
28 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
29 | return obs, rews, news, infos
30 |
31 | def _obfilt(self, obs):
32 | if self.ob_rms:
33 | self.ob_rms.update(obs)
34 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
35 | return obs
36 | else:
37 | return obs
38 |
39 | def reset(self):
40 | obs = self.venv.reset()
41 | return self._obfilt(obs)
42 |
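A minimal sketch of wrapping a vectorized environment with `VecNormalize` (assuming `CartPole-v0` is available locally; the wrapper keeps running statistics as shown above):

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.common.vec_env.vec_normalize import VecNormalize

venv = VecNormalize(DummyVecEnv([lambda: gym.make('CartPole-v0')]))
obs = venv.reset()  # observations are normalized and clipped using the running stats
obs, rews, dones, infos = venv.step([venv.action_space.sample()])
```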
--------------------------------------------------------------------------------
/baselines/her/README.md:
--------------------------------------------------------------------------------
1 | # Hindsight Experience Replay
2 | For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/abs/1707.01495).
3 |
4 | ## How to use Hindsight Experience Replay
5 |
6 | ### Getting started
7 | Training an agent is very simple:
8 | ```bash
9 | python -m baselines.her.experiment.train
10 | ```
11 | This will train a DDPG+HER agent on the `FetchReach` environment.
12 | You should see the success rate go up quickly to `1.0`, which means that the agent achieves the
13 | desired goal in 100% of the cases.
14 | The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. its test success rate),
15 | the latest policy, and, if enabled, a history of policies every K epochs.
16 |
17 | To inspect what the agent has learned, use the play script:
18 | ```bash
19 | python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl
20 | ```
21 | You can try it right now with the results of the training step (the script prints out the path for you).
22 | This should visualize the current policy for 10 episodes and will also print statistics.
23 |
24 |
25 | ### Reproducing results
26 | In order to reproduce the results from [Plappert et al. (2018)](https://arxiv.org/abs/1802.09464), run the following command:
27 | ```bash
28 | python -m baselines.her.experiment.train --num_cpu 19
29 | ```
30 | This requires a machine with a sufficient number of physical CPU cores. In our experiments,
31 | we used [Azure's D15v2 instances](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes),
32 | which have 20 physical cores. We only scheduled the experiment on 19 of those to leave some headroom on the system.
33 |
--------------------------------------------------------------------------------
/baselines/deepq/README.md:
--------------------------------------------------------------------------------
1 | ## If you are curious.
2 |
3 | ##### Train a Cartpole agent and watch it play once it converges!
4 |
5 | Here's a list of commands to run to quickly get a working example:
6 |
7 |
8 |
9 |
10 | ```bash
11 | # Train model and save the results to cartpole_model.pkl
12 | python -m baselines.run --alg=deepq --env=CartPole-v0 --save_path=./cartpole_model.pkl --num_timesteps=1e5
13 | # Load the model saved in cartpole_model.pkl and visualize the learned policy
14 | python -m baselines.run --alg=deepq --env=CartPole-v0 --load_path=./cartpole_model.pkl --num_timesteps=0 --play
15 | ```
16 |
17 | ## If you wish to apply DQN to solve a problem.
18 |
19 | Check out our simple agent trained with the one-stop-shop `deepq.learn` function.
20 |
21 | - [baselines/deepq/experiments/train_cartpole.py](experiments/train_cartpole.py) - train a Cartpole agent.
22 |
23 | In particular, notice that once `deepq.learn` finishes training, it returns an `act` function which can be used to select actions in the environment. Once trained, you can easily save it and load it at a later time (see the sketch below). The complementary file `enjoy_cartpole.py` loads and visualizes the learned policy.
24 |
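As a rough sketch (assuming `deepq.learn` accepts a network name and `total_timesteps`, as in `experiments/run_atari.py` above; adjust to the signature in your checkout), the returned `act` function can be used like this:

```python
import gym
from baselines import deepq

env = gym.make("CartPole-v0")
act = deepq.learn(env, network='mlp', total_timesteps=100000)
act.save("cartpole_model.pkl")  # persist the trained policy

# act expects a batch of observations and returns a batch of actions
obs, done = env.reset(), False
while not done:
    obs, rew, done, _ = env.step(act(obs[None])[0])
```
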
25 | ## If you wish to experiment with the algorithm
26 |
27 | ##### Check out the examples
28 |
29 | - [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine-grained control over the internals of the DQN algorithm.
30 | - [baselines/deepq/defaults.py](defaults.py) - settings for training on atari. Run
31 |
32 | ```bash
33 | python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4
34 | ```
35 | to train on Atari Pong (see more in the repo-wide [README.md](../../README.md#training-models))
36 |
37 |
38 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/run_retro.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | from baselines import deepq
4 | from baselines.common import set_global_seeds
5 | from baselines import bench
6 | from baselines import logger
7 | from baselines.common import retro_wrappers
8 | import retro
9 |
10 |
11 | def main():
12 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
13 | parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes')
14 | parser.add_argument('--gamestate', help='game state to load', default='Level1-1')
15 | parser.add_argument('--seed', help='seed', type=int, default=0)
16 | parser.add_argument('--num-timesteps', type=int, default=int(10e6))
17 | args = parser.parse_args()
18 | logger.configure()
19 | set_global_seeds(args.seed)
20 | env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE)
21 | env.seed(args.seed)
22 | env = bench.Monitor(env, logger.get_dir())
23 | env = retro_wrappers.wrap_deepmind_retro(env)
24 |
25 | model = deepq.models.cnn_to_mlp(
26 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
27 | hiddens=[256],
28 | dueling=True
29 | )
30 | act = deepq.learn(
31 | env,
32 | q_func=model,
33 | lr=1e-4,
34 | max_timesteps=args.num_timesteps,
35 | buffer_size=10000,
36 | exploration_fraction=0.1,
37 | exploration_final_eps=0.01,
38 | train_freq=4,
39 | learning_starts=10000,
40 | target_network_update_freq=1000,
41 | gamma=0.99,
42 | prioritized_replay=True
43 | )
44 | act.save()
45 | env.close()
46 |
47 |
48 | if __name__ == '__main__':
49 | main()
50 |
--------------------------------------------------------------------------------
/baselines/ppo1/run_atari.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from mpi4py import MPI
4 | from baselines.common import set_global_seeds
5 | from baselines import bench
6 | import os.path as osp
7 | from baselines import logger
8 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind
9 | from baselines.common.cmd_util import atari_arg_parser
10 |
11 | def train(env_id, num_timesteps, seed):
12 | from baselines.ppo1 import pposgd_simple, cnn_policy
13 | import baselines.common.tf_util as U
14 | rank = MPI.COMM_WORLD.Get_rank()
15 | sess = U.single_threaded_session()
16 | sess.__enter__()
17 | if rank == 0:
18 | logger.configure()
19 | else:
20 | logger.configure(format_strs=[])
21 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
22 | set_global_seeds(workerseed)
23 | env = make_atari(env_id)
24 | def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
25 | return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
26 | env = bench.Monitor(env, logger.get_dir() and
27 | osp.join(logger.get_dir(), str(rank)))
28 | env.seed(workerseed)
29 |
30 | env = wrap_deepmind(env)
31 | env.seed(workerseed)
32 |
33 | pposgd_simple.learn(env, policy_fn,
34 | max_timesteps=int(num_timesteps * 1.1),
35 | timesteps_per_actorbatch=256,
36 | clip_param=0.2, entcoeff=0.01,
37 | optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
38 | gamma=0.99, lam=0.95,
39 | schedule='linear'
40 | )
41 | env.close()
42 |
43 | def main():
44 | args = atari_arg_parser().parse_args()
45 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
46 |
47 | if __name__ == '__main__':
48 | main()
49 |
--------------------------------------------------------------------------------
/baselines/common/tests/test_mnist.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | # from baselines.acer import acer_simple as acer
4 | from baselines.common.tests.envs.mnist_env import MnistEnv
5 | from baselines.common.tests.util import simple_test
6 | from baselines.run import get_learn_function
7 |
8 |
9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
10 | # GitHub issue https://github.com/openai/baselines/issues/189
11 | common_kwargs = {
12 | 'seed': 0,
13 | 'network':'cnn',
14 | 'gamma':0.9,
15 | 'pad':'SAME'
16 | }
17 |
18 | learn_args = {
19 | 'a2c': dict(total_timesteps=50000),
20 | # TODO need to resolve inference (step) API differences for acer; also slow
21 | # 'acer': dict(seed=0, total_timesteps=1000),
22 | 'deepq': dict(total_timesteps=5000),
23 | 'acktr': dict(total_timesteps=30000),
24 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0),
25 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001)
26 | }
27 |
28 |
29 | # Tests pass, but are too slow on Travis. The same algorithms are covered
30 | # by other tests with less compute-hungry networks and by benchmarks
31 | @pytest.mark.skip
32 | @pytest.mark.slow
33 | @pytest.mark.parametrize("alg", learn_args.keys())
34 | def test_mnist(alg):
35 | '''
36 | Test if the algorithm can learn to classify MNIST digits.
37 | Uses CNN policy.
38 | '''
39 |
40 | learn_kwargs = learn_args[alg]
41 | learn_kwargs.update(common_kwargs)
42 |
43 | learn = get_learn_function(alg)
44 | learn_fn = lambda e: learn(env=e, **learn_kwargs)
45 | env_fn = lambda: MnistEnv(seed=0, episode_len=100)
46 |
47 | simple_test(env_fn, learn_fn, 0.6)
48 |
49 | if __name__ == '__main__':
50 | test_mnist('deepq')
51 |
--------------------------------------------------------------------------------
/baselines/common/tests/test_identity.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv
3 | from baselines.run import get_learn_function
4 | from baselines.common.tests.util import simple_test
5 |
6 | common_kwargs = dict(
7 | total_timesteps=30000,
8 | network='mlp',
9 | gamma=0.9,
10 | seed=0,
11 | )
12 |
13 | learn_kwargs = {
14 | 'a2c' : {},
15 | 'acktr': {},
16 | 'deepq': {},
17 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0),
18 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01)
19 | }
20 |
21 |
22 | @pytest.mark.slow
23 | @pytest.mark.parametrize("alg", learn_kwargs.keys())
24 | def test_discrete_identity(alg):
25 | '''
26 | Test if the algorithm (with an mlp policy)
27 | can learn an identity transformation (i.e. return observation as an action)
28 | '''
29 |
30 | kwargs = learn_kwargs[alg]
31 | kwargs.update(common_kwargs)
32 |
33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
34 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100)
35 | simple_test(env_fn, learn_fn, 0.9)
36 |
37 | @pytest.mark.slow
38 | @pytest.mark.parametrize("alg", ['a2c', 'ppo2', 'trpo_mpi'])
39 | def test_continuous_identity(alg):
40 | '''
41 | Test if the algorithm (with an mlp policy)
42 | can learn an identity transformation (i.e. return observation as an action)
43 | to a required precision
44 | '''
45 |
46 | kwargs = learn_kwargs[alg]
47 | kwargs.update(common_kwargs)
48 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
49 |
50 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100)
51 | simple_test(env_fn, learn_fn, -0.1)
52 |
53 | if __name__ == '__main__':
54 | test_continuous_identity('a2c')
55 |
56 |
--------------------------------------------------------------------------------
/baselines/common/input.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from gym.spaces import Discrete, Box
3 |
4 | def observation_placeholder(ob_space, batch_size=None, name='Ob'):
5 | '''
6 |     Create a placeholder for feeding observations, sized appropriately for the observation space
7 |
8 | Parameters:
9 | ----------
10 |
11 | ob_space: gym.Space observation space
12 |
13 | batch_size: int size of the batch to be fed into input. Can be left None in most cases.
14 |
15 | name: str name of the placeholder
16 |
17 | Returns:
18 | -------
19 |
20 | tensorflow placeholder tensor
21 | '''
22 |
23 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \
24 | 'Can only deal with Discrete and Box observation spaces for now'
25 |
26 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name)
27 |
28 |
29 | def observation_input(ob_space, batch_size=None, name='Ob'):
30 | '''
31 |     Create a placeholder for feeding observations, sized appropriately for the observation space, and add an input
32 |     encoder of the appropriate type.
33 | '''
34 |
35 | placeholder = observation_placeholder(ob_space, batch_size, name)
36 | return placeholder, encode_observation(ob_space, placeholder)
37 |
38 | def encode_observation(ob_space, placeholder):
39 | '''
40 |     Encode the input in a way appropriate to the observation space
41 |
42 | Parameters:
43 | ----------
44 |
45 | ob_space: gym.Space observation space
46 |
47 | placeholder: tf.placeholder observation input placeholder
48 | '''
49 | if isinstance(ob_space, Discrete):
50 | return tf.to_float(tf.one_hot(placeholder, ob_space.n))
51 |
52 | elif isinstance(ob_space, Box):
53 | return tf.to_float(placeholder)
54 | else:
55 | raise NotImplementedError
56 |
57 |
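A short sketch of the functions above: a `Discrete` observation space is fed as integer indices and one-hot encoded, while a `Box` space is simply cast to float:

```python
import numpy as np
import tensorflow as tf
from gym.spaces import Discrete
from baselines.common.input import observation_input

ob_space = Discrete(4)
ph, encoded = observation_input(ob_space)  # ph: int placeholder (None,); encoded: float32 (None, 4)

with tf.Session() as sess:
    one_hot = sess.run(encoded, feed_dict={ph: np.array([0, 3])})
    # one_hot == [[1, 0, 0, 0], [0, 0, 0, 1]]
```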
--------------------------------------------------------------------------------
/baselines/common/tests/envs/identity_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from abc import abstractmethod
3 | from gym import Env
4 | from gym.spaces import Discrete, Box
5 |
6 |
7 | class IdentityEnv(Env):
8 | def __init__(
9 | self,
10 | episode_len=None
11 | ):
12 |
13 | self.episode_len = episode_len
14 | self.time = 0
15 | self.reset()
16 |
17 | def reset(self):
18 | self._choose_next_state()
19 | self.time = 0
20 | self.observation_space = self.action_space
21 |
22 | return self.state
23 |
24 | def step(self, actions):
25 | rew = self._get_reward(actions)
26 | self._choose_next_state()
27 | done = False
28 | if self.episode_len and self.time >= self.episode_len:
29 | rew = 0
30 | done = True
31 |
32 | return self.state, rew, done, {}
33 |
34 | def _choose_next_state(self):
35 | self.state = self.action_space.sample()
36 | self.time += 1
37 |
38 | @abstractmethod
39 | def _get_reward(self, actions):
40 | raise NotImplementedError
41 |
42 |
43 | class DiscreteIdentityEnv(IdentityEnv):
44 | def __init__(
45 | self,
46 | dim,
47 | episode_len=None,
48 | ):
49 |
50 | self.action_space = Discrete(dim)
51 | super().__init__(episode_len=episode_len)
52 |
53 | def _get_reward(self, actions):
54 | return 1 if self.state == actions else 0
55 |
56 |
57 | class BoxIdentityEnv(IdentityEnv):
58 | def __init__(
59 | self,
60 | shape,
61 | episode_len=None,
62 | ):
63 |
64 | self.action_space = Box(low=-1.0, high=1.0, shape=shape)
65 | super().__init__(episode_len=episode_len)
66 |
67 | def _get_reward(self, actions):
68 | diff = actions - self.state
69 |         diff = diff.ravel()  # flatten so the dot product below yields a scalar for any shape
70 | return -0.5 * np.dot(diff, diff)
71 |
--------------------------------------------------------------------------------
/baselines/gail/statistics.py:
--------------------------------------------------------------------------------
1 | '''
2 | This code is heavily based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py
3 | '''
4 |
5 | import tensorflow as tf
6 | import numpy as np
7 |
8 | import baselines.common.tf_util as U
9 |
10 |
11 | class stats():
12 |
13 | def __init__(self, scalar_keys=[], histogram_keys=[]):
14 | self.scalar_keys = scalar_keys
15 | self.histogram_keys = histogram_keys
16 | self.scalar_summaries = []
17 | self.scalar_summaries_ph = []
18 | self.histogram_summaries_ph = []
19 | self.histogram_summaries = []
20 | with tf.variable_scope('summary'):
21 | for k in scalar_keys:
22 | ph = tf.placeholder('float32', None, name=k+'.scalar.summary')
23 | sm = tf.summary.scalar(k+'.scalar.summary', ph)
24 | self.scalar_summaries_ph.append(ph)
25 | self.scalar_summaries.append(sm)
26 | for k in histogram_keys:
27 | ph = tf.placeholder('float32', None, name=k+'.histogram.summary')
28 |                 sm = tf.summary.histogram(k+'.histogram.summary', ph)
29 | self.histogram_summaries_ph.append(ph)
30 | self.histogram_summaries.append(sm)
31 |
32 | self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries)
33 |
34 | def add_all_summary(self, writer, values, iter):
35 |         # Note that the order of the incoming ```values``` should be the same as that of the
36 | # ```scalar_keys``` given in ```__init__```
37 | if np.sum(np.isnan(values)+0) != 0:
38 | return
39 | sess = U.get_session()
40 | keys = self.scalar_summaries_ph + self.histogram_summaries_ph
41 | feed_dict = {}
42 | for k, v in zip(keys, values):
43 | feed_dict.update({k: v})
44 | summaries_str = sess.run(self.summaries, feed_dict)
45 | writer.add_summary(summaries_str, iter)
46 |
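A minimal usage sketch of the `stats` helper above (the log directory is hypothetical); as noted in the code, values must be passed in the same order as the keys given to `__init__`:

```python
import tensorflow as tf
from baselines.gail.statistics import stats

logger_stats = stats(scalar_keys=['ep_reward', 'ep_length'])
writer = tf.summary.FileWriter('/tmp/gail_summaries')

with tf.Session():
    # order of values matches scalar_keys: ep_reward, ep_length
    logger_stats.add_all_summary(writer, [123.4, 200], iter=0)
```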
--------------------------------------------------------------------------------
/baselines/her/experiment/play.py:
--------------------------------------------------------------------------------
1 | import click
2 | import numpy as np
3 | import pickle
4 |
5 | from baselines import logger
6 | from baselines.common import set_global_seeds
7 | import baselines.her.experiment.config as config
8 | from baselines.her.rollout import RolloutWorker
9 |
10 |
11 | @click.command()
12 | @click.argument('policy_file', type=str)
13 | @click.option('--seed', type=int, default=0)
14 | @click.option('--n_test_rollouts', type=int, default=10)
15 | @click.option('--render', type=int, default=1)
16 | def main(policy_file, seed, n_test_rollouts, render):
17 | set_global_seeds(seed)
18 |
19 | # Load policy.
20 | with open(policy_file, 'rb') as f:
21 | policy = pickle.load(f)
22 | env_name = policy.info['env_name']
23 |
24 | # Prepare params.
25 | params = config.DEFAULT_PARAMS
26 | if env_name in config.DEFAULT_ENV_PARAMS:
27 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in
28 | params['env_name'] = env_name
29 | params = config.prepare_params(params)
30 | config.log_params(params, logger=logger)
31 |
32 | dims = config.configure_dims(params)
33 |
34 | eval_params = {
35 | 'exploit': True,
36 | 'use_target_net': params['test_with_polyak'],
37 | 'compute_Q': True,
38 | 'rollout_batch_size': 1,
39 | 'render': bool(render),
40 | }
41 |
42 | for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
43 | eval_params[name] = params[name]
44 |
45 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
46 | evaluator.seed(seed)
47 |
48 | # Run evaluation.
49 | evaluator.clear_history()
50 | for _ in range(n_test_rollouts):
51 | evaluator.generate_rollouts()
52 |
53 | # record logs
54 | for key, val in evaluator.logs('test'):
55 | logger.record_tabular(key, np.mean(val))
56 | logger.dump_tabular()
57 |
58 |
59 | if __name__ == '__main__':
60 | main()
61 |
--------------------------------------------------------------------------------
/baselines/deepq/experiments/run_atari.py:
--------------------------------------------------------------------------------
1 | from baselines import deepq
2 | from baselines.common import set_global_seeds
3 | from baselines import bench
4 | import argparse
5 | from baselines import logger
6 | from baselines.common.atari_wrappers import make_atari
7 |
8 |
9 | def main():
10 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
11 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
12 | parser.add_argument('--seed', help='RNG seed', type=int, default=0)
13 | parser.add_argument('--prioritized', type=int, default=1)
14 | parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
15 | parser.add_argument('--dueling', type=int, default=1)
16 | parser.add_argument('--num-timesteps', type=int, default=int(10e6))
17 | parser.add_argument('--checkpoint-freq', type=int, default=10000)
18 | parser.add_argument('--checkpoint-path', type=str, default=None)
19 |
20 | args = parser.parse_args()
21 | logger.configure()
22 | set_global_seeds(args.seed)
23 | env = make_atari(args.env)
24 | env = bench.Monitor(env, logger.get_dir())
25 | env = deepq.wrap_atari_dqn(env)
26 |
27 | deepq.learn(
28 | env,
29 | "conv_only",
30 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
31 | hiddens=[256],
32 | dueling=bool(args.dueling),
33 | lr=1e-4,
34 | total_timesteps=args.num_timesteps,
35 | buffer_size=10000,
36 | exploration_fraction=0.1,
37 | exploration_final_eps=0.01,
38 | train_freq=4,
39 | learning_starts=10000,
40 | target_network_update_freq=1000,
41 | gamma=0.99,
42 | prioritized_replay=bool(args.prioritized),
43 | prioritized_replay_alpha=args.prioritized_replay_alpha,
44 | checkpoint_freq=args.checkpoint_freq,
45 | checkpoint_path=args.checkpoint_path,
46 | )
47 |
48 | env.close()
49 |
50 |
51 | if __name__ == '__main__':
52 | main()
53 |
--------------------------------------------------------------------------------
/baselines/her/actor_critic.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from baselines.her.util import store_args, nn
3 |
4 |
5 | class ActorCritic:
6 | @store_args
7 | def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers,
8 | **kwargs):
9 | """The actor-critic network and related training code.
10 |
11 | Args:
12 | inputs_tf (dict of tensors): all necessary inputs for the network: the
13 | observation (o), the goal (g), and the action (u)
14 | dimo (int): the dimension of the observations
15 | dimg (int): the dimension of the goals
16 | dimu (int): the dimension of the actions
17 | max_u (float): the maximum magnitude of actions; action outputs will be scaled
18 | accordingly
19 | o_stats (baselines.her.Normalizer): normalizer for observations
20 | g_stats (baselines.her.Normalizer): normalizer for goals
21 | hidden (int): number of hidden units that should be used in hidden layers
22 | layers (int): number of hidden layers
23 | """
24 | self.o_tf = inputs_tf['o']
25 | self.g_tf = inputs_tf['g']
26 | self.u_tf = inputs_tf['u']
27 |
28 | # Prepare inputs for actor and critic.
29 | o = self.o_stats.normalize(self.o_tf)
30 | g = self.g_stats.normalize(self.g_tf)
31 | input_pi = tf.concat(axis=1, values=[o, g]) # for actor
32 |
33 | # Networks.
34 | with tf.variable_scope('pi'):
35 | self.pi_tf = self.max_u * tf.tanh(nn(
36 | input_pi, [self.hidden] * self.layers + [self.dimu]))
37 | with tf.variable_scope('Q'):
38 | # for policy training
39 | input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
40 | self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
41 | # for critic training
42 | input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
43 | self._input_Q = input_Q # exposed for tests
44 | self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
45 |
--------------------------------------------------------------------------------
/baselines/common/mpi_moments.py:
--------------------------------------------------------------------------------
1 | from mpi4py import MPI
2 | import numpy as np
3 | from baselines.common import zipsame
4 |
5 |
6 | def mpi_mean(x, axis=0, comm=None, keepdims=False):
7 | x = np.asarray(x)
8 | assert x.ndim > 0
9 | if comm is None: comm = MPI.COMM_WORLD
10 | xsum = x.sum(axis=axis, keepdims=keepdims)
11 | n = xsum.size
12 | localsum = np.zeros(n+1, x.dtype)
13 | localsum[:n] = xsum.ravel()
14 | localsum[n] = x.shape[axis]
15 | globalsum = np.zeros_like(localsum)
16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM)
17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
18 |
19 | def mpi_moments(x, axis=0, comm=None, keepdims=False):
20 | x = np.asarray(x)
21 | assert x.ndim > 0
22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True)
23 | sqdiffs = np.square(x - mean)
24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
25 | assert count1 == count
26 | std = np.sqrt(meansqdiff)
27 | if not keepdims:
28 | newshape = mean.shape[:axis] + mean.shape[axis+1:]
29 | mean = mean.reshape(newshape)
30 | std = std.reshape(newshape)
31 | return mean, std, count
32 |
33 |
34 | def test_runningmeanstd():
35 | import subprocess
36 | subprocess.check_call(['mpirun', '-np', '3',
37 | 'python','-c',
38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
39 |
40 | def _helper_runningmeanstd():
41 | comm = MPI.COMM_WORLD
42 | np.random.seed(0)
43 | for (triple,axis) in [
44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0),
45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0),
46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1),
47 | ]:
48 |
49 |
50 | x = np.concatenate(triple, axis=axis)
51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]]
52 |
53 |
54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis)
55 |
56 | for (a1,a2) in zipsame(ms1, ms2):
57 | print(a1, a2)
58 | assert np.allclose(a1, a2)
59 | print("ok!")
60 |
61 |
--------------------------------------------------------------------------------
/baselines/common/console_util.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from contextlib import contextmanager
3 | import numpy as np
4 | import time
5 | import shlex
6 | import subprocess
7 |
8 | # ================================================================
9 | # Misc
10 | # ================================================================
11 |
12 | def fmt_row(width, row, header=False):
13 | out = " | ".join(fmt_item(x, width) for x in row)
14 | if header: out = out + "\n" + "-"*len(out)
15 | return out
16 |
17 | def fmt_item(x, l):
18 | if isinstance(x, np.ndarray):
19 | assert x.ndim==0
20 | x = x.item()
21 | if isinstance(x, (float, np.float32, np.float64)):
22 | v = abs(x)
23 | if (v < 1e-4 or v > 1e+4) and v > 0:
24 | rep = "%7.2e" % x
25 | else:
26 | rep = "%7.5f" % x
27 | else: rep = str(x)
28 | return " "*(l - len(rep)) + rep
29 |
30 | color2num = dict(
31 | gray=30,
32 | red=31,
33 | green=32,
34 | yellow=33,
35 | blue=34,
36 | magenta=35,
37 | cyan=36,
38 | white=37,
39 | crimson=38
40 | )
41 |
42 | def colorize(string, color='green', bold=False, highlight=False):
43 | attr = []
44 | num = color2num[color]
45 | if highlight: num += 10
46 | attr.append(str(num))
47 | if bold: attr.append('1')
48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
49 |
50 | def print_cmd(cmd, dry=False):
51 | if isinstance(cmd, str): # for shell=True
52 | pass
53 | else:
54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd)
55 | print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd))
56 |
57 |
58 | def get_git_commit(cwd=None):
59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd).decode('utf8')
60 |
61 | def ccap(cmd, dry=False, env=None, **kwargs):
62 | print_cmd(cmd, dry)
63 | if not dry:
64 | subprocess.check_call(cmd, env=env, **kwargs)
65 |
66 |
67 | MESSAGE_DEPTH = 0
68 |
69 | @contextmanager
70 | def timed(msg):
71 | global MESSAGE_DEPTH #pylint: disable=W0603
72 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta'))
73 | tstart = time.time()
74 | MESSAGE_DEPTH += 1
75 | yield
76 | MESSAGE_DEPTH -= 1
77 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta'))
78 |
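For illustration, `fmt_row`/`fmt_item` above produce fixed-width table rows (very small or large floats switch to scientific notation), and `colorize` wraps a string in ANSI color codes:

```python
from baselines.common.console_util import fmt_row, colorize

print(fmt_row(10, ["iter", "loss", "kl"], header=True))
print(fmt_row(10, [1, 0.02531, 1.2e-05]))
print(colorize("training finished", color='green', bold=True))
```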
--------------------------------------------------------------------------------
/baselines/common/dataset.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class Dataset(object):
4 | def __init__(self, data_map, deterministic=False, shuffle=True):
5 | self.data_map = data_map
6 | self.deterministic = deterministic
7 | self.enable_shuffle = shuffle
8 | self.n = next(iter(data_map.values())).shape[0]
9 | self._next_id = 0
10 | self.shuffle()
11 |
12 | def shuffle(self):
13 | if self.deterministic:
14 | return
15 | perm = np.arange(self.n)
16 | np.random.shuffle(perm)
17 |
18 | for key in self.data_map:
19 | self.data_map[key] = self.data_map[key][perm]
20 |
21 | self._next_id = 0
22 |
23 | def next_batch(self, batch_size):
24 | if self._next_id >= self.n and self.enable_shuffle:
25 | self.shuffle()
26 |
27 | cur_id = self._next_id
28 | cur_batch_size = min(batch_size, self.n - self._next_id)
29 | self._next_id += cur_batch_size
30 |
31 | data_map = dict()
32 | for key in self.data_map:
33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size]
34 | return data_map
35 |
36 | def iterate_once(self, batch_size):
37 | if self.enable_shuffle: self.shuffle()
38 |
39 | while self._next_id <= self.n - batch_size:
40 | yield self.next_batch(batch_size)
41 | self._next_id = 0
42 |
43 | def subset(self, num_elements, deterministic=True):
44 | data_map = dict()
45 | for key in self.data_map:
46 | data_map[key] = self.data_map[key][:num_elements]
47 | return Dataset(data_map, deterministic)
48 |
49 |
50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True):
51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both'
52 | arrays = tuple(map(np.asarray, arrays))
53 | n = arrays[0].shape[0]
54 | assert all(a.shape[0] == n for a in arrays[1:])
55 | inds = np.arange(n)
56 | if shuffle: np.random.shuffle(inds)
57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
58 | for batch_inds in np.array_split(inds, sections):
59 | if include_final_partial_batch or len(batch_inds) == batch_size:
60 | yield tuple(a[batch_inds] for a in arrays)
61 |
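A short sketch of `iterbatches` above, which (optionally) shuffles the arrays jointly and yields aligned mini-batches:

```python
import numpy as np
from baselines.common.dataset import iterbatches

xs = np.arange(10)
ys = xs ** 2
for x_batch, y_batch in iterbatches((xs, ys), batch_size=4, shuffle=False):
    # batches of 4, 4 and 2 (the final partial batch is included by default)
    print(x_batch, y_batch)
```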
--------------------------------------------------------------------------------
/baselines/common/tests/envs/mnist_env.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import numpy as np
3 | import tempfile
4 | import filelock
5 | from gym import Env
6 | from gym.spaces import Discrete, Box
7 |
8 |
9 |
10 | class MnistEnv(Env):
11 | def __init__(
12 | self,
13 | seed=0,
14 | episode_len=None,
15 | no_images=None
16 | ):
17 | from tensorflow.examples.tutorials.mnist import input_data
18 |         # we could use a temporary directory for this with a context manager and
19 |         # TemporaryDirectory, but then each test that uses MNIST would re-download the data;
20 |         # this way the data is not cleaned up, but we only download it once per machine
21 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')
22 | with filelock.FileLock(mnist_path + '.lock'):
23 | self.mnist = input_data.read_data_sets(mnist_path)
24 |
25 | self.np_random = np.random.RandomState()
26 | self.np_random.seed(seed)
27 |
28 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1))
29 | self.action_space = Discrete(10)
30 | self.episode_len = episode_len
31 | self.time = 0
32 | self.no_images = no_images
33 |
34 | self.train_mode()
35 | self.reset()
36 |
37 | def reset(self):
38 | self._choose_next_state()
39 | self.time = 0
40 |
41 | return self.state[0]
42 |
43 | def step(self, actions):
44 | rew = self._get_reward(actions)
45 | self._choose_next_state()
46 | done = False
47 | if self.episode_len and self.time >= self.episode_len:
48 | rew = 0
49 | done = True
50 |
51 | return self.state[0], rew, done, {}
52 |
53 | def train_mode(self):
54 | self.dataset = self.mnist.train
55 |
56 | def test_mode(self):
57 | self.dataset = self.mnist.test
58 |
59 | def _choose_next_state(self):
60 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1
61 |         index = self.np_random.randint(0, max_index + 1)  # inclusive upper bound so the last image can be sampled
62 | image = self.dataset.images[index].reshape(28,28,1)*255
63 | label = self.dataset.labels[index]
64 | self.state = (image, label)
65 | self.time += 1
66 |
67 | def _get_reward(self, actions):
68 | return 1 if self.state[1] == actions else 0
69 |
70 |
71 |
--------------------------------------------------------------------------------
/baselines/ddpg/noise.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class AdaptiveParamNoiseSpec(object):
5 | def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01):
6 | self.initial_stddev = initial_stddev
7 | self.desired_action_stddev = desired_action_stddev
8 | self.adoption_coefficient = adoption_coefficient
9 |
10 | self.current_stddev = initial_stddev
11 |
12 | def adapt(self, distance):
13 | if distance > self.desired_action_stddev:
14 | # Decrease stddev.
15 | self.current_stddev /= self.adoption_coefficient
16 | else:
17 | # Increase stddev.
18 | self.current_stddev *= self.adoption_coefficient
19 |
20 | def get_stats(self):
21 | stats = {
22 | 'param_noise_stddev': self.current_stddev,
23 | }
24 | return stats
25 |
26 | def __repr__(self):
27 | fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})'
28 | return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient)
29 |
30 |
31 | class ActionNoise(object):
32 | def reset(self):
33 | pass
34 |
35 |
36 | class NormalActionNoise(ActionNoise):
37 | def __init__(self, mu, sigma):
38 | self.mu = mu
39 | self.sigma = sigma
40 |
41 | def __call__(self):
42 | return np.random.normal(self.mu, self.sigma)
43 |
44 | def __repr__(self):
45 | return 'NormalActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)
46 |
47 |
48 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
49 | class OrnsteinUhlenbeckActionNoise(ActionNoise):
50 | def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None):
51 | self.theta = theta
52 | self.mu = mu
53 | self.sigma = sigma
54 | self.dt = dt
55 | self.x0 = x0
56 | self.reset()
57 |
58 | def __call__(self):
59 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
60 | self.x_prev = x
61 | return x
62 |
63 | def reset(self):
64 | self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)
65 |
66 | def __repr__(self):
67 | return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)
68 |
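A small sketch of the two action-noise classes above; the Ornstein-Uhlenbeck process produces temporally correlated noise, which is commonly added to DDPG actions during exploration:

```python
import numpy as np
from baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

nb_actions = 2
gaussian = NormalActionNoise(mu=np.zeros(nb_actions), sigma=0.2 * np.ones(nb_actions))
ou = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=0.2 * np.ones(nb_actions))

action = np.zeros(nb_actions)
noisy_action = np.clip(action + ou(), -1.0, 1.0)  # successive ou() samples are correlated
ou.reset()  # reset the process at episode boundaries
```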
--------------------------------------------------------------------------------
/baselines/common/math_util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.signal
3 |
4 |
5 | def discount(x, gamma):
6 | """
7 | computes discounted sums along 0th dimension of x.
8 |
9 | inputs
10 | ------
11 | x: ndarray
12 | gamma: float
13 |
14 | outputs
15 | -------
16 | y: ndarray with same shape as x, satisfying
17 |
18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
19 | where k = len(x) - t - 1
20 |
21 | """
22 | assert x.ndim >= 1
23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1]
24 |
25 | def explained_variance(ypred,y):
26 | """
27 | Computes fraction of variance that ypred explains about y.
28 | Returns 1 - Var[y-ypred] / Var[y]
29 |
30 | interpretation:
31 | ev=0 => might as well have predicted zero
32 | ev=1 => perfect prediction
33 | ev<0 => worse than just predicting zero
34 |
35 | """
36 | assert y.ndim == 1 and ypred.ndim == 1
37 | vary = np.var(y)
38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary
39 |
40 | def explained_variance_2d(ypred, y):
41 | assert y.ndim == 2 and ypred.ndim == 2
42 | vary = np.var(y, axis=0)
43 |     out = 1 - np.var(y-ypred, axis=0)/vary
44 | out[vary < 1e-10] = 0
45 | return out
46 |
47 | def ncc(ypred, y):
48 | return np.corrcoef(ypred, y)[1,0]
49 |
50 | def flatten_arrays(arrs):
51 | return np.concatenate([arr.flat for arr in arrs])
52 |
53 | def unflatten_vector(vec, shapes):
54 | i=0
55 | arrs = []
56 | for shape in shapes:
57 | size = np.prod(shape)
58 | arr = vec[i:i+size].reshape(shape)
59 | arrs.append(arr)
60 | i += size
61 | return arrs
62 |
63 | def discount_with_boundaries(X, New, gamma):
64 | """
65 | X: 2d array of floats, time x features
66 | New: 2d array of bools, indicating when a new episode has started
67 | """
68 | Y = np.zeros_like(X)
69 | T = X.shape[0]
70 | Y[T-1] = X[T-1]
71 | for t in range(T-2, -1, -1):
72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1])
73 | return Y
74 |
75 | def test_discount_with_boundaries():
76 | gamma=0.9
77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
78 | starts = [1.0, 0.0, 0.0, 1.0]
79 | y = discount_with_boundaries(x, starts, gamma)
80 | assert np.allclose(y, [
81 | 1 + gamma * 2 + gamma**2 * 3,
82 | 2 + gamma * 3,
83 | 3,
84 | 4
85 | ])
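A worked example of `discount` and `explained_variance`, consistent with the definitions above:

```python
import numpy as np
from baselines.common.math_util import discount, explained_variance

x = np.array([1.0, 1.0, 1.0])
# y[t] = x[t] + gamma*x[t+1] + ...; with gamma=0.5 this gives [1.75, 1.5, 1.0]
print(discount(x, 0.5))

y = np.array([1.0, 2.0, 3.0, 4.0])
print(explained_variance(y, y))            # 1.0: perfect prediction
print(explained_variance(np.zeros(4), y))  # 0.0: no better than predicting zero
```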
--------------------------------------------------------------------------------
/baselines/ppo1/cnn_policy.py:
--------------------------------------------------------------------------------
1 | import baselines.common.tf_util as U
2 | import tensorflow as tf
3 | import gym
4 | from baselines.common.distributions import make_pdtype
5 |
6 | class CnnPolicy(object):
7 | recurrent = False
8 | def __init__(self, name, ob_space, ac_space, kind='large'):
9 | with tf.variable_scope(name):
10 | self._init(ob_space, ac_space, kind)
11 | self.scope = tf.get_variable_scope().name
12 |
13 | def _init(self, ob_space, ac_space, kind):
14 | assert isinstance(ob_space, gym.spaces.Box)
15 |
16 | self.pdtype = pdtype = make_pdtype(ac_space)
17 | sequence_length = None
18 |
19 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
20 |
21 | x = ob / 255.0
22 | if kind == 'small': # from A3C paper
23 | x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
24 | x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
25 | x = U.flattenallbut0(x)
26 | x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
27 | elif kind == 'large': # Nature DQN
28 | x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
29 | x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
30 | x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
31 | x = U.flattenallbut0(x)
32 | x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
33 | else:
34 | raise NotImplementedError
35 |
36 | logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
37 | self.pd = pdtype.pdfromflat(logits)
38 | self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0]
39 |
40 | self.state_in = []
41 | self.state_out = []
42 |
43 | stochastic = tf.placeholder(dtype=tf.bool, shape=())
44 | ac = self.pd.sample() # XXX
45 | self._act = U.function([stochastic, ob], [ac, self.vpred])
46 |
47 | def act(self, stochastic, ob):
48 | ac1, vpred1 = self._act(stochastic, ob[None])
49 | return ac1[0], vpred1[0]
50 | def get_variables(self):
51 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
52 | def get_trainable_variables(self):
53 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
54 | def get_initial_state(self):
55 | return []
56 |
57 |
--------------------------------------------------------------------------------
/baselines/common/vec_env/dummy_vec_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import spaces
3 | from . import VecEnv
4 | from .util import copy_obs_dict, dict_to_obs, obs_space_info
5 |
6 | class DummyVecEnv(VecEnv):
7 | def __init__(self, env_fns):
8 | self.envs = [fn() for fn in env_fns]
9 | env = self.envs[0]
10 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
11 | obs_space = env.observation_space
12 |
13 | self.keys, shapes, dtypes = obs_space_info(obs_space)
14 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
15 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
16 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
17 | self.buf_infos = [{} for _ in range(self.num_envs)]
18 | self.actions = None
19 |
20 | def step_async(self, actions):
21 | listify = True
22 | try:
23 | if len(actions) == self.num_envs:
24 | listify = False
25 | except TypeError:
26 | pass
27 |
28 | if not listify:
29 | self.actions = actions
30 | else:
31 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs)
32 | self.actions = [actions]
33 |
34 | def step_wait(self):
35 | for e in range(self.num_envs):
36 | action = self.actions[e]
37 | if isinstance(self.envs[e].action_space, spaces.Discrete):
38 | action = int(action)
39 |
40 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action)
41 | if self.buf_dones[e]:
42 | obs = self.envs[e].reset()
43 | self._save_obs(e, obs)
44 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
45 | self.buf_infos.copy())
46 |
47 | def reset(self):
48 | for e in range(self.num_envs):
49 | obs = self.envs[e].reset()
50 | self._save_obs(e, obs)
51 | return self._obs_from_buf()
52 |
53 | def close(self):
54 | return
55 |
56 | def _save_obs(self, e, obs):
57 | for k in self.keys:
58 | if k is None:
59 | self.buf_obs[k][e] = obs
60 | else:
61 | self.buf_obs[k][e] = obs[k]
62 |
63 | def _obs_from_buf(self):
64 | return dict_to_obs(copy_obs_dict(self.buf_obs))
65 |
66 | def get_images(self):
67 | return [env.render(mode='rgb_array') for env in self.envs]
68 |
69 |
--------------------------------------------------------------------------------
/baselines/ppo1/run_humanoid.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
4 | from baselines.common import tf_util as U
5 | from baselines import logger
6 |
7 | import gym
8 |
9 | def train(num_timesteps, seed, model_path=None):
10 | env_id = 'Humanoid-v2'
11 | from baselines.ppo1 import mlp_policy, pposgd_simple
12 | U.make_session(num_cpu=1).__enter__()
13 | def policy_fn(name, ob_space, ac_space):
14 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
15 | hid_size=64, num_hid_layers=2)
16 | env = make_mujoco_env(env_id, seed)
17 |
18 |     # The parameters below were the best found in a simple random search;
19 |     # they are good enough to make the humanoid walk, but whether they are
20 |     # optimal is not certain
21 | env = RewScale(env, 0.1)
22 | pi = pposgd_simple.learn(env, policy_fn,
23 | max_timesteps=num_timesteps,
24 | timesteps_per_actorbatch=2048,
25 | clip_param=0.2, entcoeff=0.0,
26 | optim_epochs=10,
27 | optim_stepsize=3e-4,
28 | optim_batchsize=64,
29 | gamma=0.99,
30 | lam=0.95,
31 | schedule='linear',
32 | )
33 | env.close()
34 | if model_path:
35 | U.save_state(model_path)
36 |
37 | return pi
38 |
39 | class RewScale(gym.RewardWrapper):
40 | def __init__(self, env, scale):
41 | gym.RewardWrapper.__init__(self, env)
42 | self.scale = scale
43 | def reward(self, r):
44 | return r * self.scale
45 |
46 | def main():
47 | logger.configure()
48 | parser = mujoco_arg_parser()
49 | parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
50 | parser.set_defaults(num_timesteps=int(2e7))
51 |
52 | args = parser.parse_args()
53 |
54 | if not args.play:
55 | # train the model
56 | train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
57 | else:
58 | # construct the model object, load pre-trained model and render
59 | pi = train(num_timesteps=1, seed=args.seed)
60 | U.load_state(args.model_path)
61 | env = make_mujoco_env('Humanoid-v2', seed=0)
62 |
63 | ob = env.reset()
64 | while True:
65 | action = pi.act(stochastic=False, ob=ob)[0]
66 | ob, _, done, _ = env.step(action)
67 | env.render()
68 | if done:
69 | ob = env.reset()
70 |
71 |
72 |
73 |
74 | if __name__ == '__main__':
75 | main()
76 |
--------------------------------------------------------------------------------
/baselines/ddpg/models.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow.contrib as tc
3 |
4 |
5 | class Model(object):
6 | def __init__(self, name):
7 | self.name = name
8 |
9 | @property
10 | def vars(self):
11 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
12 |
13 | @property
14 | def trainable_vars(self):
15 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name)
16 |
17 | @property
18 | def perturbable_vars(self):
19 | return [var for var in self.trainable_vars if 'LayerNorm' not in var.name]
20 |
21 |
22 | class Actor(Model):
23 | def __init__(self, nb_actions, name='actor', layer_norm=True):
24 | super(Actor, self).__init__(name=name)
25 | self.nb_actions = nb_actions
26 | self.layer_norm = layer_norm
27 |
28 | def __call__(self, obs, reuse=False):
29 | with tf.variable_scope(self.name) as scope:
30 | if reuse:
31 | scope.reuse_variables()
32 |
33 | x = obs
34 | x = tf.layers.dense(x, 64)
35 | if self.layer_norm:
36 | x = tc.layers.layer_norm(x, center=True, scale=True)
37 | x = tf.nn.relu(x)
38 |
39 | x = tf.layers.dense(x, 64)
40 | if self.layer_norm:
41 | x = tc.layers.layer_norm(x, center=True, scale=True)
42 | x = tf.nn.relu(x)
43 |
44 | x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
45 | x = tf.nn.tanh(x)
46 | return x
47 |
48 |
49 | class Critic(Model):
50 | def __init__(self, name='critic', layer_norm=True):
51 | super(Critic, self).__init__(name=name)
52 | self.layer_norm = layer_norm
53 |
54 | def __call__(self, obs, action, reuse=False):
55 | with tf.variable_scope(self.name) as scope:
56 | if reuse:
57 | scope.reuse_variables()
58 |
59 | x = obs
60 | x = tf.layers.dense(x, 64)
61 | if self.layer_norm:
62 | x = tc.layers.layer_norm(x, center=True, scale=True)
63 | x = tf.nn.relu(x)
64 |
65 | x = tf.concat([x, action], axis=-1)
66 | x = tf.layers.dense(x, 64)
67 | if self.layer_norm:
68 | x = tc.layers.layer_norm(x, center=True, scale=True)
69 | x = tf.nn.relu(x)
70 |
71 | x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
72 | return x
73 |
74 | @property
75 | def output_vars(self):
76 | output_vars = [var for var in self.trainable_vars if 'output' in var.name]
77 | return output_vars
78 |
--------------------------------------------------------------------------------
/baselines/gail/result/gail-result.md:
--------------------------------------------------------------------------------
1 | # Results of GAIL/BC on Mujoco
2 |
3 | Here are the extensive experimental results of applying GAIL/BC to Mujoco environments, including
4 | Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, and HumanoidStandup-v1. Every imitator is evaluated with seed 0.
5 |
6 | ## Results
7 |
8 | ### Training through iterations
9 |
10 | - Hopper-v1
11 |
12 |
13 | - HalfCheetah-v1
14 |
15 |
16 | - Walker2d-v1
17 |
18 |
19 | - Humanoid-v1
20 |
21 |
22 | - HumanoidStandup-v1
23 |
24 |
25 | For details (e.g., adversarial loss, discriminator accuracy, etc.) about GAIL training, please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing)
26 |
27 | ### Deterministic Policy (Set std=0)
28 | | | Un-normalized | Normalized |
29 | |---|---|---|
30 | | Hopper-v1 | | |
31 | | HalfCheetah-v1 | | |
32 | | Walker2d-v1 | | |
33 | | Humanoid-v1 | | |
34 | | HumanoidStandup-v1 | | |
35 |
36 | ### Stochastic Policy
37 | | | Un-normalized | Normalized |
38 | |---|---|---|
39 | | Hopper-v1 | | |
40 | | HalfCheetah-v1 | | |
41 | | Walker2d-v1 | | |
42 | | Humanoid-v1 | | |
43 | | HumanoidStandup-v1 | | |
44 |
45 | ### Details about the GAIL imitator
46 |
47 | For all environments, the imitator is trained with 1, 5, 10, and 50 trajectories
48 | (using seeds 0, 1, 2, and 3, respectively), where each trajectory contains at most
49 | 1024 transitions.
50 |
51 | ### Details about the BC imitators
52 |
53 | All BC imitators are trained with seed 0.
54 |
--------------------------------------------------------------------------------
/baselines/acer/runner.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from baselines.common.runners import AbstractEnvRunner
3 |
4 | class Runner(AbstractEnvRunner):
5 |
6 | def __init__(self, env, model, nsteps, nstack):
7 | super().__init__(env=env, model=model, nsteps=nsteps)
8 | self.nstack = nstack
9 | nh, nw, nc = env.observation_space.shape
10 | self.nc = nc # nc = 1 for atari, but just in case
11 | self.nact = env.action_space.n
12 | nenv = self.nenv
13 | self.nbatch = nenv * nsteps
14 | self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack)
15 | self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)
16 | obs = env.reset()
17 | self.update_obs(obs)
18 |
19 | def update_obs(self, obs, dones=None):
20 | #self.obs = obs
21 | if dones is not None:
22 | self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None]
23 | self.obs = np.roll(self.obs, shift=-self.nc, axis=3)
24 | self.obs[:, :, :, -self.nc:] = obs[:, :, :, :]
25 |
26 | def run(self):
27 | enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps
28 | mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
29 | for _ in range(self.nsteps):
30 | actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones)
31 | mb_obs.append(np.copy(self.obs))
32 | mb_actions.append(actions)
33 | mb_mus.append(mus)
34 | mb_dones.append(self.dones)
35 | obs, rewards, dones, _ = self.env.step(actions)
36 |             # state information for stateful models like LSTM
37 | self.states = states
38 | self.dones = dones
39 | self.update_obs(obs, dones)
40 | mb_rewards.append(rewards)
41 | enc_obs.append(obs)
42 | mb_obs.append(np.copy(self.obs))
43 | mb_dones.append(self.dones)
44 |
45 | enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0)
46 | mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
47 | mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
48 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
49 | mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
50 |
51 |         mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
52 | 
53 |         mb_masks = mb_dones # Used for stateful models like LSTMs to mask state when done
54 | mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
55 |
56 | # shapes are now [nenv, nsteps, []]
57 | # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
58 |
59 | return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
60 |
61 |
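The frame-stacking update in `Runner.update_obs` above can be hard to parse at a glance. Below is a minimal, self-contained sketch (with made-up shapes, not part of the ACER code) showing how rolling the stacked buffer by `-nc` along the channel axis and overwriting the last `nc` channels keeps the most recent `nstack` frames:

```python
import numpy as np

nenv, nh, nw, nc, nstack = 2, 4, 4, 1, 4
stacked = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8)

def push_frame(stacked, frame):
    stacked = np.roll(stacked, shift=-nc, axis=3)   # drop the oldest frame
    stacked[:, :, :, -nc:] = frame                  # write the newest frame
    return stacked

for t in range(5):
    frame = np.full((nenv, nh, nw, nc), t, dtype=np.uint8)
    stacked = push_frame(stacked, frame)

print(stacked[0, 0, 0])  # last nstack frames, oldest first: [1 2 3 4]
```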
--------------------------------------------------------------------------------
/baselines/acktr/value_functions.py:
--------------------------------------------------------------------------------
1 | from baselines import logger
2 | import numpy as np
3 | import baselines.common as common
4 | from baselines.common import tf_util as U
5 | import tensorflow as tf
6 | from baselines.acktr import kfac
7 | from baselines.acktr.utils import dense
8 |
9 | class NeuralNetValueFunction(object):
10 | def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613
11 | X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations
12 | vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg')
13 | wd_dict = {}
14 | h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
15 | h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict))
16 | vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0]
17 | sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n))
18 | wd_loss = tf.get_collection("vf_losses", None)
19 | loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss)
20 | loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n)))
21 | self._predict = U.function([X], vpred_n)
22 | optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \
23 | clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \
24 | async_=1, kfac_update=2, cold_iter=50, \
25 | weight_decay_dict=wd_dict, max_grad_norm=None)
26 | vf_var_list = []
27 | for var in tf.trainable_variables():
28 | if "vf" in var.name:
29 | vf_var_list.append(var)
30 |
31 | update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list)
32 | self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101
33 | U.initialize() # Initialize uninitialized TF variables
34 | def _preproc(self, path):
35 | l = pathlength(path)
36 | al = np.arange(l).reshape(-1,1)/10.0
37 | act = path["action_dist"].astype('float32')
38 | X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1)
39 | return X
40 | def predict(self, path):
41 | return self._predict(self._preproc(path))
42 | def fit(self, paths, targvals):
43 | X = np.concatenate([self._preproc(p) for p in paths])
44 | y = np.concatenate(targvals)
45 | logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y))
46 | for _ in range(25): self.do_update(X, y)
47 | logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y))
48 |
49 | def pathlength(path):
50 | return path["reward"].shape[0]
51 |
--------------------------------------------------------------------------------
/baselines/a2c/runner.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from baselines.a2c.utils import discount_with_dones
3 | from baselines.common.runners import AbstractEnvRunner
4 |
5 | class Runner(AbstractEnvRunner):
6 |
7 | def __init__(self, env, model, nsteps=5, gamma=0.99):
8 | super().__init__(env=env, model=model, nsteps=nsteps)
9 | self.gamma = gamma
10 | self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()]
11 | self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype
12 |
13 | def run(self):
14 | mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
15 | mb_states = self.states
16 | for n in range(self.nsteps):
17 | actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
18 | mb_obs.append(np.copy(self.obs))
19 | mb_actions.append(actions)
20 | mb_values.append(values)
21 | mb_dones.append(self.dones)
22 | obs, rewards, dones, _ = self.env.step(actions)
23 | self.states = states
24 | self.dones = dones
25 | for n, done in enumerate(dones):
26 | if done:
27 | self.obs[n] = self.obs[n]*0
28 | self.obs = obs
29 | mb_rewards.append(rewards)
30 | mb_dones.append(self.dones)
31 | #batch of steps to batch of rollouts
32 |
33 | mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
34 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
35 | mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
36 | mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
37 |         mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
38 | mb_masks = mb_dones[:, :-1]
39 | mb_dones = mb_dones[:, 1:]
40 |
41 |
42 | if self.gamma > 0.0:
43 | #discount/bootstrap off value fn
44 | last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
45 | for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
46 | rewards = rewards.tolist()
47 | dones = dones.tolist()
48 | if dones[-1] == 0:
49 | rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
50 | else:
51 | rewards = discount_with_dones(rewards, dones, self.gamma)
52 |
53 | mb_rewards[n] = rewards
54 |
55 | mb_actions = mb_actions.reshape(self.batch_action_shape)
56 |
57 | mb_rewards = mb_rewards.flatten()
58 | mb_values = mb_values.flatten()
59 | mb_masks = mb_masks.flatten()
60 | return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
61 |
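For intuition about the `discount_with_dones` call above, here is a tiny standalone sketch of the same backward-discounting idea (an illustrative re-implementation, not the baselines function itself): rewards are accumulated backwards with factor `gamma`, and a done flag cuts the return off at episode boundaries.

```python
def discounted_returns(rewards, dones, gamma):
    ret, out = 0.0, []
    for r, d in zip(reversed(rewards), reversed(dones)):
        ret = r + gamma * ret * (1.0 - d)   # reset the running return when done
        out.append(ret)
    return out[::-1]

# A 3-step rollout whose episode terminates at the second step:
print(discounted_returns([1.0, 1.0, 1.0], [0, 1, 0], gamma=0.99))
# -> [1.99, 1.0, 1.0]
```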
--------------------------------------------------------------------------------
/baselines/deepq/utils.py:
--------------------------------------------------------------------------------
1 | from baselines.common.input import observation_input
2 | from baselines.common.tf_util import adjust_shape
3 |
4 | import tensorflow as tf
5 |
6 | # ================================================================
7 | # Placeholders
8 | # ================================================================
9 |
10 |
11 | class TfInput(object):
12 | def __init__(self, name="(unnamed)"):
13 | """Generalized Tensorflow placeholder. The main differences are:
14 | - possibly uses multiple placeholders internally and returns multiple values
15 |         - can apply light postprocessing to the value fed to the placeholder.
16 | """
17 | self.name = name
18 |
19 | def get(self):
20 | """Return the tf variable(s) representing the possibly postprocessed value
21 | of placeholder(s).
22 | """
23 |         raise NotImplementedError()
24 |
25 |     def make_feed_dict(self, data):
26 |         """Given data, build a feed dict for the placeholder(s)."""
27 |         raise NotImplementedError()
28 |
29 |
30 | class PlaceholderTfInput(TfInput):
31 | def __init__(self, placeholder):
32 | """Wrapper for regular tensorflow placeholder."""
33 | super().__init__(placeholder.name)
34 | self._placeholder = placeholder
35 |
36 | def get(self):
37 | return self._placeholder
38 |
39 | def make_feed_dict(self, data):
40 | return {self._placeholder: adjust_shape(self._placeholder, data)}
41 |
42 |
43 | class Uint8Input(PlaceholderTfInput):
44 | def __init__(self, shape, name=None):
45 | """Takes input in uint8 format which is cast to float32 and divided by 255
46 | before passing it to the model.
47 |
48 | On GPU this ensures lower data transfer times.
49 |
50 | Parameters
51 | ----------
52 | shape: [int]
53 | shape of the tensor.
54 | name: str
55 | name of the underlying placeholder
56 | """
57 |
58 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name))
59 | self._shape = shape
60 | self._output = tf.cast(super().get(), tf.float32) / 255.0
61 |
62 | def get(self):
63 | return self._output
64 |
65 |
66 | class ObservationInput(PlaceholderTfInput):
67 | def __init__(self, observation_space, name=None):
68 | """Creates an input placeholder tailored to a specific observation space
69 |
70 | Parameters
71 | ----------
72 |
73 | observation_space:
74 | observation space of the environment. Should be one of the gym.spaces types
75 | name: str
76 | tensorflow name of the underlying placeholder
77 | """
78 | inpt, self.processed_inpt = observation_input(observation_space, name=name)
79 | super().__init__(inpt)
80 |
81 | def get(self):
82 | return self.processed_inpt
83 |
84 |
85 |
--------------------------------------------------------------------------------
/baselines/her/her.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def make_sample_her_transitions(replay_strategy, replay_k, reward_fun):
5 | """Creates a sample function that can be used for HER experience replay.
6 |
7 | Args:
8 | replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none',
9 | regular DDPG experience replay is used
10 | replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times
11 | as many HER replays as regular replays are used)
12 | reward_fun (function): function to re-compute the reward with substituted goals
13 | """
14 | if replay_strategy == 'future':
15 | future_p = 1 - (1. / (1 + replay_k))
16 | else: # 'replay_strategy' == 'none'
17 | future_p = 0
18 |
19 | def _sample_her_transitions(episode_batch, batch_size_in_transitions):
20 | """episode_batch is {key: array(buffer_size x T x dim_key)}
21 | """
22 | T = episode_batch['u'].shape[1]
23 | rollout_batch_size = episode_batch['u'].shape[0]
24 | batch_size = batch_size_in_transitions
25 |
26 | # Select which episodes and time steps to use.
27 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
28 | t_samples = np.random.randint(T, size=batch_size)
29 | transitions = {key: episode_batch[key][episode_idxs, t_samples].copy()
30 | for key in episode_batch.keys()}
31 |
32 | # Select future time indexes proportional with probability future_p. These
33 | # will be used for HER replay by substituting in future goals.
34 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p)
35 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples)
36 | future_offset = future_offset.astype(int)
37 | future_t = (t_samples + 1 + future_offset)[her_indexes]
38 |
39 | # Replace goal with achieved goal but only for the previously-selected
40 | # HER transitions (as defined by her_indexes). For the other transitions,
41 | # keep the original goal.
42 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t]
43 | transitions['g'][her_indexes] = future_ag
44 |
45 | # Reconstruct info dictionary for reward computation.
46 | info = {}
47 | for key, value in transitions.items():
48 | if key.startswith('info_'):
49 | info[key.replace('info_', '')] = value
50 |
51 | # Re-compute reward since we may have substituted the goal.
52 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']}
53 | reward_params['info'] = info
54 | transitions['r'] = reward_fun(**reward_params)
55 |
56 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:])
57 | for k in transitions.keys()}
58 |
59 | assert(transitions['u'].shape[0] == batch_size_in_transitions)
60 |
61 | return transitions
62 |
63 | return _sample_her_transitions
64 |
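A toy sketch (hypothetical shapes and reward function, not from the HER experiments) of how the sampler returned above is used. With `replay_k = 4`, `future_p = 1 - 1/(1+4) = 0.8`, so roughly 80% of sampled transitions have their goal relabelled with a future achieved goal:

```python
import numpy as np

def reward_fun(ag_2, g, info):
    # sparse reward: 0 if the achieved goal is close to the desired goal, else -1
    return -(np.linalg.norm(ag_2 - g, axis=-1) > 0.05).astype(np.float32)

sample = make_sample_her_transitions('future', replay_k=4, reward_fun=reward_fun)

buffer_size, T, goal_dim, act_dim = 8, 10, 3, 4
episode_batch = {
    'u':    np.random.randn(buffer_size, T, act_dim),        # actions
    'ag':   np.random.randn(buffer_size, T + 1, goal_dim),   # achieved goals
    'ag_2': np.random.randn(buffer_size, T, goal_dim),       # achieved goals at t+1
    'g':    np.random.randn(buffer_size, T, goal_dim),       # desired goals
}
transitions = sample(episode_batch, batch_size_in_transitions=32)
print(transitions['r'].shape)  # (32,)
```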
--------------------------------------------------------------------------------
/baselines/common/tests/util.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from gym.spaces import np_random
4 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
5 |
6 | N_TRIALS = 10000
7 | N_EPISODES = 100
8 |
9 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
10 | np.random.seed(0)
11 | np_random.seed(0)
12 |
13 | env = DummyVecEnv([env_fn])
14 |
15 |
16 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
17 | tf.set_random_seed(0)
18 |
19 | model = learn_fn(env)
20 |
21 | sum_rew = 0
22 | done = True
23 |
24 | for i in range(n_trials):
25 | if done:
26 | obs = env.reset()
27 | state = model.initial_state
28 |
29 | if state is not None:
30 | a, v, state, _ = model.step(obs, S=state, M=[False])
31 | else:
32 | a, v, _, _ = model.step(obs)
33 |
34 | obs, rew, done, _ = env.step(a)
35 | sum_rew += float(rew)
36 |
37 | print("Reward in {} trials is {}".format(n_trials, sum_rew))
38 | assert sum_rew > min_reward_fraction * n_trials, \
39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials)
40 |
41 |
42 |
43 | def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES):
44 | env = DummyVecEnv([env_fn])
45 |
46 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default():
47 | model = learn_fn(env)
48 |
49 |         # evaluate the learned model over n_trials episodes
50 | 
51 |         observations, actions, rewards = rollout(env, model, n_trials)
52 |         rewards = [sum(r) for r in rewards]
53 | 
54 |         avg_rew = sum(rewards) / n_trials
55 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
56 | assert avg_rew > min_avg_reward, \
57 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)
58 |
59 | def rollout(env, model, n_trials):
60 | rewards = []
61 | actions = []
62 | observations = []
63 |
64 | for i in range(n_trials):
65 | obs = env.reset()
66 | state = model.initial_state
67 | episode_rew = []
68 | episode_actions = []
69 | episode_obs = []
70 |
71 | while True:
72 | if state is not None:
73 | a, v, state, _ = model.step(obs, S=state, M=[False])
74 | else:
75 | a,v, _, _ = model.step(obs)
76 |
77 | obs, rew, done, _ = env.step(a)
78 |
79 | episode_rew.append(rew)
80 | episode_actions.append(a)
81 | episode_obs.append(obs)
82 |
83 | if done:
84 | break
85 |
86 | rewards.append(episode_rew)
87 | actions.append(episode_actions)
88 | observations.append(episode_obs)
89 |
90 | return observations, actions, rewards
91 |
92 |
--------------------------------------------------------------------------------
/baselines/ddpg/memory.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class RingBuffer(object):
5 | def __init__(self, maxlen, shape, dtype='float32'):
6 | self.maxlen = maxlen
7 | self.start = 0
8 | self.length = 0
9 | self.data = np.zeros((maxlen,) + shape).astype(dtype)
10 |
11 | def __len__(self):
12 | return self.length
13 |
14 | def __getitem__(self, idx):
15 | if idx < 0 or idx >= self.length:
16 | raise KeyError()
17 | return self.data[(self.start + idx) % self.maxlen]
18 |
19 | def get_batch(self, idxs):
20 | return self.data[(self.start + idxs) % self.maxlen]
21 |
22 | def append(self, v):
23 | if self.length < self.maxlen:
24 | # We have space, simply increase the length.
25 | self.length += 1
26 | elif self.length == self.maxlen:
27 | # No space, "remove" the first item.
28 | self.start = (self.start + 1) % self.maxlen
29 | else:
30 | # This should never happen.
31 | raise RuntimeError()
32 | self.data[(self.start + self.length - 1) % self.maxlen] = v
33 |
34 |
35 | def array_min2d(x):
36 | x = np.array(x)
37 | if x.ndim >= 2:
38 | return x
39 | return x.reshape(-1, 1)
40 |
41 |
42 | class Memory(object):
43 | def __init__(self, limit, action_shape, observation_shape):
44 | self.limit = limit
45 |
46 | self.observations0 = RingBuffer(limit, shape=observation_shape)
47 | self.actions = RingBuffer(limit, shape=action_shape)
48 | self.rewards = RingBuffer(limit, shape=(1,))
49 | self.terminals1 = RingBuffer(limit, shape=(1,))
50 | self.observations1 = RingBuffer(limit, shape=observation_shape)
51 |
52 | def sample(self, batch_size):
53 |         # Draw indices in [1, nb_entries - 2] so that a subsequent element always exists.
54 |         batch_idxs = np.random.randint(1, self.nb_entries - 1, size=batch_size)
55 |
56 | obs0_batch = self.observations0.get_batch(batch_idxs)
57 | obs1_batch = self.observations1.get_batch(batch_idxs)
58 | action_batch = self.actions.get_batch(batch_idxs)
59 | reward_batch = self.rewards.get_batch(batch_idxs)
60 | terminal1_batch = self.terminals1.get_batch(batch_idxs)
61 |
62 | result = {
63 | 'obs0': array_min2d(obs0_batch),
64 | 'obs1': array_min2d(obs1_batch),
65 | 'rewards': array_min2d(reward_batch),
66 | 'actions': array_min2d(action_batch),
67 | 'terminals1': array_min2d(terminal1_batch),
68 | }
69 | return result
70 |
71 | def append(self, obs0, action, reward, obs1, terminal1, training=True):
72 | if not training:
73 | return
74 |
75 | self.observations0.append(obs0)
76 | self.actions.append(action)
77 | self.rewards.append(reward)
78 | self.observations1.append(obs1)
79 | self.terminals1.append(terminal1)
80 |
81 | @property
82 | def nb_entries(self):
83 | return len(self.observations0)
84 |
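A minimal usage sketch (toy shapes, not part of the DDPG code) showing how `Memory` above stores transitions in fixed-size ring buffers and samples training batches:

```python
import numpy as np

memory = Memory(limit=1000, action_shape=(2,), observation_shape=(3,))

obs = np.zeros(3)
for t in range(5):
    action = np.random.uniform(-1, 1, size=2)
    next_obs = obs + action.sum()   # stand-in for a real environment step
    memory.append(obs, action, reward=float(t), obs1=next_obs, terminal1=False)
    obs = next_obs

batch = memory.sample(batch_size=2)
print(batch['obs0'].shape, batch['rewards'].shape)  # (2, 3) (2, 1)
```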
--------------------------------------------------------------------------------
/baselines/ppo1/mlp_policy.py:
--------------------------------------------------------------------------------
1 | from baselines.common.mpi_running_mean_std import RunningMeanStd
2 | import baselines.common.tf_util as U
3 | import tensorflow as tf
4 | import gym
5 | from baselines.common.distributions import make_pdtype
6 |
7 | class MlpPolicy(object):
8 | recurrent = False
9 | def __init__(self, name, *args, **kwargs):
10 | with tf.variable_scope(name):
11 | self._init(*args, **kwargs)
12 | self.scope = tf.get_variable_scope().name
13 |
14 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
15 | assert isinstance(ob_space, gym.spaces.Box)
16 |
17 | self.pdtype = pdtype = make_pdtype(ac_space)
18 | sequence_length = None
19 |
20 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
21 |
22 | with tf.variable_scope("obfilter"):
23 | self.ob_rms = RunningMeanStd(shape=ob_space.shape)
24 |
25 | with tf.variable_scope('vf'):
26 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
27 | last_out = obz
28 | for i in range(num_hid_layers):
29 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
30 | self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
31 |
32 | with tf.variable_scope('pol'):
33 | last_out = obz
34 | for i in range(num_hid_layers):
35 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
36 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
37 | mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
38 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
39 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
40 | else:
41 | pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
42 |
43 | self.pd = pdtype.pdfromflat(pdparam)
44 |
45 | self.state_in = []
46 | self.state_out = []
47 |
48 | stochastic = tf.placeholder(dtype=tf.bool, shape=())
49 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
50 | self._act = U.function([stochastic, ob], [ac, self.vpred])
51 |
52 | def act(self, stochastic, ob):
53 | ac1, vpred1 = self._act(stochastic, ob[None])
54 | return ac1[0], vpred1[0]
55 | def get_variables(self):
56 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
57 | def get_trainable_variables(self):
58 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
59 | def get_initial_state(self):
60 | return []
61 |
62 |
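A hypothetical usage sketch (assuming a MuJoCo-style Box environment such as Hopper-v1 is installed): build the policy, initialize variables, and query a single stochastic action and value estimate via `act`:

```python
import gym
import numpy as np
import baselines.common.tf_util as U

env = gym.make('Hopper-v1')
with U.make_session(num_cpu=1):
    pi = MlpPolicy('pi', ob_space=env.observation_space, ac_space=env.action_space,
                   hid_size=64, num_hid_layers=2)
    U.initialize()
    ob = env.reset()
    ac, vpred = pi.act(stochastic=True, ob=ob)
    print(np.asarray(ac).shape, float(vpred))
```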
--------------------------------------------------------------------------------
/baselines/common/mpi_adam.py:
--------------------------------------------------------------------------------
1 | from mpi4py import MPI
2 | import baselines.common.tf_util as U
3 | import tensorflow as tf
4 | import numpy as np
5 |
6 | class MpiAdam(object):
7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None):
8 | self.var_list = var_list
9 | self.beta1 = beta1
10 | self.beta2 = beta2
11 | self.epsilon = epsilon
12 | self.scale_grad_by_procs = scale_grad_by_procs
13 | size = sum(U.numel(v) for v in var_list)
14 | self.m = np.zeros(size, 'float32')
15 | self.v = np.zeros(size, 'float32')
16 | self.t = 0
17 | self.setfromflat = U.SetFromFlat(var_list)
18 | self.getflat = U.GetFlat(var_list)
19 | self.comm = MPI.COMM_WORLD if comm is None else comm
20 |
21 | def update(self, localg, stepsize):
22 | if self.t % 100 == 0:
23 | self.check_synced()
24 | localg = localg.astype('float32')
25 | globalg = np.zeros_like(localg)
26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM)
27 | if self.scale_grad_by_procs:
28 | globalg /= self.comm.Get_size()
29 |
30 | self.t += 1
31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t)
32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon)
35 | self.setfromflat(self.getflat() + step)
36 |
37 | def sync(self):
38 | theta = self.getflat()
39 | self.comm.Bcast(theta, root=0)
40 | self.setfromflat(theta)
41 |
42 | def check_synced(self):
43 | if self.comm.Get_rank() == 0: # this is root
44 | theta = self.getflat()
45 | self.comm.Bcast(theta, root=0)
46 | else:
47 | thetalocal = self.getflat()
48 | thetaroot = np.empty_like(thetalocal)
49 | self.comm.Bcast(thetaroot, root=0)
50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal)
51 |
52 | @U.in_session
53 | def test_MpiAdam():
54 | np.random.seed(0)
55 | tf.set_random_seed(0)
56 |
57 | a = tf.Variable(np.random.randn(3).astype('float32'))
58 | b = tf.Variable(np.random.randn(2,5).astype('float32'))
59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))
60 |
61 | stepsize = 1e-2
62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss)
63 | do_update = U.function([], loss, updates=[update_op])
64 |
65 | tf.get_default_session().run(tf.global_variables_initializer())
66 | for i in range(10):
67 | print(i,do_update())
68 |
69 | tf.set_random_seed(0)
70 | tf.get_default_session().run(tf.global_variables_initializer())
71 |
72 | var_list = [a,b]
73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op])
74 | adam = MpiAdam(var_list)
75 |
76 | for i in range(10):
77 | l,g = lossandgrad()
78 | adam.update(g, stepsize)
79 | print(i,l)
--------------------------------------------------------------------------------
/baselines/acer/policies.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from baselines.common.policies import nature_cnn
4 | from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample
5 |
6 |
7 | class AcerCnnPolicy(object):
8 |
9 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
10 | nbatch = nenv * nsteps
11 | nh, nw, nc = ob_space.shape
12 | ob_shape = (nbatch, nh, nw, nc * nstack)
13 | nact = ac_space.n
14 | X = tf.placeholder(tf.uint8, ob_shape) # obs
15 | with tf.variable_scope("model", reuse=reuse):
16 | h = nature_cnn(X)
17 | pi_logits = fc(h, 'pi', nact, init_scale=0.01)
18 | pi = tf.nn.softmax(pi_logits)
19 | q = fc(h, 'q', nact)
20 |
21 | a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead
22 | self.initial_state = [] # not stateful
23 | self.X = X
24 | self.pi = pi # actual policy params now
25 | self.pi_logits = pi_logits
26 | self.q = q
27 | self.vf = q
28 |
29 | def step(ob, *args, **kwargs):
30 | # returns actions, mus, states
31 | a0, pi0 = sess.run([a, pi], {X: ob})
32 | return a0, pi0, [] # dummy state
33 |
34 | def out(ob, *args, **kwargs):
35 | pi0, q0 = sess.run([pi, q], {X: ob})
36 | return pi0, q0
37 |
38 | def act(ob, *args, **kwargs):
39 | return sess.run(a, {X: ob})
40 |
41 | self.step = step
42 | self.out = out
43 | self.act = act
44 |
45 | class AcerLstmPolicy(object):
46 |
47 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
48 | nbatch = nenv * nsteps
49 | nh, nw, nc = ob_space.shape
50 | ob_shape = (nbatch, nh, nw, nc * nstack)
51 | nact = ac_space.n
52 | X = tf.placeholder(tf.uint8, ob_shape) # obs
53 | M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
54 | S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
55 | with tf.variable_scope("model", reuse=reuse):
56 | h = nature_cnn(X)
57 |
58 | # lstm
59 | xs = batch_to_seq(h, nenv, nsteps)
60 | ms = batch_to_seq(M, nenv, nsteps)
61 | h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
62 | h5 = seq_to_batch(h5)
63 |
64 | pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
65 | pi = tf.nn.softmax(pi_logits)
66 | q = fc(h5, 'q', nact)
67 |
68 | a = sample(pi_logits) # could change this to use self.pi instead
69 | self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
70 | self.X = X
71 | self.M = M
72 | self.S = S
73 | self.pi = pi # actual policy params now
74 | self.q = q
75 |
76 | def step(ob, state, mask, *args, **kwargs):
77 | # returns actions, mus, states
78 | a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
79 | return a0, pi0, s
80 |
81 | self.step = step
82 |
--------------------------------------------------------------------------------
/baselines/common/filters.py:
--------------------------------------------------------------------------------
1 | from .running_stat import RunningStat
2 | from collections import deque
3 | import numpy as np
4 |
5 | class Filter(object):
6 | def __call__(self, x, update=True):
7 | raise NotImplementedError
8 | def reset(self):
9 | pass
10 |
11 | class IdentityFilter(Filter):
12 | def __call__(self, x, update=True):
13 | return x
14 |
15 | class CompositionFilter(Filter):
16 | def __init__(self, fs):
17 | self.fs = fs
18 | def __call__(self, x, update=True):
19 | for f in self.fs:
20 | x = f(x)
21 | return x
22 | def output_shape(self, input_space):
23 | out = input_space.shape
24 | for f in self.fs:
25 | out = f.output_shape(out)
26 | return out
27 |
28 | class ZFilter(Filter):
29 | """
30 | y = (x-mean)/std
31 | using running estimates of mean,std
32 | """
33 |
34 | def __init__(self, shape, demean=True, destd=True, clip=10.0):
35 | self.demean = demean
36 | self.destd = destd
37 | self.clip = clip
38 |
39 | self.rs = RunningStat(shape)
40 |
41 | def __call__(self, x, update=True):
42 | if update: self.rs.push(x)
43 | if self.demean:
44 | x = x - self.rs.mean
45 | if self.destd:
46 | x = x / (self.rs.std+1e-8)
47 | if self.clip:
48 | x = np.clip(x, -self.clip, self.clip)
49 | return x
50 | def output_shape(self, input_space):
51 | return input_space.shape
52 |
53 | class AddClock(Filter):
54 | def __init__(self):
55 | self.count = 0
56 | def reset(self):
57 | self.count = 0
58 | def __call__(self, x, update=True):
59 | return np.append(x, self.count/100.0)
60 | def output_shape(self, input_space):
61 | return (input_space.shape[0]+1,)
62 |
63 | class FlattenFilter(Filter):
64 | def __call__(self, x, update=True):
65 | return x.ravel()
66 | def output_shape(self, input_space):
67 | return (int(np.prod(input_space.shape)),)
68 |
69 | class Ind2OneHotFilter(Filter):
70 | def __init__(self, n):
71 | self.n = n
72 | def __call__(self, x, update=True):
73 | out = np.zeros(self.n)
74 | out[x] = 1
75 | return out
76 | def output_shape(self, input_space):
77 | return (input_space.n,)
78 |
79 | class DivFilter(Filter):
80 | def __init__(self, divisor):
81 | self.divisor = divisor
82 | def __call__(self, x, update=True):
83 | return x / self.divisor
84 | def output_shape(self, input_space):
85 | return input_space.shape
86 |
87 | class StackFilter(Filter):
88 | def __init__(self, length):
89 | self.stack = deque(maxlen=length)
90 | def reset(self):
91 | self.stack.clear()
92 | def __call__(self, x, update=True):
93 | self.stack.append(x)
94 | while len(self.stack) < self.stack.maxlen:
95 | self.stack.append(x)
96 | return np.concatenate(self.stack, axis=-1)
97 | def output_shape(self, input_space):
98 | return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,)
99 |
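A small usage sketch (synthetic data, not part of the filters module) of `ZFilter` above, which standardizes inputs with running mean/std estimates and clips the result:

```python
import numpy as np

zf = ZFilter(shape=(3,), clip=5.0)

# Feed a stream of raw observations; running statistics update on every call.
for _ in range(1000):
    x = np.random.randn(3) * 10.0 + 2.0
    y = zf(x)                          # standardized (and clipped) observation

# At evaluation time the statistics can be frozen with update=False.
y_eval = zf(np.array([2.0, 2.0, 2.0]), update=False)
print(zf.rs.mean, zf.rs.std)           # roughly [2, 2, 2] and [10, 10, 10]
```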
--------------------------------------------------------------------------------
/baselines/gail/mlp_policy.py:
--------------------------------------------------------------------------------
1 | '''
2 | from baselines/ppo1/mlp_policy.py and add simple modification
3 | (1) add reuse argument
4 | (2) cache the `stochastic` placeholder
5 | '''
6 | import tensorflow as tf
7 | import gym
8 |
9 | import baselines.common.tf_util as U
10 | from baselines.common.mpi_running_mean_std import RunningMeanStd
11 | from baselines.common.distributions import make_pdtype
12 | from baselines.acktr.utils import dense
13 |
14 |
15 | class MlpPolicy(object):
16 | recurrent = False
17 |
18 | def __init__(self, name, reuse=False, *args, **kwargs):
19 | with tf.variable_scope(name):
20 | if reuse:
21 | tf.get_variable_scope().reuse_variables()
22 | self._init(*args, **kwargs)
23 | self.scope = tf.get_variable_scope().name
24 |
25 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
26 | assert isinstance(ob_space, gym.spaces.Box)
27 |
28 | self.pdtype = pdtype = make_pdtype(ac_space)
29 | sequence_length = None
30 |
31 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
32 |
33 | with tf.variable_scope("obfilter"):
34 | self.ob_rms = RunningMeanStd(shape=ob_space.shape)
35 |
36 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
37 | last_out = obz
38 | for i in range(num_hid_layers):
39 | last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
40 | self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
41 |
42 | last_out = obz
43 | for i in range(num_hid_layers):
44 | last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
45 |
46 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
47 | mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
48 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
49 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
50 | else:
51 | pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
52 |
53 | self.pd = pdtype.pdfromflat(pdparam)
54 |
55 | self.state_in = []
56 | self.state_out = []
57 |
58 | # change for BC
59 | stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
60 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
61 | self.ac = ac
62 | self._act = U.function([stochastic, ob], [ac, self.vpred])
63 |
64 | def act(self, stochastic, ob):
65 | ac1, vpred1 = self._act(stochastic, ob[None])
66 | return ac1[0], vpred1[0]
67 |
68 | def get_variables(self):
69 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
70 |
71 | def get_trainable_variables(self):
72 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
73 |
74 | def get_initial_state(self):
75 | return []
76 |
--------------------------------------------------------------------------------
/baselines/common/tests/test_segment_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree
4 |
5 |
6 | def test_tree_set():
7 | tree = SumSegmentTree(4)
8 |
9 | tree[2] = 1.0
10 | tree[3] = 3.0
11 |
12 | assert np.isclose(tree.sum(), 4.0)
13 | assert np.isclose(tree.sum(0, 2), 0.0)
14 | assert np.isclose(tree.sum(0, 3), 1.0)
15 | assert np.isclose(tree.sum(2, 3), 1.0)
16 | assert np.isclose(tree.sum(2, -1), 1.0)
17 | assert np.isclose(tree.sum(2, 4), 4.0)
18 |
19 |
20 | def test_tree_set_overlap():
21 | tree = SumSegmentTree(4)
22 |
23 | tree[2] = 1.0
24 | tree[2] = 3.0
25 |
26 | assert np.isclose(tree.sum(), 3.0)
27 | assert np.isclose(tree.sum(2, 3), 3.0)
28 | assert np.isclose(tree.sum(2, -1), 3.0)
29 | assert np.isclose(tree.sum(2, 4), 3.0)
30 | assert np.isclose(tree.sum(1, 2), 0.0)
31 |
32 |
33 | def test_prefixsum_idx():
34 | tree = SumSegmentTree(4)
35 |
36 | tree[2] = 1.0
37 | tree[3] = 3.0
38 |
39 | assert tree.find_prefixsum_idx(0.0) == 2
40 | assert tree.find_prefixsum_idx(0.5) == 2
41 | assert tree.find_prefixsum_idx(0.99) == 2
42 | assert tree.find_prefixsum_idx(1.01) == 3
43 | assert tree.find_prefixsum_idx(3.00) == 3
44 | assert tree.find_prefixsum_idx(4.00) == 3
45 |
46 |
47 | def test_prefixsum_idx2():
48 | tree = SumSegmentTree(4)
49 |
50 | tree[0] = 0.5
51 | tree[1] = 1.0
52 | tree[2] = 1.0
53 | tree[3] = 3.0
54 |
55 | assert tree.find_prefixsum_idx(0.00) == 0
56 | assert tree.find_prefixsum_idx(0.55) == 1
57 | assert tree.find_prefixsum_idx(0.99) == 1
58 | assert tree.find_prefixsum_idx(1.51) == 2
59 | assert tree.find_prefixsum_idx(3.00) == 3
60 | assert tree.find_prefixsum_idx(5.50) == 3
61 |
62 |
63 | def test_max_interval_tree():
64 | tree = MinSegmentTree(4)
65 |
66 | tree[0] = 1.0
67 | tree[2] = 0.5
68 | tree[3] = 3.0
69 |
70 | assert np.isclose(tree.min(), 0.5)
71 | assert np.isclose(tree.min(0, 2), 1.0)
72 | assert np.isclose(tree.min(0, 3), 0.5)
73 | assert np.isclose(tree.min(0, -1), 0.5)
74 | assert np.isclose(tree.min(2, 4), 0.5)
75 | assert np.isclose(tree.min(3, 4), 3.0)
76 |
77 | tree[2] = 0.7
78 |
79 | assert np.isclose(tree.min(), 0.7)
80 | assert np.isclose(tree.min(0, 2), 1.0)
81 | assert np.isclose(tree.min(0, 3), 0.7)
82 | assert np.isclose(tree.min(0, -1), 0.7)
83 | assert np.isclose(tree.min(2, 4), 0.7)
84 | assert np.isclose(tree.min(3, 4), 3.0)
85 |
86 | tree[2] = 4.0
87 |
88 | assert np.isclose(tree.min(), 1.0)
89 | assert np.isclose(tree.min(0, 2), 1.0)
90 | assert np.isclose(tree.min(0, 3), 1.0)
91 | assert np.isclose(tree.min(0, -1), 1.0)
92 | assert np.isclose(tree.min(2, 4), 3.0)
93 | assert np.isclose(tree.min(2, 3), 4.0)
94 | assert np.isclose(tree.min(2, -1), 4.0)
95 | assert np.isclose(tree.min(3, 4), 3.0)
96 |
97 |
98 | if __name__ == '__main__':
99 | test_tree_set()
100 | test_tree_set_overlap()
101 | test_prefixsum_idx()
102 | test_prefixsum_idx2()
103 | test_max_interval_tree()
104 |
--------------------------------------------------------------------------------
/baselines/common/vec_env/subproc_vec_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from multiprocessing import Process, Pipe
3 | from . import VecEnv, CloudpickleWrapper
4 |
5 | def worker(remote, parent_remote, env_fn_wrapper):
6 | parent_remote.close()
7 | env = env_fn_wrapper.x()
8 | try:
9 | while True:
10 | cmd, data = remote.recv()
11 | if cmd == 'step':
12 | ob, reward, done, info = env.step(data)
13 | if done:
14 | ob = env.reset()
15 | remote.send((ob, reward, done, info))
16 | elif cmd == 'reset':
17 | ob = env.reset()
18 | remote.send(ob)
19 | elif cmd == 'render':
20 | remote.send(env.render(mode='rgb_array'))
21 | elif cmd == 'close':
22 | remote.close()
23 | break
24 | elif cmd == 'get_spaces':
25 | remote.send((env.observation_space, env.action_space))
26 | else:
27 | raise NotImplementedError
28 | except KeyboardInterrupt:
29 | print('SubprocVecEnv worker: got KeyboardInterrupt')
30 | finally:
31 | env.close()
32 |
33 |
34 | class SubprocVecEnv(VecEnv):
35 | def __init__(self, env_fns, spaces=None):
36 | """
37 | envs: list of gym environments to run in subprocesses
38 | """
39 | self.waiting = False
40 | nenvs = len(env_fns)
41 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)])
42 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn)))
43 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)]
44 | for p in self.ps:
45 | p.daemon = True # if the main process crashes, we should not cause things to hang
46 | p.start()
47 | for remote in self.work_remotes:
48 | remote.close()
49 |
50 | self.remotes[0].send(('get_spaces', None))
51 | observation_space, action_space = self.remotes[0].recv()
52 | self.viewer = None
53 | VecEnv.__init__(self, len(env_fns), observation_space, action_space)
54 |
55 | def step_async(self, actions):
56 | for remote, action in zip(self.remotes, actions):
57 | remote.send(('step', action))
58 | self.waiting = True
59 |
60 | def step_wait(self):
61 | results = [remote.recv() for remote in self.remotes]
62 | self.waiting = False
63 | obs, rews, dones, infos = zip(*results)
64 | return np.stack(obs), np.stack(rews), np.stack(dones), infos
65 |
66 | def reset(self):
67 | for remote in self.remotes:
68 | remote.send(('reset', None))
69 | return np.stack([remote.recv() for remote in self.remotes])
70 |
71 | def close_extras(self):
72 | if self.waiting:
73 | for remote in self.remotes:
74 | remote.recv()
75 | for remote in self.remotes:
76 | remote.send(('close', None))
77 | for p in self.ps:
78 | p.join()
79 |
80 | def get_images(self):
81 | for pipe in self.remotes:
82 | pipe.send(('render', None))
83 | imgs = [pipe.recv() for pipe in self.remotes]
84 | return imgs
85 |
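A brief usage sketch (assuming gym and a registered environment id such as CartPole-v0 are available) of `SubprocVecEnv` above; each callable builds one environment inside its own worker process, and `reset`/`step` return batched arrays:

```python
import gym

if __name__ == '__main__':   # guard needed on platforms that spawn subprocesses
    def make_env(seed):
        def _thunk():
            env = gym.make('CartPole-v0')
            env.seed(seed)
            return env
        return _thunk

    venv = SubprocVecEnv([make_env(i) for i in range(4)])
    obs = venv.reset()        # shape: (4,) + observation_space.shape
    actions = [venv.action_space.sample() for _ in range(4)]
    obs, rews, dones, infos = venv.step(actions)
    venv.close()
```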
--------------------------------------------------------------------------------
/baselines/common/tests/test_serialization.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tempfile
3 | import pytest
4 | import tensorflow as tf
5 | import numpy as np
6 |
7 | from baselines.common.tests.envs.mnist_env import MnistEnv
8 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
9 | from baselines.run import get_learn_function
10 | from baselines.common.tf_util import make_session, get_session
11 |
12 | from functools import partial
13 |
14 |
15 | learn_kwargs = {
16 | 'deepq': {},
17 | 'a2c': {},
18 | 'acktr': {},
19 | 'ppo2': {'nminibatches': 1, 'nsteps': 10},
20 | 'trpo_mpi': {},
21 | }
22 |
23 | network_kwargs = {
24 | 'mlp': {},
25 | 'cnn': {'pad': 'SAME'},
26 | 'lstm': {},
27 | 'cnn_lnlstm': {'pad': 'SAME'}
28 | }
29 |
30 |
31 | @pytest.mark.parametrize("learn_fn", learn_kwargs.keys())
32 | @pytest.mark.parametrize("network_fn", network_kwargs.keys())
33 | def test_serialization(learn_fn, network_fn):
34 | '''
35 | Test if the trained model can be serialized
36 | '''
37 |
38 |
39 | if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']:
40 | # TODO make acktr work with recurrent policies
41 | # and test
42 | # github issue: https://github.com/openai/baselines/issues/194
43 | return
44 |
45 | env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)])
46 | ob = env.reset().copy()
47 | learn = get_learn_function(learn_fn)
48 |
49 | kwargs = {}
50 | kwargs.update(network_kwargs[network_fn])
51 | kwargs.update(learn_kwargs[learn_fn])
52 |
53 |
54 | learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs)
55 |
56 | with tempfile.TemporaryDirectory() as td:
57 | model_path = os.path.join(td, 'serialization_test_model')
58 |
59 | with tf.Graph().as_default(), make_session().as_default():
60 | model = learn(total_timesteps=100)
61 | model.save(model_path)
62 | mean1, std1 = _get_action_stats(model, ob)
63 | variables_dict1 = _serialize_variables()
64 |
65 | with tf.Graph().as_default(), make_session().as_default():
66 | model = learn(total_timesteps=0, load_path=model_path)
67 | mean2, std2 = _get_action_stats(model, ob)
68 | variables_dict2 = _serialize_variables()
69 |
70 | for k, v in variables_dict1.items():
71 | np.testing.assert_allclose(v, variables_dict2[k], atol=0.01,
72 | err_msg='saved and loaded variable {} value mismatch'.format(k))
73 |
74 | np.testing.assert_allclose(mean1, mean2, atol=0.5)
75 | np.testing.assert_allclose(std1, std2, atol=0.5)
76 |
77 |
78 |
79 | def _serialize_variables():
80 | sess = get_session()
81 | variables = tf.trainable_variables()
82 | values = sess.run(variables)
83 | return {var.name: value for var, value in zip(variables, values)}
84 |
85 |
86 | def _get_action_stats(model, ob):
87 | ntrials = 1000
88 | if model.initial_state is None or model.initial_state == []:
89 | actions = np.array([model.step(ob)[0] for _ in range(ntrials)])
90 | else:
91 | actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)])
92 |
93 | mean = np.mean(actions, axis=0)
94 | std = np.std(actions, axis=0)
95 |
96 | return mean, std
97 |
98 |
--------------------------------------------------------------------------------
/baselines/results_plotter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib
3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode
4 |
5 | import matplotlib.pyplot as plt
6 | plt.rcParams['svg.fonttype'] = 'none'
7 |
8 | from baselines.bench.monitor import load_results
9 |
10 | X_TIMESTEPS = 'timesteps'
11 | X_EPISODES = 'episodes'
12 | X_WALLTIME = 'walltime_hrs'
13 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME]
14 | EPISODES_WINDOW = 100
15 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink',
16 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise',
17 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue']
18 |
19 | def rolling_window(a, window):
20 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
21 | strides = a.strides + (a.strides[-1],)
22 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)
23 |
24 | def window_func(x, y, window, func):
25 | yw = rolling_window(y, window)
26 | yw_func = func(yw, axis=-1)
27 | return x[window-1:], yw_func
28 |
29 | def ts2xy(ts, xaxis):
30 | if xaxis == X_TIMESTEPS:
31 | x = np.cumsum(ts.l.values)
32 | y = ts.r.values
33 | elif xaxis == X_EPISODES:
34 | x = np.arange(len(ts))
35 | y = ts.r.values
36 | elif xaxis == X_WALLTIME:
37 | x = ts.t.values / 3600.
38 | y = ts.r.values
39 | else:
40 | raise NotImplementedError
41 | return x, y
42 |
43 | def plot_curves(xy_list, xaxis, title):
44 | plt.figure(figsize=(8,2))
45 | maxx = max(xy[0][-1] for xy in xy_list)
46 | minx = 0
47 | for (i, (x, y)) in enumerate(xy_list):
48 | color = COLORS[i]
49 | plt.scatter(x, y, s=2)
50 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes
51 | plt.plot(x, y_mean, color=color)
52 | plt.xlim(minx, maxx)
53 | plt.title(title)
54 | plt.xlabel(xaxis)
55 | plt.ylabel("Episode Rewards")
56 | plt.tight_layout()
57 |
58 | def plot_results(dirs, num_timesteps, xaxis, task_name):
59 | tslist = []
60 | for dir in dirs:
61 | ts = load_results(dir)
62 | ts = ts[ts.l.cumsum() <= num_timesteps]
63 | tslist.append(ts)
64 | xy_list = [ts2xy(ts, xaxis) for ts in tslist]
65 | plot_curves(xy_list, xaxis, task_name)
66 |
67 | # Example usage in jupyter-notebook
68 | # from baselines import results_plotter
69 | # %matplotlib inline
70 | # results_plotter.plot_results(["./log"], 10e6, results_plotter.X_TIMESTEPS, "Breakout")
71 | # Here ./log is a directory containing the monitor.csv files
72 |
73 | def main():
74 | import argparse
75 | import os
76 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
77 |     parser.add_argument('--dirs', help='List of log directories', nargs='*', default=['./log'])
78 |     parser.add_argument('--num_timesteps', type=int, default=int(10e6))
79 |     parser.add_argument('--xaxis', help='Variable on X-axis', default=X_TIMESTEPS)
80 |     parser.add_argument('--task_name', help='Title of plot', default='Breakout')
81 | args = parser.parse_args()
82 | args.dirs = [os.path.abspath(dir) for dir in args.dirs]
83 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name)
84 | plt.show()
85 |
86 | if __name__ == '__main__':
87 | main()
--------------------------------------------------------------------------------
/baselines/common/mpi_util.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from mpi4py import MPI
3 | import os, numpy as np
4 | import platform
5 | import shutil
6 | import subprocess
7 |
8 | def sync_from_root(sess, variables, comm=None):
9 | """
10 | Send the root node's parameters to every worker.
11 | Arguments:
12 | sess: the TensorFlow session.
13 | variables: all parameter variables including optimizer's
14 | """
15 | if comm is None: comm = MPI.COMM_WORLD
16 | rank = comm.Get_rank()
17 | for var in variables:
18 | if rank == 0:
19 | comm.Bcast(sess.run(var))
20 | else:
21 | import tensorflow as tf
22 | returned_var = np.empty(var.shape, dtype='float32')
23 | comm.Bcast(returned_var)
24 | sess.run(tf.assign(var, returned_var))
25 |
26 | def gpu_count():
27 | """
28 | Count the GPUs on this machine.
29 | """
30 | if shutil.which('nvidia-smi') is None:
31 | return 0
32 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv'])
33 | return max(0, len(output.split(b'\n')) - 2)
34 |
35 | def setup_mpi_gpus():
36 | """
37 | Set CUDA_VISIBLE_DEVICES using MPI.
38 | """
39 | num_gpus = gpu_count()
40 | if num_gpus == 0:
41 | return
42 | local_rank, _ = get_local_rank_size(MPI.COMM_WORLD)
43 | os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus)
44 |
45 | def get_local_rank_size(comm):
46 | """
47 | Returns the rank of each process on its machine
48 | The processes on a given machine will be assigned ranks
49 | 0, 1, 2, ..., N-1,
50 | where N is the number of processes on this machine.
51 |
52 | Useful if you want to assign one gpu per machine
53 | """
54 | this_node = platform.node()
55 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node))
56 | node2rankssofar = defaultdict(int)
57 | local_rank = None
58 | for (rank, node) in ranks_nodes:
59 | if rank == comm.Get_rank():
60 | local_rank = node2rankssofar[node]
61 | node2rankssofar[node] += 1
62 | assert local_rank is not None
63 | return local_rank, node2rankssofar[this_node]
64 |
65 | def share_file(comm, path):
66 | """
67 | Copies the file from rank 0 to all other ranks
68 | Puts it in the same place on all machines
69 | """
70 | localrank, _ = get_local_rank_size(comm)
71 | if comm.Get_rank() == 0:
72 | with open(path, 'rb') as fh:
73 | data = fh.read()
74 | comm.bcast(data)
75 | else:
76 | data = comm.bcast(None)
77 | if localrank == 0:
78 | os.makedirs(os.path.dirname(path), exist_ok=True)
79 | with open(path, 'wb') as fh:
80 | fh.write(data)
81 | comm.Barrier()
82 |
83 | def dict_gather(comm, d, op='mean', assert_all_have_data=True):
84 | if comm is None: return d
85 | alldicts = comm.allgather(d)
86 | size = comm.size
87 | k2li = defaultdict(list)
88 | for d in alldicts:
89 | for (k,v) in d.items():
90 | k2li[k].append(v)
91 | result = {}
92 | for (k,li) in k2li.items():
93 | if assert_all_have_data:
94 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k)
95 | if op=='mean':
96 | result[k] = np.mean(li, axis=0)
97 | elif op=='sum':
98 | result[k] = np.sum(li, axis=0)
99 | else:
100 | assert 0, op
101 | return result
102 |
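A hypothetical example of `dict_gather` above (run under MPI, e.g. `mpirun -np 4 python script.py`): each worker contributes its local statistics and every rank receives the across-worker mean:

```python
from mpi4py import MPI

comm = MPI.COMM_WORLD
local_stats = {'loss': float(comm.Get_rank()), 'steps': 100.0}
averaged = dict_gather(comm, local_stats, op='mean')
if comm.Get_rank() == 0:
    print(averaged)   # {'loss': mean over ranks, 'steps': 100.0}
```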
--------------------------------------------------------------------------------
/baselines/deepq/experiments/custom_cartpole.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import itertools
3 | import numpy as np
4 | import tensorflow as tf
5 | import tensorflow.contrib.layers as layers
6 |
7 | import baselines.common.tf_util as U
8 |
9 | from baselines import logger
10 | from baselines import deepq
11 | from baselines.deepq.replay_buffer import ReplayBuffer
12 | from baselines.deepq.utils import ObservationInput
13 | from baselines.common.schedules import LinearSchedule
14 |
15 |
16 | def model(inpt, num_actions, scope, reuse=False):
17 | """This model takes as input an observation and returns values of all actions."""
18 | with tf.variable_scope(scope, reuse=reuse):
19 | out = inpt
20 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh)
21 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None)
22 | return out
23 |
24 |
25 | if __name__ == '__main__':
26 | with U.make_session(8):
27 | # Create the environment
28 | env = gym.make("CartPole-v0")
29 | # Create all the functions necessary to train the model
30 | act, train, update_target, debug = deepq.build_train(
31 | make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name),
32 | q_func=model,
33 | num_actions=env.action_space.n,
34 | optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
35 | )
36 | # Create the replay buffer
37 | replay_buffer = ReplayBuffer(50000)
38 | # Create the schedule for exploration starting from 1 (every action is random) down to
39 | # 0.02 (98% of actions are selected according to values predicted by the model).
40 | exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02)
41 |
42 | # Initialize the parameters and copy them to the target network.
43 | U.initialize()
44 | update_target()
45 |
46 | episode_rewards = [0.0]
47 | obs = env.reset()
48 | for t in itertools.count():
49 | # Take action and update exploration to the newest value
50 | action = act(obs[None], update_eps=exploration.value(t))[0]
51 | new_obs, rew, done, _ = env.step(action)
52 | # Store transition in the replay buffer.
53 | replay_buffer.add(obs, action, rew, new_obs, float(done))
54 | obs = new_obs
55 |
56 | episode_rewards[-1] += rew
57 | if done:
58 | obs = env.reset()
59 | episode_rewards.append(0)
60 |
61 | is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200
62 | if is_solved:
63 | # Show off the result
64 | env.render()
65 | else:
66 | # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
67 | if t > 1000:
68 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
69 | train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards))
70 | # Update target network periodically.
71 | if t % 1000 == 0:
72 | update_target()
73 |
74 | if done and len(episode_rewards) % 10 == 0:
75 | logger.record_tabular("steps", t)
76 | logger.record_tabular("episodes", len(episode_rewards))
77 | logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1))
78 | logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
79 | logger.dump_tabular()
80 |
--------------------------------------------------------------------------------
/baselines/acktr/kfac_utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None):
4 | assert reduce_dim is not None
5 |
6 | # weird batch matmul
7 | if len(a.get_shape()) == 2 and len(b.get_shape()) > 2:
8 | # reshape reduce_dim to the left most dim in b
9 | b_shape = b.get_shape()
10 | if reduce_dim != 0:
11 | b_dims = list(range(len(b_shape)))
12 | b_dims.remove(reduce_dim)
13 | b_dims.insert(0, reduce_dim)
14 | b = tf.transpose(b, b_dims)
15 | b_t_shape = b.get_shape()
16 | b = tf.reshape(b, [int(b_shape[reduce_dim]), -1])
17 | result = tf.matmul(a, b, transpose_a=transpose_a,
18 | transpose_b=transpose_b)
19 | result = tf.reshape(result, b_t_shape)
20 | if reduce_dim != 0:
21 | b_dims = list(range(len(b_shape)))
22 | b_dims.remove(0)
23 | b_dims.insert(reduce_dim, 0)
24 | result = tf.transpose(result, b_dims)
25 | return result
26 |
27 | elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2:
28 | # reshape reduce_dim to the right most dim in a
29 | a_shape = a.get_shape()
30 | outter_dim = len(a_shape) - 1
31 | reduce_dim = len(a_shape) - reduce_dim - 1
32 | if reduce_dim != outter_dim:
33 | a_dims = list(range(len(a_shape)))
34 | a_dims.remove(reduce_dim)
35 | a_dims.insert(outter_dim, reduce_dim)
36 | a = tf.transpose(a, a_dims)
37 | a_t_shape = a.get_shape()
38 | a = tf.reshape(a, [-1, int(a_shape[reduce_dim])])
39 | result = tf.matmul(a, b, transpose_a=transpose_a,
40 | transpose_b=transpose_b)
41 | result = tf.reshape(result, a_t_shape)
42 | if reduce_dim != outter_dim:
43 | a_dims = list(range(len(a_shape)))
44 | a_dims.remove(outter_dim)
45 | a_dims.insert(reduce_dim, outter_dim)
46 | result = tf.transpose(result, a_dims)
47 | return result
48 |
49 | elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2:
50 | return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b)
51 |
52 | assert False, 'something went wrong'
53 |
54 |
55 | def clipoutNeg(vec, threshold=1e-6):
56 | mask = tf.cast(vec > threshold, tf.float32)
57 | return mask * vec
58 |
59 |
60 | def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False):
61 | eigen_min = tf.reduce_min(input_mat)
62 | eigen_max = tf.reduce_max(input_mat)
63 | eigen_ratio = eigen_max / eigen_min
64 | input_mat_clipped = clipoutNeg(input_mat, threshold)
65 |
66 | if debug:
67 | input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print(
68 | input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio]))
69 |
70 | return input_mat_clipped
71 |
72 |
73 | def factorReshape(Q, e, grad, facIndx=0, ftype='act'):
74 | grad_shape = grad.get_shape()
75 | if ftype == 'act':
76 | assert e.get_shape()[0] == grad_shape[facIndx]
77 | expanded_shape = [1, ] * len(grad_shape)
78 | expanded_shape[facIndx] = -1
79 | e = tf.reshape(e, expanded_shape)
80 | if ftype == 'grad':
81 | assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1]
82 | expanded_shape = [1, ] * len(grad_shape)
83 | expanded_shape[len(grad_shape) - facIndx - 1] = -1
84 | e = tf.reshape(e, expanded_shape)
85 |
86 | return Q, e
87 |
--------------------------------------------------------------------------------
/baselines/acktr/policies.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from baselines.acktr.utils import dense, kl_div
4 | import baselines.common.tf_util as U
5 |
6 | class GaussianMlpPolicy(object):
7 | def __init__(self, ob_dim, ac_dim):
8 | # Here we'll construct a bunch of expressions, which will be used in two places:
9 | # (1) When sampling actions
10 | # (2) When computing loss functions, for the policy update
11 | # Variables specific to (1) have the word "sampled" in them,
12 | # whereas variables specific to (2) have the word "old" in them
13 | ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations
14 |         oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of previous actions
15 |         oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of previous action distributions
16 | adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate
17 | wd_dict = {}
18 | h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
19 | h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict))
20 | mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output
21 | self.wd_dict = wd_dict
22 | self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs
23 | logstd_1a = tf.expand_dims(logstd_1a, 0)
24 | std_1a = tf.exp(logstd_1a)
25 | std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1])
26 | ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1)
27 | sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform.
28 | logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action
29 | logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy)
30 | kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim))
31 | #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n
32 | surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient
33 | surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy
34 | self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob
35 | #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy
36 | self.compute_kl = U.function([ob_no, oldac_dist], kl)
37 | self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss
38 | U.initialize() # Initialize uninitialized TF variables
39 |
40 | def act(self, ob):
41 | ac, ac_dist, logp = self._act(ob[None])
42 | return ac[0], ac_dist[0], logp[0]
43 |
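A quick NumPy check (standalone, not part of the repo) of the diagonal-Gaussian log-probability formula used for logprob_n and logprobsampled_n above, compared against the product of univariate normal densities:

import numpy as np

ac_dim = 3
mean = np.random.randn(ac_dim)
std = np.exp(0.1 * np.random.randn(ac_dim))
x = np.random.randn(ac_dim)

# Same expression as in the graph: -sum(log std) - 0.5*log(2*pi)*ac_dim - 0.5*sum((x-mean)^2/std^2)
logp = (- np.sum(np.log(std))
        - 0.5 * np.log(2.0 * np.pi) * ac_dim
        - 0.5 * np.sum(np.square(x - mean) / np.square(std)))

# Product of per-dimension Gaussian densities, evaluated directly
densities = np.exp(-np.square(x - mean) / (2.0 * np.square(std))) / (std * np.sqrt(2.0 * np.pi))
assert np.isclose(logp, np.sum(np.log(densities)))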
--------------------------------------------------------------------------------
/baselines/common/vec_env/test_vec_env.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for asynchronous vectorized environments.
3 | """
4 |
5 | import gym
6 | import numpy as np
7 | import pytest
8 | from .dummy_vec_env import DummyVecEnv
9 | from .shmem_vec_env import ShmemVecEnv
10 | from .subproc_vec_env import SubprocVecEnv
11 |
12 |
13 | def assert_envs_equal(env1, env2, num_steps):
14 | """
15 | Compare two environments over num_steps steps and make sure
16 | that the observations produced by each are the same when given
17 | the same actions.
18 | """
19 | assert env1.num_envs == env2.num_envs
20 | assert env1.action_space.shape == env2.action_space.shape
21 | assert env1.action_space.dtype == env2.action_space.dtype
22 | joint_shape = (env1.num_envs,) + env1.action_space.shape
23 |
24 | try:
25 | obs1, obs2 = env1.reset(), env2.reset()
26 | assert np.array(obs1).shape == np.array(obs2).shape
27 | assert np.array(obs1).shape == joint_shape
28 | assert np.allclose(obs1, obs2)
29 | np.random.seed(1337)
30 | for _ in range(num_steps):
31 | actions = np.array(np.random.randint(0, 0x100, size=joint_shape),
32 | dtype=env1.action_space.dtype)
33 | for env in [env1, env2]:
34 | env.step_async(actions)
35 | outs1 = env1.step_wait()
36 | outs2 = env2.step_wait()
37 | for out1, out2 in zip(outs1[:3], outs2[:3]):
38 | assert np.array(out1).shape == np.array(out2).shape
39 | assert np.allclose(out1, out2)
40 | assert list(outs1[3]) == list(outs2[3])
41 | finally:
42 | env1.close()
43 | env2.close()
44 |
45 |
46 | @pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv))
47 | @pytest.mark.parametrize('dtype', ('uint8', 'float32'))
48 | def test_vec_env(klass, dtype): # pylint: disable=R0914
49 | """
50 | Test that a vectorized environment is equivalent to
51 | DummyVecEnv, since DummyVecEnv is less likely to be
52 | error prone.
53 | """
54 | num_envs = 3
55 | num_steps = 100
56 | shape = (3, 8)
57 |
58 | def make_fn(seed):
59 | """
60 | Get an environment constructor with a seed.
61 | """
62 | return lambda: SimpleEnv(seed, shape, dtype)
63 | fns = [make_fn(i) for i in range(num_envs)]
64 | env1 = DummyVecEnv(fns)
65 | env2 = klass(fns)
66 | assert_envs_equal(env1, env2, num_steps=num_steps)
67 |
68 |
69 | class SimpleEnv(gym.Env):
70 | """
71 | An environment with a pre-determined observation space
72 | and RNG seed.
73 | """
74 |
75 | def __init__(self, seed, shape, dtype):
76 | np.random.seed(seed)
77 | self._dtype = dtype
78 | self._start_obs = np.array(np.random.randint(0, 0x100, size=shape),
79 | dtype=dtype)
80 | self._max_steps = seed + 1
81 | self._cur_obs = None
82 | self._cur_step = 0
83 | # this is 0xFF instead of 0x100 because the Box space includes
84 | # the high end, while randint does not
85 | self.action_space = gym.spaces.Box(low=0, high=0xFF, shape=shape, dtype=dtype)
86 | self.observation_space = self.action_space
87 |
88 | def step(self, action):
89 | self._cur_obs += np.array(action, dtype=self._dtype)
90 | self._cur_step += 1
91 | done = self._cur_step >= self._max_steps
92 | reward = self._cur_step / self._max_steps
93 | return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)}
94 |
95 | def reset(self):
96 | self._cur_obs = self._start_obs
97 | self._cur_step = 0
98 | return self._cur_obs
99 |
100 | def render(self, mode=None):
101 | raise NotImplementedError
102 |
--------------------------------------------------------------------------------
/baselines/her/experiment/plot.py:
--------------------------------------------------------------------------------
1 | import os
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | import json
5 | import seaborn as sns; sns.set()
6 | import glob2
7 | import argparse
8 |
9 |
10 | def smooth_reward_curve(x, y):
11 | halfwidth = int(np.ceil(len(x) / 60)) # Halfwidth of our smoothing convolution
12 | k = halfwidth
13 | xsmoo = x
14 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1),
15 | mode='same')
16 | return xsmoo, ysmoo
17 |
18 |
19 | def load_results(file):
20 | if not os.path.exists(file):
21 | return None
22 | with open(file, 'r') as f:
23 | lines = [line for line in f]
24 | if len(lines) < 2:
25 | return None
26 | keys = [name.strip() for name in lines[0].split(',')]
27 | data = np.genfromtxt(file, delimiter=',', skip_header=1, filling_values=0.)
28 | if data.ndim == 1:
29 | data = data.reshape(1, -1)
30 | assert data.ndim == 2
31 | assert data.shape[-1] == len(keys)
32 | result = {}
33 | for idx, key in enumerate(keys):
34 | result[key] = data[:, idx]
35 | return result
36 |
37 |
38 | def pad(xs, value=np.nan):
39 | maxlen = np.max([len(x) for x in xs])
40 |
41 | padded_xs = []
42 | for x in xs:
43 |         if x.shape[0] >= maxlen:
44 |             padded_xs.append(x)
45 |             continue
46 | padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value
47 | x_padded = np.concatenate([x, padding], axis=0)
48 | assert x_padded.shape[1:] == x.shape[1:]
49 | assert x_padded.shape[0] == maxlen
50 | padded_xs.append(x_padded)
51 | return np.array(padded_xs)
52 |
53 |
54 | parser = argparse.ArgumentParser()
55 | parser.add_argument('dir', type=str)
56 | parser.add_argument('--smooth', type=int, default=1)
57 | args = parser.parse_args()
58 |
59 | # Load all data.
60 | data = {}
61 | paths = [os.path.abspath(os.path.join(path, '..')) for path in glob2.glob(os.path.join(args.dir, '**', 'progress.csv'))]
62 | for curr_path in paths:
63 | if not os.path.isdir(curr_path):
64 | continue
65 | results = load_results(os.path.join(curr_path, 'progress.csv'))
66 | if not results:
67 | print('skipping {}'.format(curr_path))
68 | continue
69 | print('loading {} ({})'.format(curr_path, len(results['epoch'])))
70 | with open(os.path.join(curr_path, 'params.json'), 'r') as f:
71 | params = json.load(f)
72 |
73 | success_rate = np.array(results['test/success_rate'])
74 | epoch = np.array(results['epoch']) + 1
75 | env_id = params['env_name']
76 | replay_strategy = params['replay_strategy']
77 |
78 | if replay_strategy == 'future':
79 | config = 'her'
80 | else:
81 | config = 'ddpg'
82 | if 'Dense' in env_id:
83 | config += '-dense'
84 | else:
85 | config += '-sparse'
86 | env_id = env_id.replace('Dense', '')
87 |
88 | # Process and smooth data.
89 | assert success_rate.shape == epoch.shape
90 | x = epoch
91 | y = success_rate
92 | if args.smooth:
93 | x, y = smooth_reward_curve(epoch, success_rate)
94 | assert x.shape == y.shape
95 |
96 | if env_id not in data:
97 | data[env_id] = {}
98 | if config not in data[env_id]:
99 | data[env_id][config] = []
100 | data[env_id][config].append((x, y))
101 |
102 | # Plot data.
103 | for env_id in sorted(data.keys()):
104 | print('exporting {}'.format(env_id))
105 | plt.clf()
106 |
107 | for config in sorted(data[env_id].keys()):
108 | xs, ys = zip(*data[env_id][config])
109 | xs, ys = pad(xs), pad(ys)
110 | assert xs.shape == ys.shape
111 |
112 | plt.plot(xs[0], np.nanmedian(ys, axis=0), label=config)
113 | plt.fill_between(xs[0], np.nanpercentile(ys, 25, axis=0), np.nanpercentile(ys, 75, axis=0), alpha=0.25)
114 | plt.title(env_id)
115 | plt.xlabel('Epoch')
116 | plt.ylabel('Median Success Rate')
117 | plt.legend()
118 | plt.savefig(os.path.join(args.dir, 'fig_{}.png'.format(env_id)))
119 |
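A tiny standalone NumPy illustration (not part of this script) of why pad() fills shorter runs with NaN before aggregation: nanmedian and nanpercentile simply ignore the padded tail, so runs of different lengths can still be plotted together.

import numpy as np

runs = [np.array([0.1, 0.2, 0.3]), np.array([0.1, 0.4])]
maxlen = max(len(r) for r in runs)
padded = np.array([np.concatenate([r, np.full(maxlen - len(r), np.nan)]) for r in runs])
print(np.nanmedian(padded, axis=0))  # [0.1 0.3 0.3] -- the NaN tail of the second run is ignored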
--------------------------------------------------------------------------------
/baselines/common/mpi_running_mean_std.py:
--------------------------------------------------------------------------------
1 | from mpi4py import MPI
2 | import tensorflow as tf, baselines.common.tf_util as U, numpy as np
3 |
4 | class RunningMeanStd(object):
5 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
6 | def __init__(self, epsilon=1e-2, shape=()):
7 |
8 | self._sum = tf.get_variable(
9 | dtype=tf.float64,
10 | shape=shape,
11 | initializer=tf.constant_initializer(0.0),
12 | name="runningsum", trainable=False)
13 | self._sumsq = tf.get_variable(
14 | dtype=tf.float64,
15 | shape=shape,
16 | initializer=tf.constant_initializer(epsilon),
17 | name="runningsumsq", trainable=False)
18 | self._count = tf.get_variable(
19 | dtype=tf.float64,
20 | shape=(),
21 | initializer=tf.constant_initializer(epsilon),
22 | name="count", trainable=False)
23 | self.shape = shape
24 |
25 | self.mean = tf.to_float(self._sum / self._count)
26 |         self.std = tf.sqrt(tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2))
27 |
28 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum')
29 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var')
30 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count')
31 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [],
32 | updates=[tf.assign_add(self._sum, newsum),
33 | tf.assign_add(self._sumsq, newsumsq),
34 | tf.assign_add(self._count, newcount)])
35 |
36 |
37 | def update(self, x):
38 | x = x.astype('float64')
39 | n = int(np.prod(self.shape))
40 | totalvec = np.zeros(n*2+1, 'float64')
41 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')])
42 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM)
43 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n])
44 |
45 | @U.in_session
46 | def test_runningmeanstd():
47 | for (x1, x2, x3) in [
48 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)),
49 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),
50 | ]:
51 |
52 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:])
53 | U.initialize()
54 |
55 | x = np.concatenate([x1, x2, x3], axis=0)
56 | ms1 = [x.mean(axis=0), x.std(axis=0)]
57 | rms.update(x1)
58 | rms.update(x2)
59 | rms.update(x3)
60 | ms2 = [rms.mean.eval(), rms.std.eval()]
61 |
62 | assert np.allclose(ms1, ms2)
63 |
64 | @U.in_session
65 | def test_dist():
66 | np.random.seed(0)
67 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1))
68 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1))
69 |
70 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5))
71 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8))
72 |
73 | comm = MPI.COMM_WORLD
74 | assert comm.Get_size()==2
75 | if comm.Get_rank()==0:
76 | x1,x2,x3 = p1,p2,p3
77 | elif comm.Get_rank()==1:
78 | x1,x2,x3 = q1,q2,q3
79 | else:
80 | assert False
81 |
82 | rms = RunningMeanStd(epsilon=0.0, shape=(1,))
83 | U.initialize()
84 |
85 | rms.update(x1)
86 | rms.update(x2)
87 | rms.update(x3)
88 |
89 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3])
90 |
91 | def checkallclose(x,y):
92 | print(x,y)
93 | return np.allclose(x,y)
94 |
95 | assert checkallclose(
96 | bigvec.mean(axis=0),
97 | rms.mean.eval(),
98 | )
99 | assert checkallclose(
100 | bigvec.std(axis=0),
101 | rms.std.eval(),
102 | )
103 |
104 |
105 | if __name__ == "__main__":
106 | # Run with mpirun -np 2 python
107 | test_dist()
108 |
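A pure-NumPy sketch (standalone; no TF or MPI, epsilon set to zero for an exact comparison as in test_runningmeanstd above) of the sum / sum-of-squares bookkeeping behind RunningMeanStd: mean = sum / count and std = sqrt(sumsq / count - mean^2).

import numpy as np

shape = (3,)
run_sum, run_sumsq, run_count = np.zeros(shape), np.zeros(shape), 0.0

chunks = [np.random.randn(7, *shape) for _ in range(5)]
for x in chunks:
    run_sum += x.sum(axis=0)
    run_sumsq += np.square(x).sum(axis=0)
    run_count += len(x)

mean = run_sum / run_count
std = np.sqrt(np.maximum(run_sumsq / run_count - np.square(mean), 1e-2))
x_all = np.concatenate(chunks, axis=0)
print(np.allclose(mean, x_all.mean(axis=0)), np.allclose(std, x_all.std(axis=0)))  # True True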
--------------------------------------------------------------------------------
/baselines/common/schedules.py:
--------------------------------------------------------------------------------
1 | """This file is used for specifying various schedules that evolve over
2 | time throughout the execution of the algorithm, such as:
3 | - learning rate for the optimizer
4 | - exploration epsilon for the epsilon greedy exploration strategy
5 | - beta parameter for beta parameter in prioritized replay
6 |
7 | Each schedule has a function `value(t)` which returns the current value
8 | of the parameter given the timestep t of the optimization procedure.
9 | """
10 |
11 |
12 | class Schedule(object):
13 | def value(self, t):
14 | """Value of the schedule at time t"""
15 | raise NotImplementedError()
16 |
17 |
18 | class ConstantSchedule(object):
19 | def __init__(self, value):
20 | """Value remains constant over time.
21 |
22 | Parameters
23 | ----------
24 | value: float
25 | Constant value of the schedule
26 | """
27 | self._v = value
28 |
29 | def value(self, t):
30 | """See Schedule.value"""
31 | return self._v
32 |
33 |
34 | def linear_interpolation(l, r, alpha):
35 | return l + alpha * (r - l)
36 |
37 |
38 | class PiecewiseSchedule(object):
39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None):
40 | """Piecewise schedule.
41 |
42 | endpoints: [(int, int)]
43 |             list of pairs `(time, value)` meaning that the schedule should output
44 |             `value` when `t==time`. All the times must be sorted in
45 |             increasing order. When t falls between two endpoints, e.g. `(time_a, value_a)`
46 |             and `(time_b, value_b)` with `time_a <= t < time_b`, the schedule outputs
47 |             `interpolation(value_a, value_b, alpha)` where alpha is the fraction of
48 |             time passed between `time_a` and `time_b` at time `t`.
49 | interpolation: lambda float, float, float: float
50 |             a function that takes the values to the left and to the right of t according
51 |             to `endpoints`. Alpha is the fraction of the distance from the left endpoint to
52 |             the right endpoint that t has covered. See linear_interpolation for an example.
53 | outside_value: float
54 |             if a value is requested outside of all the intervals specified in
55 |             `endpoints`, this value is returned. If None, an AssertionError is
56 |             raised when an outside value is requested.
57 | """
58 | idxes = [e[0] for e in endpoints]
59 | assert idxes == sorted(idxes)
60 | self._interpolation = interpolation
61 | self._outside_value = outside_value
62 | self._endpoints = endpoints
63 |
64 | def value(self, t):
65 | """See Schedule.value"""
66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]):
67 | if l_t <= t and t < r_t:
68 | alpha = float(t - l_t) / (r_t - l_t)
69 | return self._interpolation(l, r, alpha)
70 |
71 | # t does not belong to any of the pieces, so doom.
72 | assert self._outside_value is not None
73 | return self._outside_value
74 |
75 |
76 | class LinearSchedule(object):
77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
78 | """Linear interpolation between initial_p and final_p over
79 | schedule_timesteps. After this many timesteps pass final_p is
80 | returned.
81 |
82 | Parameters
83 | ----------
84 | schedule_timesteps: int
85 | Number of timesteps for which to linearly anneal initial_p
86 | to final_p
87 | initial_p: float
88 | initial output value
89 | final_p: float
90 | final output value
91 | """
92 | self.schedule_timesteps = schedule_timesteps
93 | self.final_p = final_p
94 | self.initial_p = initial_p
95 |
96 | def value(self, t):
97 | """See Schedule.value"""
98 | fraction = min(float(t) / self.schedule_timesteps, 1.0)
99 | return self.initial_p + fraction * (self.final_p - self.initial_p)
100 |
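A small usage sketch of the two schedules defined above; the step counts and values here are invented for illustration.

from baselines.common.schedules import PiecewiseSchedule, LinearSchedule

# Learning rate: hold 1e-3 for the first 1M steps, anneal to 1e-4 by 2M steps, then stay there.
lr_schedule = PiecewiseSchedule([(0, 1e-3), (int(1e6), 1e-3), (int(2e6), 1e-4)],
                                outside_value=1e-4)
print(lr_schedule.value(0), lr_schedule.value(int(1.5e6)), lr_schedule.value(int(3e6)))
# 0.001 0.00055 0.0001

# Exploration epsilon: linearly anneal from 1.0 to 0.02 over the first 100k steps.
eps_schedule = LinearSchedule(schedule_timesteps=int(1e5), initial_p=1.0, final_p=0.02)
print(eps_schedule.value(0), eps_schedule.value(int(5e4)), eps_schedule.value(int(2e5)))
# 1.0 0.51 0.02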
--------------------------------------------------------------------------------
/baselines/her/replay_buffer.py:
--------------------------------------------------------------------------------
1 | import threading
2 |
3 | import numpy as np
4 |
5 |
6 | class ReplayBuffer:
7 | def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions):
8 | """Creates a replay buffer.
9 |
10 | Args:
11 |             buffer_shapes (dict of tuples): the shapes of all buffers that are used in the
12 |                 replay buffer
13 | size_in_transitions (int): the size of the buffer, measured in transitions
14 | T (int): the time horizon for episodes
15 | sample_transitions (function): a function that samples from the replay buffer
16 | """
17 | self.buffer_shapes = buffer_shapes
18 | self.size = size_in_transitions // T
19 | self.T = T
20 | self.sample_transitions = sample_transitions
21 |
22 | # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)}
23 | self.buffers = {key: np.empty([self.size, *shape])
24 | for key, shape in buffer_shapes.items()}
25 |
26 | # memory management
27 | self.current_size = 0
28 | self.n_transitions_stored = 0
29 |
30 | self.lock = threading.Lock()
31 |
32 | @property
33 | def full(self):
34 | with self.lock:
35 | return self.current_size == self.size
36 |
37 | def sample(self, batch_size):
38 | """Returns a dict {key: array(batch_size x shapes[key])}
39 | """
40 | buffers = {}
41 |
42 | with self.lock:
43 | assert self.current_size > 0
44 | for key in self.buffers.keys():
45 | buffers[key] = self.buffers[key][:self.current_size]
46 |
47 | buffers['o_2'] = buffers['o'][:, 1:, :]
48 | buffers['ag_2'] = buffers['ag'][:, 1:, :]
49 |
50 | transitions = self.sample_transitions(buffers, batch_size)
51 |
52 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())):
53 | assert key in transitions, "key %s missing from transitions" % key
54 |
55 | return transitions
56 |
57 | def store_episode(self, episode_batch):
58 | """episode_batch: array(batch_size x (T or T+1) x dim_key)
59 | """
60 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()]
61 | assert np.all(np.array(batch_sizes) == batch_sizes[0])
62 | batch_size = batch_sizes[0]
63 |
64 | with self.lock:
65 | idxs = self._get_storage_idx(batch_size)
66 |
67 | # load inputs into buffers
68 | for key in self.buffers.keys():
69 | self.buffers[key][idxs] = episode_batch[key]
70 |
71 | self.n_transitions_stored += batch_size * self.T
72 |
73 | def get_current_episode_size(self):
74 | with self.lock:
75 | return self.current_size
76 |
77 | def get_current_size(self):
78 | with self.lock:
79 | return self.current_size * self.T
80 |
81 | def get_transitions_stored(self):
82 | with self.lock:
83 | return self.n_transitions_stored
84 |
85 | def clear_buffer(self):
86 | with self.lock:
87 | self.current_size = 0
88 |
89 | def _get_storage_idx(self, inc=None):
90 | inc = inc or 1 # size increment
91 | assert inc <= self.size, "Batch committed to replay is too large!"
92 | # go consecutively until you hit the end, and then go randomly.
93 | if self.current_size+inc <= self.size:
94 | idx = np.arange(self.current_size, self.current_size+inc)
95 | elif self.current_size < self.size:
96 | overflow = inc - (self.size - self.current_size)
97 | idx_a = np.arange(self.current_size, self.size)
98 | idx_b = np.random.randint(0, self.current_size, overflow)
99 | idx = np.concatenate([idx_a, idx_b])
100 | else:
101 | idx = np.random.randint(0, self.size, inc)
102 |
103 | # update replay size
104 | self.current_size = min(self.size, self.current_size+inc)
105 |
106 | if inc == 1:
107 | idx = idx[0]
108 | return idx
109 |
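A minimal driving sketch (not from the repo; the shapes, key names and the dummy sample_transitions below are invented for illustration) showing how ReplayBuffer stores whole episodes and samples flat transitions through the injected sample_transitions function.

import numpy as np
from baselines.her.replay_buffer import ReplayBuffer

T, obs_dim, goal_dim, act_dim = 5, 3, 2, 1
buffer_shapes = {
    'o': (T + 1, obs_dim),    # observations include the final state, hence T+1
    'ag': (T + 1, goal_dim),  # achieved goals, also T+1
    'g': (T, goal_dim),       # desired goals
    'u': (T, act_dim),        # actions
}

def dummy_sample_transitions(episode_buffers, batch_size):
    # Uniformly sample (episode, timestep) pairs and return flat transitions.
    n_episodes = episode_buffers['u'].shape[0]
    ep = np.random.randint(0, n_episodes, batch_size)
    t = np.random.randint(0, T, batch_size)
    transitions = {key: episode_buffers[key][ep, t] for key in episode_buffers}
    transitions['r'] = np.zeros(batch_size)  # placeholder reward
    return transitions

buf = ReplayBuffer(buffer_shapes, size_in_transitions=100 * T, T=T,
                   sample_transitions=dummy_sample_transitions)
episode = {key: np.zeros((2,) + shape) for key, shape in buffer_shapes.items()}  # 2 episodes
buf.store_episode(episode)
batch = buf.sample(8)
print({key: value.shape for key, value in batch.items()})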
--------------------------------------------------------------------------------
/baselines/her/util.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import sys
4 | import importlib
5 | import inspect
6 | import functools
7 |
8 | import tensorflow as tf
9 | import numpy as np
10 |
11 | from baselines.common import tf_util as U
12 |
13 |
14 | def store_args(method):
15 | """Stores provided method args as instance attributes.
16 | """
17 | argspec = inspect.getfullargspec(method)
18 | defaults = {}
19 | if argspec.defaults is not None:
20 | defaults = dict(
21 | zip(argspec.args[-len(argspec.defaults):], argspec.defaults))
22 | if argspec.kwonlydefaults is not None:
23 | defaults.update(argspec.kwonlydefaults)
24 | arg_names = argspec.args[1:]
25 |
26 | @functools.wraps(method)
27 | def wrapper(*positional_args, **keyword_args):
28 | self = positional_args[0]
29 | # Get default arg values
30 | args = defaults.copy()
31 | # Add provided arg values
32 | for name, value in zip(arg_names, positional_args[1:]):
33 | args[name] = value
34 | args.update(keyword_args)
35 | self.__dict__.update(args)
36 | return method(*positional_args, **keyword_args)
37 |
38 | return wrapper
39 |
40 |
41 | def import_function(spec):
42 | """Import a function identified by a string like "pkg.module:fn_name".
43 | """
44 | mod_name, fn_name = spec.split(':')
45 | module = importlib.import_module(mod_name)
46 | fn = getattr(module, fn_name)
47 | return fn
48 |
49 |
50 | def flatten_grads(var_list, grads):
51 |     """Flattens variables and their gradients.
52 | """
53 | return tf.concat([tf.reshape(grad, [U.numel(v)])
54 | for (v, grad) in zip(var_list, grads)], 0)
55 |
56 |
57 | def nn(input, layers_sizes, reuse=None, flatten=False, name=""):
58 | """Creates a simple neural network
59 | """
60 | for i, size in enumerate(layers_sizes):
61 | activation = tf.nn.relu if i < len(layers_sizes) - 1 else None
62 | input = tf.layers.dense(inputs=input,
63 | units=size,
64 | kernel_initializer=tf.contrib.layers.xavier_initializer(),
65 | reuse=reuse,
66 | name=name + '_' + str(i))
67 | if activation:
68 | input = activation(input)
69 | if flatten:
70 | assert layers_sizes[-1] == 1
71 | input = tf.reshape(input, [-1])
72 | return input
73 |
74 |
75 | def install_mpi_excepthook():
76 | import sys
77 | from mpi4py import MPI
78 | old_hook = sys.excepthook
79 |
80 | def new_hook(a, b, c):
81 | old_hook(a, b, c)
82 | sys.stdout.flush()
83 | sys.stderr.flush()
84 | MPI.COMM_WORLD.Abort()
85 | sys.excepthook = new_hook
86 |
87 |
88 | def mpi_fork(n, extra_mpi_args=[]):
89 | """Re-launches the current script with workers
90 | Returns "parent" for original parent, "child" for MPI children
91 | """
92 | if n <= 1:
93 | return "child"
94 | if os.getenv("IN_MPI") is None:
95 | env = os.environ.copy()
96 | env.update(
97 | MKL_NUM_THREADS="1",
98 | OMP_NUM_THREADS="1",
99 | IN_MPI="1"
100 | )
101 | # "-bind-to core" is crucial for good performance
102 | args = ["mpirun", "-np", str(n)] + \
103 | extra_mpi_args + \
104 | [sys.executable]
105 |
106 | args += sys.argv
107 | subprocess.check_call(args, env=env)
108 | return "parent"
109 | else:
110 | install_mpi_excepthook()
111 | return "child"
112 |
113 |
114 | def convert_episode_to_batch_major(episode):
115 | """Converts an episode to have the batch dimension in the major (first)
116 | dimension.
117 | """
118 | episode_batch = {}
119 | for key in episode.keys():
120 | val = np.array(episode[key]).copy()
121 | # make inputs batch-major instead of time-major
122 | episode_batch[key] = val.swapaxes(0, 1)
123 |
124 | return episode_batch
125 |
126 |
127 | def transitions_in_episode_batch(episode_batch):
128 | """Number of transitions in a given episode batch.
129 | """
130 | shape = episode_batch['u'].shape
131 | return shape[0] * shape[1]
132 |
133 |
134 | def reshape_for_broadcasting(source, target):
135 | """Reshapes a tensor (source) to have the correct shape and dtype of the target
136 | before broadcasting it with MPI.
137 | """
138 | dim = len(target.get_shape())
139 | shape = ([1] * (dim - 1)) + [-1]
140 | return tf.reshape(tf.cast(source, target.dtype), shape)
141 |
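A minimal usage sketch of the store_args decorator above; the class and hyperparameter names are invented.

from baselines.her.util import store_args

class Agent:
    @store_args
    def __init__(self, lr=1e-3, gamma=0.98, hidden=256):
        pass  # lr, gamma and hidden are stored on self by the decorator

agent = Agent(lr=3e-4)
print(agent.lr, agent.gamma, agent.hidden)  # 0.0003 0.98 256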
--------------------------------------------------------------------------------
/baselines/gail/dataset/mujoco_dset.py:
--------------------------------------------------------------------------------
1 | '''
2 | Data structure of the input .npz:
3 | the data is saved in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs'
4 | the value of each item is a list storing the expert trajectory sequentially
5 | a transition can be (data['obs'][t], data['acs'][t], data['obs'][t+1]) with reward data['rews'][t]
6 | '''
7 |
8 | from baselines import logger
9 | import numpy as np
10 |
11 |
12 | class Dset(object):
13 | def __init__(self, inputs, labels, randomize):
14 | self.inputs = inputs
15 | self.labels = labels
16 | assert len(self.inputs) == len(self.labels)
17 | self.randomize = randomize
18 | self.num_pairs = len(inputs)
19 | self.init_pointer()
20 |
21 | def init_pointer(self):
22 | self.pointer = 0
23 | if self.randomize:
24 | idx = np.arange(self.num_pairs)
25 | np.random.shuffle(idx)
26 | self.inputs = self.inputs[idx, :]
27 | self.labels = self.labels[idx, :]
28 |
29 | def get_next_batch(self, batch_size):
30 | # if batch_size is negative -> return all
31 | if batch_size < 0:
32 | return self.inputs, self.labels
33 | if self.pointer + batch_size >= self.num_pairs:
34 | self.init_pointer()
35 | end = self.pointer + batch_size
36 | inputs = self.inputs[self.pointer:end, :]
37 | labels = self.labels[self.pointer:end, :]
38 | self.pointer = end
39 | return inputs, labels
40 |
41 |
42 | class Mujoco_Dset(object):
43 | def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True):
44 | traj_data = np.load(expert_path)
45 | if traj_limitation < 0:
46 | traj_limitation = len(traj_data['obs'])
47 | obs = traj_data['obs'][:traj_limitation]
48 | acs = traj_data['acs'][:traj_limitation]
49 |
50 | # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length
51 | # and S is the environment observation/action space.
52 | # Flatten to (N * L, prod(S))
53 | self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])])
54 | self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])])
55 |
56 | self.rets = traj_data['ep_rets'][:traj_limitation]
57 | self.avg_ret = sum(self.rets)/len(self.rets)
58 | self.std_ret = np.std(np.array(self.rets))
59 | if len(self.acs) > 2:
60 | self.acs = np.squeeze(self.acs)
61 | assert len(self.obs) == len(self.acs)
62 | self.num_traj = min(traj_limitation, len(traj_data['obs']))
63 | self.num_transition = len(self.obs)
64 | self.randomize = randomize
65 | self.dset = Dset(self.obs, self.acs, self.randomize)
66 | # for behavior cloning
67 | self.train_set = Dset(self.obs[:int(self.num_transition*train_fraction), :],
68 | self.acs[:int(self.num_transition*train_fraction), :],
69 | self.randomize)
70 | self.val_set = Dset(self.obs[int(self.num_transition*train_fraction):, :],
71 | self.acs[int(self.num_transition*train_fraction):, :],
72 | self.randomize)
73 | self.log_info()
74 |
75 | def log_info(self):
76 |         logger.log("Total trajectories: %d" % self.num_traj)
77 | logger.log("Total transitions: %d" % self.num_transition)
78 | logger.log("Average returns: %f" % self.avg_ret)
79 | logger.log("Std for returns: %f" % self.std_ret)
80 |
81 | def get_next_batch(self, batch_size, split=None):
82 | if split is None:
83 | return self.dset.get_next_batch(batch_size)
84 | elif split == 'train':
85 | return self.train_set.get_next_batch(batch_size)
86 | elif split == 'val':
87 | return self.val_set.get_next_batch(batch_size)
88 | else:
89 | raise NotImplementedError
90 |
91 | def plot(self):
92 | import matplotlib.pyplot as plt
93 | plt.hist(self.rets)
94 | plt.savefig("histogram_rets.png")
95 | plt.close()
96 |
97 |
98 | def test(expert_path, traj_limitation, plot):
99 | dset = Mujoco_Dset(expert_path, traj_limitation=traj_limitation)
100 | if plot:
101 | dset.plot()
102 |
103 | if __name__ == '__main__':
104 | import argparse
105 | parser = argparse.ArgumentParser()
106 | parser.add_argument("--expert_path", type=str, default="../data/deterministic.trpo.Hopper.0.00.npz")
107 |     parser.add_argument("--traj_limitation", type=int, default=-1)
108 | parser.add_argument("--plot", type=bool, default=False)
109 | args = parser.parse_args()
110 | test(args.expert_path, args.traj_limitation, args.plot)
111 |
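A hypothetical sketch (file name, episode count and dimensions invented) of the .npz layout this loader expects, written with np.savez and read back through Mujoco_Dset:

import numpy as np
from baselines.gail.dataset.mujoco_dset import Mujoco_Dset

N, L, obs_dim, act_dim = 4, 10, 11, 3  # 4 episodes of length 10
np.savez('/tmp/fake_expert.npz',
         obs=np.random.randn(N, L, obs_dim),
         acs=np.random.randn(N, L, act_dim),
         rews=np.random.randn(N, L),
         ep_rets=np.random.randn(N))

dset = Mujoco_Dset('/tmp/fake_expert.npz', traj_limitation=-1)
ob_batch, ac_batch = dset.get_next_batch(16, split='train')
print(ob_batch.shape, ac_batch.shape)  # (16, 11) (16, 3)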
--------------------------------------------------------------------------------
/baselines/acer/buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class Buffer(object):
4 | # gets obs, actions, rewards, mu's, (states, masks), dones
5 | def __init__(self, env, nsteps, nstack, size=50000):
6 | self.nenv = env.num_envs
7 | self.nsteps = nsteps
8 | self.nh, self.nw, self.nc = env.observation_space.shape
9 | self.nstack = nstack
10 | self.nbatch = self.nenv * self.nsteps
11 | self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames
12 |
13 | # Memory
14 | self.enc_obs = None
15 | self.actions = None
16 | self.rewards = None
17 | self.mus = None
18 | self.dones = None
19 | self.masks = None
20 |
21 | # Size indexes
22 | self.next_idx = 0
23 | self.num_in_buffer = 0
24 |
25 | def has_atleast(self, frames):
26 | # Frames per env, so total (nenv * frames) Frames needed
27 | # Each buffer loc has nenv * nsteps frames
28 | return self.num_in_buffer >= (frames // self.nsteps)
29 |
30 | def can_sample(self):
31 | return self.num_in_buffer > 0
32 |
33 | # Generate stacked frames
34 | def decode(self, enc_obs, dones):
35 | # enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc]
36 |         # dones has shape [nenvs, nsteps]
37 | # returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc]
38 | nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc
39 | y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32)
40 | obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8)
41 | x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1,
42 | 0) # [nsteps + nstack, nenv, nh, nw, nc]
43 |         y[nstack - 1:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep
44 |         y[:nstack - 1] = 1.0
45 | # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1])
46 | for i in range(nstack):
47 | obs[-(i + 1), i:] = x
48 | # obs[:,i:,:,:,-(i+1),:] = x
49 | x = x[:-1] * y
50 | y = y[1:]
51 |         return np.reshape(obs[:, nstack - 1:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc])
52 |
53 | def put(self, enc_obs, actions, rewards, mus, dones, masks):
54 | # enc_obs [nenv, (nsteps + nstack), nh, nw, nc]
55 | # actions, rewards, dones [nenv, nsteps]
56 | # mus [nenv, nsteps, nact]
57 |
58 | if self.enc_obs is None:
59 | self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8)
60 | self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32)
61 | self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32)
62 | self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32)
63 | self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool)
64 | self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool)
65 |
66 | self.enc_obs[self.next_idx] = enc_obs
67 | self.actions[self.next_idx] = actions
68 | self.rewards[self.next_idx] = rewards
69 | self.mus[self.next_idx] = mus
70 | self.dones[self.next_idx] = dones
71 | self.masks[self.next_idx] = masks
72 |
73 | self.next_idx = (self.next_idx + 1) % self.size
74 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1)
75 |
76 | def take(self, x, idx, envx):
77 | nenv = self.nenv
78 | out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype)
79 | for i in range(nenv):
80 | out[i] = x[idx[i], envx[i]]
81 | return out
82 |
83 | def get(self):
84 | # returns
85 | # obs [nenv, (nsteps + 1), nh, nw, nstack*nc]
86 | # actions, rewards, dones [nenv, nsteps]
87 | # mus [nenv, nsteps, nact]
88 | nenv = self.nenv
89 | assert self.can_sample()
90 |
91 | # Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env.
92 | idx = np.random.randint(0, self.num_in_buffer, nenv)
93 | envx = np.arange(nenv)
94 |
95 |         take = lambda x: self.take(x, idx, envx)
96 | dones = take(self.dones)
97 | enc_obs = take(self.enc_obs)
98 | obs = self.decode(enc_obs, dones)
99 | actions = take(self.actions)
100 | rewards = take(self.rewards)
101 | mus = take(self.mus)
102 | masks = take(self.masks)
103 | return obs, actions, rewards, mus, dones, masks
104 |
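A shape check (hypothetical sizes, dummy env namespace; not part of the repo) of decode(), which turns per-step encoded frames plus done flags into nstack-deep frame stacks:

import numpy as np
from types import SimpleNamespace
from baselines.acer.buffer import Buffer

nenv, nsteps, nstack, nh, nw, nc = 2, 5, 4, 8, 8, 1
env = SimpleNamespace(num_envs=nenv,
                      observation_space=SimpleNamespace(shape=(nh, nw, nc)))
buf = Buffer(env, nsteps=nsteps, nstack=nstack, size=1000)

enc_obs = np.random.randint(0, 255, size=(nenv, nsteps + nstack, nh, nw, nc), dtype=np.uint8)
dones = np.zeros((nenv, nsteps), dtype=np.float32)
stacked = buf.decode(enc_obs, dones)
print(stacked.shape)  # (2, 6, 8, 8, 4) == (nenv, nsteps + 1, nh, nw, nstack * nc)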
--------------------------------------------------------------------------------
/baselines/gail/adversary.py:
--------------------------------------------------------------------------------
1 | '''
2 | Reference: https://github.com/openai/imitation
3 | I follow the architecture from the official repository
4 | '''
5 | import tensorflow as tf
6 | import numpy as np
7 |
8 | from baselines.common.mpi_running_mean_std import RunningMeanStd
9 | from baselines.common import tf_util as U
10 |
11 | def logsigmoid(a):
12 | '''Equivalent to tf.log(tf.sigmoid(a))'''
13 | return -tf.nn.softplus(-a)
14 |
15 | """ Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51"""
16 | def logit_bernoulli_entropy(logits):
17 | ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits)
18 | return ent
19 |
20 | class TransitionClassifier(object):
21 | def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"):
22 | self.scope = scope
23 | self.observation_shape = env.observation_space.shape
24 | self.actions_shape = env.action_space.shape
25 | self.input_shape = tuple([o+a for o, a in zip(self.observation_shape, self.actions_shape)])
26 | self.num_actions = env.action_space.shape[0]
27 | self.hidden_size = hidden_size
28 | self.build_ph()
29 |         # Build graph
30 | generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False)
31 | expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True)
32 | # Build accuracy
33 | generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5))
34 | expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5))
35 | # Build regression loss
36 | # let x = logits, z = targets.
37 | # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
38 | generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits))
39 | generator_loss = tf.reduce_mean(generator_loss)
40 | expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits))
41 | expert_loss = tf.reduce_mean(expert_loss)
42 | # Build entropy loss
43 | logits = tf.concat([generator_logits, expert_logits], 0)
44 | entropy = tf.reduce_mean(logit_bernoulli_entropy(logits))
45 | entropy_loss = -entcoeff*entropy
46 | # Loss + Accuracy terms
47 | self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc]
48 | self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"]
49 | self.total_loss = generator_loss + expert_loss + entropy_loss
50 | # Build Reward for policy
51 | self.reward_op = -tf.log(1-tf.nn.sigmoid(generator_logits)+1e-8)
52 | var_list = self.get_trainable_variables()
53 | self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph],
54 | self.losses + [U.flatgrad(self.total_loss, var_list)])
55 |
56 | def build_ph(self):
57 | self.generator_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="observations_ph")
58 | self.generator_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="actions_ph")
59 | self.expert_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="expert_observations_ph")
60 | self.expert_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="expert_actions_ph")
61 |
62 | def build_graph(self, obs_ph, acs_ph, reuse=False):
63 | with tf.variable_scope(self.scope):
64 | if reuse:
65 | tf.get_variable_scope().reuse_variables()
66 |
67 | with tf.variable_scope("obfilter"):
68 | self.obs_rms = RunningMeanStd(shape=self.observation_shape)
69 |                 obs = (obs_ph - self.obs_rms.mean) / self.obs_rms.std
70 |             _input = tf.concat([obs, acs_ph], axis=1)  # concatenate the two inputs to form a transition
71 | p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh)
72 | p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh)
73 | logits = tf.contrib.layers.fully_connected(p_h2, 1, activation_fn=tf.identity)
74 | return logits
75 |
76 | def get_trainable_variables(self):
77 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
78 |
79 | def get_reward(self, obs, acs):
80 | sess = tf.get_default_session()
81 | if len(obs.shape) == 1:
82 | obs = np.expand_dims(obs, 0)
83 | if len(acs.shape) == 1:
84 | acs = np.expand_dims(acs, 0)
85 | feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: acs}
86 | reward = sess.run(self.reward_op, feed_dict)
87 | return reward
88 |
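A quick standalone NumPy check (not from the repo) of the two identities above: logsigmoid(a) == log(sigmoid(a)), and the Bernoulli entropy written in terms of logits, mirroring logit_bernoulli_entropy:

import numpy as np

def logsigmoid_np(a):
    return -np.logaddexp(0.0, -a)  # log(sigmoid(a)) = -softplus(-a)

a = np.linspace(-5, 5, 11)
p = 1.0 / (1.0 + np.exp(-a))  # sigmoid

assert np.allclose(logsigmoid_np(a), np.log(p))

ent_direct = -(p * np.log(p) + (1 - p) * np.log(1 - p))   # entropy of Bernoulli(p)
ent_logits = (1.0 - p) * a - logsigmoid_np(a)             # same quantity, from logits
assert np.allclose(ent_direct, ent_logits)
print('identities hold')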
--------------------------------------------------------------------------------
/baselines/common/vec_env/__init__.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from baselines.common.tile_images import tile_images
3 |
4 | class AlreadySteppingError(Exception):
5 | """
6 | Raised when an asynchronous step is running while
7 | step_async() is called again.
8 | """
9 |
10 | def __init__(self):
11 | msg = 'already running an async step'
12 | Exception.__init__(self, msg)
13 |
14 |
15 | class NotSteppingError(Exception):
16 | """
17 | Raised when an asynchronous step is not running but
18 | step_wait() is called.
19 | """
20 |
21 | def __init__(self):
22 | msg = 'not running an async step'
23 | Exception.__init__(self, msg)
24 |
25 |
26 | class VecEnv(ABC):
27 | """
28 | An abstract asynchronous, vectorized environment.
29 | """
30 |
31 | def __init__(self, num_envs, observation_space, action_space):
32 | self.num_envs = num_envs
33 | self.observation_space = observation_space
34 | self.action_space = action_space
35 | self.closed = False
36 | self.viewer = None # For rendering
37 |
38 | @abstractmethod
39 | def reset(self):
40 | """
41 | Reset all the environments and return an array of
42 | observations, or a dict of observation arrays.
43 |
44 | If step_async is still doing work, that work will
45 | be cancelled and step_wait() should not be called
46 | until step_async() is invoked again.
47 | """
48 | pass
49 |
50 | @abstractmethod
51 | def step_async(self, actions):
52 | """
53 | Tell all the environments to start taking a step
54 | with the given actions.
55 | Call step_wait() to get the results of the step.
56 |
57 | You should not call this if a step_async run is
58 | already pending.
59 | """
60 | pass
61 |
62 | @abstractmethod
63 | def step_wait(self):
64 | """
65 | Wait for the step taken with step_async().
66 |
67 | Returns (obs, rews, dones, infos):
68 | - obs: an array of observations, or a dict of
69 | arrays of observations.
70 | - rews: an array of rewards
71 | - dones: an array of "episode done" booleans
72 | - infos: a sequence of info objects
73 | """
74 | pass
75 |
76 | def close_extras(self):
77 | """
78 | Clean up the extra resources, beyond what's in this base class.
79 | Only runs when not self.closed.
80 | """
81 | pass
82 |
83 | def close(self):
84 | if self.closed:
85 | return
86 | if self.viewer is not None:
87 | self.viewer.close()
88 | self.close_extras()
89 | self.closed = True
90 |
91 | def step(self, actions):
92 | """
93 | Step the environments synchronously.
94 |
95 | This is available for backwards compatibility.
96 | """
97 | self.step_async(actions)
98 | return self.step_wait()
99 |
100 | def render(self, mode='human'):
101 | imgs = self.get_images()
102 | bigimg = tile_images(imgs)
103 | if mode == 'human':
104 | self.get_viewer().imshow(bigimg)
105 | elif mode == 'rgb_array':
106 | return bigimg
107 | else:
108 | raise NotImplementedError
109 |
110 | def get_images(self):
111 | """
112 | Return RGB images from each environment
113 | """
114 | raise NotImplementedError
115 |
116 | @property
117 | def unwrapped(self):
118 | if isinstance(self, VecEnvWrapper):
119 | return self.venv.unwrapped
120 | else:
121 | return self
122 |
123 | def get_viewer(self):
124 | if self.viewer is None:
125 | from gym.envs.classic_control import rendering
126 | self.viewer = rendering.SimpleImageViewer()
127 | return self.viewer
128 |
129 |
130 | class VecEnvWrapper(VecEnv):
131 | """
132 | An environment wrapper that applies to an entire batch
133 | of environments at once.
134 | """
135 |
136 | def __init__(self, venv, observation_space=None, action_space=None):
137 | self.venv = venv
138 | VecEnv.__init__(self,
139 | num_envs=venv.num_envs,
140 | observation_space=observation_space or venv.observation_space,
141 | action_space=action_space or venv.action_space)
142 |
143 | def step_async(self, actions):
144 | self.venv.step_async(actions)
145 |
146 | @abstractmethod
147 | def reset(self):
148 | pass
149 |
150 | @abstractmethod
151 | def step_wait(self):
152 | pass
153 |
154 | def close(self):
155 | return self.venv.close()
156 |
157 | def render(self, mode='human'):
158 | return self.venv.render(mode=mode)
159 |
160 | def get_images(self):
161 | return self.venv.get_images()
162 |
163 | class CloudpickleWrapper(object):
164 | """
165 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle)
166 | """
167 |
168 | def __init__(self, x):
169 | self.x = x
170 |
171 | def __getstate__(self):
172 | import cloudpickle
173 | return cloudpickle.dumps(self.x)
174 |
175 | def __setstate__(self, ob):
176 | import pickle
177 | self.x = pickle.loads(ob)
178 |
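A minimal sketch (not part of baselines) of a VecEnvWrapper subclass, here one that scales every reward by a constant factor; reset and step_wait are the only abstract methods a wrapper has to implement.

import numpy as np
from baselines.common.vec_env import VecEnvWrapper

class VecRewardScale(VecEnvWrapper):
    def __init__(self, venv, scale=0.1):
        VecEnvWrapper.__init__(self, venv)
        self.scale = scale

    def reset(self):
        return self.venv.reset()

    def step_wait(self):
        obs, rews, dones, infos = self.venv.step_wait()
        return obs, np.asarray(rews) * self.scale, dones, infos

# e.g. venv = VecRewardScale(DummyVecEnv([make_env]), scale=0.1)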
--------------------------------------------------------------------------------
/baselines/common/segment_tree.py:
--------------------------------------------------------------------------------
1 | import operator
2 |
3 |
4 | class SegmentTree(object):
5 | def __init__(self, capacity, operation, neutral_element):
6 | """Build a Segment Tree data structure.
7 |
8 | https://en.wikipedia.org/wiki/Segment_tree
9 |
10 | Can be used as regular array, but with two
11 | important differences:
12 |
13 | a) setting item's value is slightly slower.
14 | It is O(lg capacity) instead of O(1).
15 | b) user has access to an efficient ( O(log segment size) )
16 | `reduce` operation which reduces `operation` over
17 | a contiguous subsequence of items in the array.
18 |
19 |         Parameters
20 |         ----------
21 | capacity: int
22 | Total size of the array - must be a power of two.
23 | operation: lambda obj, obj -> obj
24 |             an operation for combining elements (e.g. sum, max);
25 |             it must be associative and, together with `neutral_element`,
26 |             form a monoid over the set of possible array values
27 | neutral_element: obj
28 | neutral element for the operation above. eg. float('-inf')
29 | for max and 0 for sum.
30 | """
31 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2."
32 | self._capacity = capacity
33 | self._value = [neutral_element for _ in range(2 * capacity)]
34 | self._operation = operation
35 |
36 | def _reduce_helper(self, start, end, node, node_start, node_end):
37 | if start == node_start and end == node_end:
38 | return self._value[node]
39 | mid = (node_start + node_end) // 2
40 | if end <= mid:
41 | return self._reduce_helper(start, end, 2 * node, node_start, mid)
42 | else:
43 | if mid + 1 <= start:
44 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end)
45 | else:
46 | return self._operation(
47 | self._reduce_helper(start, mid, 2 * node, node_start, mid),
48 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end)
49 | )
50 |
51 | def reduce(self, start=0, end=None):
52 | """Returns result of applying `self.operation`
53 | to a contiguous subsequence of the array.
54 |
55 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end])))
56 |
57 | Parameters
58 | ----------
59 | start: int
60 | beginning of the subsequence
61 | end: int
62 |             end of the subsequence
63 |
64 | Returns
65 | -------
66 | reduced: obj
67 | result of reducing self.operation over the specified range of array elements.
68 | """
69 | if end is None:
70 | end = self._capacity
71 | if end < 0:
72 | end += self._capacity
73 | end -= 1
74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1)
75 |
76 | def __setitem__(self, idx, val):
77 | # index of the leaf
78 | idx += self._capacity
79 | self._value[idx] = val
80 | idx //= 2
81 | while idx >= 1:
82 | self._value[idx] = self._operation(
83 | self._value[2 * idx],
84 | self._value[2 * idx + 1]
85 | )
86 | idx //= 2
87 |
88 | def __getitem__(self, idx):
89 | assert 0 <= idx < self._capacity
90 | return self._value[self._capacity + idx]
91 |
92 |
93 | class SumSegmentTree(SegmentTree):
94 | def __init__(self, capacity):
95 | super(SumSegmentTree, self).__init__(
96 | capacity=capacity,
97 | operation=operator.add,
98 | neutral_element=0.0
99 | )
100 |
101 | def sum(self, start=0, end=None):
102 | """Returns arr[start] + ... + arr[end]"""
103 | return super(SumSegmentTree, self).reduce(start, end)
104 |
105 | def find_prefixsum_idx(self, prefixsum):
106 | """Find the highest index `i` in the array such that
107 |             arr[0] + arr[1] + ... + arr[i - 1] <= prefixsum
108 |
109 |         if array values are probabilities, this function
110 |         allows sampling indexes according to the discrete
111 |         probability distribution efficiently.
112 |
113 | Parameters
114 | ----------
115 |         prefixsum: float
116 | upperbound on the sum of array prefix
117 |
118 | Returns
119 | -------
120 | idx: int
121 | highest index satisfying the prefixsum constraint
122 | """
123 | assert 0 <= prefixsum <= self.sum() + 1e-5
124 | idx = 1
125 | while idx < self._capacity: # while non-leaf
126 | if self._value[2 * idx] > prefixsum:
127 | idx = 2 * idx
128 | else:
129 | prefixsum -= self._value[2 * idx]
130 | idx = 2 * idx + 1
131 | return idx - self._capacity
132 |
133 |
134 | class MinSegmentTree(SegmentTree):
135 | def __init__(self, capacity):
136 | super(MinSegmentTree, self).__init__(
137 | capacity=capacity,
138 | operation=min,
139 | neutral_element=float('inf')
140 | )
141 |
142 | def min(self, start=0, end=None):
143 | """Returns min(arr[start], ..., arr[end])"""
144 |
145 | return super(MinSegmentTree, self).reduce(start, end)
146 |
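A short usage sketch (priorities invented) of SumSegmentTree and MinSegmentTree in the way prioritized replay uses them: store per-item priorities, then sample an index proportionally to priority by inverting the prefix sum:

import numpy as np
from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree

capacity = 8                        # must be a power of two
sum_tree = SumSegmentTree(capacity)
priorities = [1.0, 2.0, 3.0, 4.0]   # only 4 of the 8 slots are in use
for i, p in enumerate(priorities):
    sum_tree[i] = p

total = sum_tree.sum()                      # 10.0
mass = np.random.uniform(0, total)
idx = sum_tree.find_prefixsum_idx(mass)     # index drawn proportionally to priority
print(total, idx)

min_tree = MinSegmentTree(capacity)
for i, p in enumerate(priorities):
    min_tree[i] = p
print(min_tree.min())                       # 1.0 (unused slots hold float('inf'))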
--------------------------------------------------------------------------------