├── .benchmark_pattern ├── baselines ├── __init__.py ├── a2c │ ├── __init__.py │ ├── README.md │ └── runner.py ├── acer │ ├── __init__.py │ ├── defaults.py │ ├── README.md │ ├── runner.py │ ├── policies.py │ └── buffer.py ├── acktr │ ├── __init__.py │ ├── acktr.py │ ├── README.md │ ├── run_mujoco.py │ ├── utils.py │ ├── value_functions.py │ ├── kfac_utils.py │ └── policies.py ├── ddpg │ ├── __init__.py │ ├── README.md │ ├── noise.py │ ├── models.py │ └── memory.py ├── gail │ ├── __init__.py │ ├── dataset │ │ ├── __init__.py │ │ └── mujoco_dset.py │ ├── result │ │ ├── hopper-training.png │ │ ├── humanoid-training.png │ │ ├── walker2d-training.png │ │ ├── halfcheetah-training.png │ │ ├── humanoidstandup-training.png │ │ ├── Hopper-normalized-stochastic-scores.png │ │ ├── Hopper-normalized-deterministic-scores.png │ │ ├── Hopper-unnormalized-stochastic-scores.png │ │ ├── Humanoid-normalized-stochastic-scores.png │ │ ├── Walker2d-normalized-stochastic-scores.png │ │ ├── HalfCheetah-normalized-stochastic-scores.png │ │ ├── Hopper-unnormalized-deterministic-scores.png │ │ ├── Humanoid-normalized-deterministic-scores.png │ │ ├── Humanoid-unnormalized-stochastic-scores.png │ │ ├── Walker2d-normalized-deterministic-scores.png │ │ ├── Walker2d-unnormalized-stochastic-scores.png │ │ ├── HalfCheetah-normalized-deterministic-scores.png │ │ ├── HalfCheetah-unnormalized-stochastic-scores.png │ │ ├── Humanoid-unnormalized-deterministic-scores.png │ │ ├── Walker2d-unnormalized-deterministic-scores.png │ │ ├── HalfCheetah-unnormalized-deterministic-scores.png │ │ ├── HumanoidStandup-normalized-stochastic-scores.png │ │ ├── HumanoidStandup-unnormalized-stochastic-scores.png │ │ ├── HumanoidStandup-normalized-deterministic-scores.png │ │ ├── HumanoidStandup-unnormalized-deterministic-scores.png │ │ └── gail-result.md │ ├── README.md │ ├── statistics.py │ ├── mlp_policy.py │ └── adversary.py ├── her │ ├── __init__.py │ ├── experiment │ │ ├── __init__.py │ │ ├── play.py │ │ └── plot.py │ ├── README.md │ ├── actor_critic.py │ ├── her.py │ ├── replay_buffer.py │ └── util.py ├── ppo1 │ ├── __init__.py │ ├── README.md │ ├── run_mujoco.py │ ├── run_robotics.py │ ├── run_atari.py │ ├── cnn_policy.py │ ├── run_humanoid.py │ └── mlp_policy.py ├── ppo2 │ ├── __init__.py │ ├── README.md │ └── defaults.py ├── trpo_mpi │ ├── __init__.py │ ├── README.md │ └── defaults.py ├── common │ ├── tests │ │ ├── __init__.py │ │ ├── envs │ │ │ ├── __init__.py │ │ │ ├── fixed_sequence_env.py │ │ │ ├── identity_env.py │ │ │ └── mnist_env.py │ │ ├── test_schedules.py │ │ ├── test_tf_util.py │ │ ├── test_cartpole.py │ │ ├── test_doc_examples.py │ │ ├── test_fixed_sequence.py │ │ ├── test_mnist.py │ │ ├── test_identity.py │ │ ├── util.py │ │ ├── test_segment_tree.py │ │ └── test_serialization.py │ ├── __init__.py │ ├── runners.py │ ├── mpi_fork.py │ ├── identity_env.py │ ├── tile_images.py │ ├── vec_env │ │ ├── vec_monitor.py │ │ ├── vec_frame_stack.py │ │ ├── util.py │ │ ├── vec_normalize.py │ │ ├── dummy_vec_env.py │ │ ├── subproc_vec_env.py │ │ ├── test_vec_env.py │ │ └── __init__.py │ ├── cg.py │ ├── mpi_adam_optimizer.py │ ├── running_stat.py │ ├── input.py │ ├── mpi_moments.py │ ├── console_util.py │ ├── dataset.py │ ├── math_util.py │ ├── mpi_adam.py │ ├── filters.py │ ├── mpi_util.py │ ├── mpi_running_mean_std.py │ ├── schedules.py │ └── segment_tree.py ├── deepq │ ├── experiments │ │ ├── __init__.py │ │ ├── enjoy_cartpole.py │ │ ├── enjoy_mountaincar.py │ │ ├── enjoy_pong.py │ │ ├── train_mountaincar.py │ │ ├── train_cartpole.py │ 
│ ├── enjoy_retro.py │ │ ├── run_retro.py │ │ ├── run_atari.py │ │ └── custom_cartpole.py │ ├── __init__.py │ ├── defaults.py │ ├── README.md │ └── utils.py ├── bench │ └── __init__.py └── results_plotter.py ├── data ├── logo.jpg └── cartpole.gif ├── setup.cfg ├── .travis.yml ├── .gitignore ├── Dockerfile ├── LICENSE └── setup.py /.benchmark_pattern: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/acer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/acktr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ddpg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/gail/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/her/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ppo1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ppo2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/common/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/gail/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/her/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/common/tests/envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/acktr/acktr.py: 
-------------------------------------------------------------------------------- 1 | from baselines.acktr.acktr_disc import * 2 | -------------------------------------------------------------------------------- /data/logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/data/logo.jpg -------------------------------------------------------------------------------- /data/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/data/cartpole.gif -------------------------------------------------------------------------------- /baselines/acer/defaults.py: -------------------------------------------------------------------------------- 1 | def atari(): 2 | return dict( 3 | lrschedule='constant' 4 | ) 5 | -------------------------------------------------------------------------------- /baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.bench.benchmarks import * 2 | from baselines.bench.monitor import * -------------------------------------------------------------------------------- /baselines/gail/result/hopper-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/hopper-training.png -------------------------------------------------------------------------------- /baselines/gail/result/humanoid-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/humanoid-training.png -------------------------------------------------------------------------------- /baselines/gail/result/walker2d-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/walker2d-training.png -------------------------------------------------------------------------------- /baselines/gail/result/halfcheetah-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/halfcheetah-training.png -------------------------------------------------------------------------------- /baselines/gail/result/humanoidstandup-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/humanoidstandup-training.png -------------------------------------------------------------------------------- /baselines/gail/result/Hopper-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Hopper-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-normalized-deterministic-scores.png -------------------------------------------------------------------------------- 
/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Humanoid-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Walker2d-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Hopper-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Hopper-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Humanoid-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Walker2d-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png -------------------------------------------------------------------------------- 
/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Atcold/baselines/master/baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.common.console_util import * 3 | from baselines.common.dataset import Dataset 4 | from baselines.common.math_util import * 5 | from baselines.common.misc_util import * 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = F,E999 3 | exclude = 4 | .git, 5 | __pycache__, 6 | baselines/her, 7 | baselines/ddpg, 8 | 
baselines/ppo1, 9 | baselines/bench, 10 | baselines/acktr, 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | services: 6 | - docker 7 | 8 | install: 9 | - pip install flake8 10 | - docker build . -t baselines-test 11 | 12 | script: 13 | - flake8 . 14 | - docker run baselines-test pytest -v . 15 | -------------------------------------------------------------------------------- /baselines/ddpg/README.md: -------------------------------------------------------------------------------- 1 | # DDPG 2 | 3 | - Original paper: https://arxiv.org/abs/1509.02971 4 | - Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ 5 | - `python -m baselines.ddpg.main` runs the algorithm for 1M frames = 10M timesteps on a Mujoco environment. See help (`-h`) for more options. -------------------------------------------------------------------------------- /baselines/acer/README.md: -------------------------------------------------------------------------------- 1 | # ACER 2 | 3 | - Original paper: https://arxiv.org/abs/1611.01224 4 | - `python -m baselines.run --alg=acer --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 5 | - also refer to the repo-wide [README.md](../../README.md#training-models) 6 | 7 | -------------------------------------------------------------------------------- /baselines/a2c/README.md: -------------------------------------------------------------------------------- 1 | # A2C 2 | 3 | - Original paper: https://arxiv.org/abs/1602.01783 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.run --alg=a2c --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options 6 | - also refer to the repo-wide [README.md](../../README.md#training-models) 7 | -------------------------------------------------------------------------------- /baselines/acktr/README.md: -------------------------------------------------------------------------------- 1 | # ACKTR 2 | 3 | - Original paper: https://arxiv.org/abs/1708.05144 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 
6 | - also refer to the repo-wide [README.md](../../README.md#training-models) 7 | 8 | 9 | -------------------------------------------------------------------------------- /baselines/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.deepq import models # noqa 2 | from baselines.deepq.build_graph import build_act, build_train # noqa 3 | from baselines.deepq.deepq import learn, load_act # noqa 4 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa 5 | 6 | def wrap_atari_dqn(env): 7 | from baselines.common.atari_wrappers import wrap_deepmind 8 | return wrap_deepmind(env, frame_stack=True, scale=True) 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pkl 4 | *.py~ 5 | .pytest_cache 6 | .DS_Store 7 | .idea 8 | 9 | # Setuptools distribution and build folders. 10 | /dist/ 11 | /build 12 | keys/ 13 | 14 | # Virtualenv 15 | /env 16 | 17 | 18 | *.sublime-project 19 | *.sublime-workspace 20 | 21 | .idea 22 | 23 | logs/ 24 | 25 | .ipynb_checkpoints 26 | ghostdriver.log 27 | 28 | htmlcov 29 | 30 | junk 31 | src 32 | 33 | *.egg-info 34 | .cache 35 | 36 | MUJOCO_LOG.TXT 37 | -------------------------------------------------------------------------------- /baselines/ppo2/README.md: -------------------------------------------------------------------------------- 1 | # PPO2 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | 6 | - `python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 7 | - `python -m baselines.run --alg=ppo2 --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M frames on a Mujoco Ant environment. 8 | - also refer to the repo-wide [README.md](../../README.md#training-models) 9 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("CartPole-v0") 8 | act = deepq.load("cartpole_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/README.md: -------------------------------------------------------------------------------- 1 | # trpo_mpi 2 | 3 | - Original paper: https://arxiv.org/abs/1502.05477 4 | - Baselines blog post https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 16 python -m baselines.run --alg=trpo_mpi --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 6 | - `python -m baselines.run --alg=trpo_mpi --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a Mujoco Ant environment. 
7 | - also refer to the repo-wide [README.md](../../README.md#training-models) 8 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("MountainCar-v0") 8 | act = deepq.load("mountaincar_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/deepq/defaults.py: -------------------------------------------------------------------------------- 1 | def atari(): 2 | return dict( 3 | network='conv_only', 4 | lr=1e-4, 5 | buffer_size=10000, 6 | exploration_fraction=0.1, 7 | exploration_final_eps=0.01, 8 | train_freq=4, 9 | learning_starts=10000, 10 | target_network_update_freq=1000, 11 | gamma=0.99, 12 | prioritized_replay=True, 13 | prioritized_replay_alpha=0.6, 14 | checkpoint_freq=10000, 15 | checkpoint_path=None, 16 | dueling=True 17 | ) 18 | 19 | def retro(): 20 | return atari() 21 | 22 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from baselines import deepq 3 | 4 | 5 | def main(): 6 | env = gym.make("PongNoFrameskip-v4") 7 | env = deepq.wrap_atari_dqn(env) 8 | act = deepq.load("pong_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/ppo2/defaults.py: -------------------------------------------------------------------------------- 1 | def mujoco(): 2 | return dict( 3 | nsteps=2048, 4 | nminibatches=32, 5 | lam=0.95, 6 | gamma=0.99, 7 | noptepochs=10, 8 | log_interval=1, 9 | ent_coef=0.0, 10 | lr=lambda f: 3e-4 * f, 11 | cliprange=0.2, 12 | value_network='copy' 13 | ) 14 | 15 | def atari(): 16 | return dict( 17 | nsteps=128, nminibatches=4, 18 | lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, 19 | ent_coef=.01, 20 | lr=lambda f : f * 2.5e-4, 21 | cliprange=lambda f : f * 0.1, 22 | ) 23 | -------------------------------------------------------------------------------- /baselines/ppo1/README.md: -------------------------------------------------------------------------------- 1 | # PPOSGD 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. 
7 | 8 | - Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model` 9 | - Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model` 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv 4 | ENV CODE_DIR /root/code 5 | ENV VENV /root/venv 6 | 7 | RUN \ 8 | pip install virtualenv && \ 9 | virtualenv $VENV --python=python3 && \ 10 | . $VENV/bin/activate && \ 11 | pip install --upgrade pip 12 | 13 | ENV PATH=$VENV/bin:$PATH 14 | 15 | COPY . $CODE_DIR/baselines 16 | WORKDIR $CODE_DIR/baselines 17 | 18 | # Clean up pycache and pyc files 19 | RUN rm -rf __pycache__ && \ 20 | find . -name "*.pyc" -delete && \ 21 | pip install tensorflow && \ 22 | pip install -e .[test] 23 | 24 | 25 | CMD /bin/bash 26 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("MountainCar-v0") 8 | # Enabling layer_norm here is important for parameter space noise! 9 | model = deepq.models.mlp([64], layer_norm=True) 10 | act = deepq.learn( 11 | env, 12 | q_func=model, 13 | lr=1e-3, 14 | max_timesteps=100000, 15 | buffer_size=50000, 16 | exploration_fraction=0.1, 17 | exploration_final_eps=0.1, 18 | print_freq=10, 19 | param_noise=True 20 | ) 21 | print("Saving model to mountaincar_model.pkl") 22 | act.save("mountaincar_model.pkl") 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | 20 | -------------------------------------------------------------------------------- /baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args,
env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/defaults.py: -------------------------------------------------------------------------------- 1 | from baselines.common.models import mlp, cnn_small 2 | 3 | 4 | def atari(): 5 | return dict( 6 | network = cnn_small(), 7 | timesteps_per_batch=512, 8 | max_kl=0.001, 9 | cg_iters=10, 10 | cg_damping=1e-3, 11 | gamma=0.98, 12 | lam=1.0, 13 | vf_iters=3, 14 | vf_stepsize=1e-4, 15 | entcoeff=0.00, 16 | ) 17 | 18 | def mujoco(): 19 | return dict( 20 | network = mlp(num_hidden=32, num_layers=2), 21 | timesteps_per_batch=1024, 22 | max_kl=0.01, 23 | cg_iters=10, 24 | cg_damping=0.1, 25 | gamma=0.99, 26 | lam=0.98, 27 | vf_iters=5, 28 | vf_stepsize=1e-3, 29 | normalize_observations=True, 30 | ) 31 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def callback(lcl, _glb): 7 | # stop training if reward exceeds 199 8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 9 | return is_solved 10 | 11 | 12 | def main(): 13 | env = gym.make("CartPole-v0") 14 | act = deepq.learn( 15 | env, 16 | network='mlp', 17 | lr=1e-3, 18 | total_timesteps=100000, 19 | buffer_size=50000, 20 | exploration_fraction=0.1, 21 | exploration_final_eps=0.02, 22 | print_freq=10, 23 | callback=callback 24 | ) 25 | print("Saving model to cartpole_model.pkl") 26 | act.save("cartpole_model.pkl") 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /baselines/common/identity_env.py: -------------------------------------------------------------------------------- 1 | from gym import Env 2 | from gym.spaces import Discrete 3 | 4 | 5 | class IdentityEnv(Env): 6 | def __init__( 7 | self, 8 | dim, 9 | ep_length=100, 10 | ): 11 | 12 | self.action_space = Discrete(dim) 13 | self.reset() 14 | 15 | def reset(self): 16 | self._choose_next_state() 17 | self.observation_space = self.action_space 18 | 19 | return self.state 20 | 21 | def step(self, actions): 22 | rew = self._get_reward(actions) 23 | self._choose_next_state() 24 | return self.state, rew, False, {} 25 | 26 | def _choose_next_state(self): 27 | self.state = self.action_space.sample() 28 | 29 | def _get_reward(self, actions): 30 | return 1 if self.state == actions else 0 31 | -------------------------------------------------------------------------------- /baselines/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def tile_images(img_nhwc): 4 | """ 5 | Tile N images into one big PxQ image 6 | (P,Q) are chosen to be as close as possible, and if N 7 | is square, then P=Q. 
8 | 9 | input: img_nhwc, list or array of images, ndim=4 once turned into array 10 | n = batch index, h = height, w = width, c = channel 11 | returns: 12 | bigim_HWc, ndarray with ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | N, h, w, c = img_nhwc.shape 16 | H = int(np.ceil(np.sqrt(N))) 17 | W = int(np.ceil(float(N)/H)) 18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 22 | return img_Hh_Ww_c 23 | 24 | -------------------------------------------------------------------------------- /baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_monitor.py: -------------------------------------------------------------------------------- 1 | from . 
import VecEnvWrapper 2 | import numpy as np 3 | 4 | 5 | class VecMonitor(VecEnvWrapper): 6 | def __init__(self, venv): 7 | VecEnvWrapper.__init__(self, venv) 8 | self.eprets = None 9 | self.eplens = None 10 | 11 | def reset(self): 12 | obs = self.venv.reset() 13 | self.eprets = np.zeros(self.num_envs, 'f') 14 | self.eplens = np.zeros(self.num_envs, 'i') 15 | return obs 16 | 17 | def step_wait(self): 18 | obs, rews, dones, infos = self.venv.step_wait() 19 | self.eprets += rews 20 | self.eplens += 1 21 | newinfos = [] 22 | for (i, (done, ret, eplen, info)) in enumerate(zip(dones, self.eprets, self.eplens, infos)): 23 | info = info.copy() 24 | if done: 25 | info['episode'] = {'r': ret, 'l': eplen} 26 | self.eprets[i] = 0 27 | self.eplens[i] = 0 28 | newinfos.append(info) 29 | return obs, rews, dones, newinfos 30 | -------------------------------------------------------------------------------- /baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /baselines/ppo1/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 4 | from baselines.common import tf_util as U 5 | from baselines import logger 6 | 7 | def train(env_id, num_timesteps, seed): 8 | from baselines.ppo1 import mlp_policy, pposgd_simple 9 | U.make_session(num_cpu=1).__enter__() 10 | def policy_fn(name, ob_space, ac_space): 11 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 12 | hid_size=64, num_hid_layers=2) 13 | env = make_mujoco_env(env_id, seed) 14 | pposgd_simple.learn(env, policy_fn, 15 | max_timesteps=num_timesteps, 16 | timesteps_per_actorbatch=2048, 17 | clip_param=0.2, entcoeff=0.0, 18 | optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, 19 | gamma=0.99, lam=0.95, schedule='linear', 20 | ) 21 | env.close() 22 | 23 | def main(): 24 | args = mujoco_arg_parser().parse_args() 25 | logger.configure() 26 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(2, 2) == 10 22 | 23 | 24 | def test_multikwargs(): 25 | with tf.Graph().as_default(): 26 | x = tf.placeholder(tf.int32, (), name="x") 27 | with tf.variable_scope("other"): 28 | x2 = tf.placeholder(tf.int32, (), name="x") 29 | z = 3 * x + 2 * x2 30 | 31 | lin = function([x, x2], z, givens={x2: 0}) 32 | with single_threaded_session(): 33 | initialize() 34 | assert lin(2) == 6 35 | assert lin(2, 2) == 10 36 | 37 | 38 | if __name__ == '__main__': 39 | test_function() 40 | test_multikwargs() 41 | -------------------------------------------------------------------------------- /baselines/common/tests/test_cartpole.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | 7 | common_kwargs = dict( 8 | total_timesteps=30000, 9 | network='mlp', 10 | gamma=1.0, 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), 16 | 'acktr': dict(nsteps=32, value_network='copy'), 17 | 'deepq': dict(total_timesteps=20000), 18 | 'ppo2': dict(value_network='copy'), 19 | 'trpo_mpi': {} 20 | } 21 | 22 | @pytest.mark.slow 23 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 24 | def test_cartpole(alg): 25 | ''' 26 | Test if the algorithm (with an mlp policy) 27 | can learn to balance the cartpole 28 | ''' 29 | 30 | kwargs = common_kwargs.copy() 31 | kwargs.update(learn_kwargs[alg]) 32 | 33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 34 | def env_fn(): 35 | 36 | env 
= gym.make('CartPole-v0') 37 | env.seed(0) 38 | return env 39 | 40 | reward_per_episode_test(env_fn, learn_fn, 100) 41 | 42 | if __name__ == '__main__': 43 | test_cartpole('deepq') 44 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_retro.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | 5 | from baselines import deepq 6 | from baselines.common import retro_wrappers 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes') 12 | parser.add_argument('--gamestate', help='game state to load', default='Level1-1') 13 | parser.add_argument('--model', help='model pickle file from ActWrapper.save', default='model.pkl') 14 | args = parser.parse_args() 15 | 16 | env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=None) 17 | env = retro_wrappers.wrap_deepmind_retro(env) 18 | act = deepq.load(args.model) 19 | 20 | while True: 21 | obs, done = env.reset(), False 22 | episode_rew = 0 23 | while not done: 24 | env.render() 25 | action = act(obs[None])[0] 26 | env_action = np.zeros(env.action_space.n) 27 | env_action[action] = 1 28 | obs, rew, done, _ = env.step(env_action) 29 | episode_rew += rew 30 | print('Episode reward', episode_rew) 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /baselines/acktr/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import tensorflow as tf 4 | from baselines import logger 5 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 6 | from baselines.acktr.acktr_cont import learn 7 | from baselines.acktr.policies import GaussianMlpPolicy 8 | from baselines.acktr.value_functions import NeuralNetValueFunction 9 | 10 | def train(env_id, num_timesteps, seed): 11 | env = make_mujoco_env(env_id, seed) 12 | 13 | with tf.Session(config=tf.ConfigProto()): 14 | ob_dim = env.observation_space.shape[0] 15 | ac_dim = env.action_space.shape[0] 16 | with tf.variable_scope("vf"): 17 | vf = NeuralNetValueFunction(ob_dim, ac_dim) 18 | with tf.variable_scope("pi"): 19 | policy = GaussianMlpPolicy(ob_dim, ac_dim) 20 | 21 | learn(env, policy=policy, vf=vf, 22 | gamma=0.99, lam=0.97, timesteps_per_batch=2500, 23 | desired_kl=0.002, 24 | num_timesteps=num_timesteps, animate=False) 25 | 26 | env.close() 27 | 28 | def main(): 29 | args = mujoco_arg_parser().parse_args() 30 | logger.configure() 31 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 32 | 33 | if __name__ == "__main__": 34 | main() 35 | -------------------------------------------------------------------------------- /baselines/common/tests/envs/fixed_sequence_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Env 3 | from gym.spaces import Discrete 4 | 5 | 6 | class FixedSequenceEnv(Env): 7 | def __init__( 8 | self, 9 | n_actions=10, 10 | seed=0, 11 | episode_len=100 12 | ): 13 | self.np_random = np.random.RandomState() 14 | self.np_random.seed(seed) 15 | self.sequence = [self.np_random.randint(0, n_actions) for _ in range(episode_len)] 16 | 17 | self.action_space = Discrete(n_actions) 18 | self.observation_space = Discrete(1) 19 | 20 | self.episode_len = episode_len 21 | 
self.time = 0 22 | self.reset() 23 | 24 | def reset(self): 25 | self.time = 0 26 | return 0 27 | 28 | def step(self, actions): 29 | rew = self._get_reward(actions) 30 | self._choose_next_state() 31 | done = False 32 | if self.episode_len and self.time >= self.episode_len: 33 | rew = 0 34 | done = True 35 | 36 | return 0, rew, done, {} 37 | 38 | def _choose_next_state(self): 39 | self.time += 1 40 | 41 | def _get_reward(self, actions): 42 | return 1 if actions == self.sequence[self.time] else 0 43 | 44 | 45 | -------------------------------------------------------------------------------- /baselines/gail/README.md: -------------------------------------------------------------------------------- 1 | # Generative Adversarial Imitation Learning (GAIL) 2 | 3 | - Original paper: https://arxiv.org/abs/1606.03476 4 | 5 | For benchmark results on MuJoCo, please see [gail-result.md](result/gail-result.md) 6 | 7 | ## If you want to train an imitation learning agent 8 | 9 | ### Step 1: Download expert data 10 | 11 | Download the expert data into `./data`, [download link](https://drive.google.com/drive/folders/1h3H4AY_ZBx08hz-Ct0Nxxus-V1melu1U?usp=sharing) 12 | 13 | ### Step 2: Run GAIL 14 | 15 | Run with a single process: 16 | 17 | ```bash 18 | python -m baselines.gail.run_mujoco 19 | ``` 20 | 21 | Run with multiple MPI processes: 22 | 23 | ```bash 24 | mpirun -np 16 python -m baselines.gail.run_mujoco 25 | ``` 26 | 27 | See help (`-h`) for more options. 28 | 29 | #### In case you want to run Behavior Cloning (BC) 30 | 31 | ```bash 32 | python -m baselines.gail.behavior_clone 33 | ``` 34 | 35 | See help (`-h`) for more options. 36 | 37 | 38 | ## Contributing 39 | 40 | Bug reports and pull requests are welcome on GitHub at https://github.com/openai/baselines/pulls. 41 | 42 | ## Maintainers 43 | 44 | - Yuan-Hong Liao, andrewliao11_at_gmail_dot_com 45 | - Ryan Julian, ryanjulian_at_gmail_dot_com 46 | 47 | ## Others 48 | 49 | Thanks to the following open-source projects: 50 | 51 | - @openai/imitation 52 | - @carpedm20/deep-rl-tensorflow 53 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | 6 | class VecFrameStack(VecEnvWrapper): 7 | def __init__(self, venv, nstack): 8 | self.venv = venv 9 | self.nstack = nstack 10 | wos = venv.observation_space # wrapped ob space 11 | low = np.repeat(wos.low, self.nstack, axis=-1) 12 | high = np.repeat(wos.high, self.nstack, axis=-1) 13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) 14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 16 | 17 | def step_wait(self): 18 | obs, rews, news, infos = self.venv.step_wait() 19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 20 | for (i, new) in enumerate(news): 21 | if new: 22 | self.stackedobs[i] = 0 23 | self.stackedobs[..., -obs.shape[-1]:] = obs 24 | return self.stackedobs, rews, news, infos 25 | 26 | def reset(self): 27 | obs = self.venv.reset() 28 | self.stackedobs[...]
= 0 29 | self.stackedobs[..., -obs.shape[-1]:] = obs 30 | return self.stackedobs 31 | 32 | def close(self): 33 | self.venv.close() 34 | -------------------------------------------------------------------------------- /baselines/acktr/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): 4 | with tf.variable_scope(name, reuse=reuse): 5 | assert (len(tf.get_variable_scope().name.split('/')) == 2) 6 | 7 | w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init) 8 | b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) 9 | weight_decay_fc = 3e-4 10 | 11 | if weight_loss_dict is not None: 12 | weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss') 13 | if weight_loss_dict is not None: 14 | weight_loss_dict[w] = weight_decay_fc 15 | weight_loss_dict[b] = 0.0 16 | 17 | tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay) 18 | 19 | return tf.nn.bias_add(tf.matmul(x, w), b) 20 | 21 | def kl_div(action_dist1, action_dist2, action_size): 22 | mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:] 23 | mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:] 24 | 25 | numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2) 26 | denominator = 2 * tf.square(std2) + 1e-8 27 | return tf.reduce_sum( 28 | numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1) 29 | -------------------------------------------------------------------------------- /baselines/ppo1/run_robotics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from mpi4py import MPI 4 | from baselines.common import set_global_seeds 5 | from baselines import logger 6 | from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser 7 | import mujoco_py 8 | 9 | 10 | def train(env_id, num_timesteps, seed): 11 | from baselines.ppo1 import mlp_policy, pposgd_simple 12 | import baselines.common.tf_util as U 13 | rank = MPI.COMM_WORLD.Get_rank() 14 | sess = U.single_threaded_session() 15 | sess.__enter__() 16 | mujoco_py.ignore_mujoco_warnings().__enter__() 17 | workerseed = seed + 10000 * rank 18 | set_global_seeds(workerseed) 19 | env = make_robotics_env(env_id, workerseed, rank=rank) 20 | def policy_fn(name, ob_space, ac_space): 21 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 22 | hid_size=256, num_hid_layers=3) 23 | 24 | pposgd_simple.learn(env, policy_fn, 25 | max_timesteps=num_timesteps, 26 | timesteps_per_actorbatch=2048, 27 | clip_param=0.2, entcoeff=0.0, 28 | optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, 29 | gamma=0.99, lam=0.95, schedule='linear', 30 | ) 31 | env.close() 32 | 33 | 34 | def main(): 35 | args = robotics_arg_parser().parse_args() 36 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /baselines/common/mpi_adam_optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | 5 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 6 | """Adam optimizer that averages gradients across mpi 
processes.""" 7 | def __init__(self, comm, **kwargs): 8 | self.comm = comm 9 | tf.train.AdamOptimizer.__init__(self, **kwargs) 10 | def compute_gradients(self, loss, var_list, **kwargs): 11 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) 12 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 13 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) 14 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 15 | sizes = [int(np.prod(s)) for s in shapes] 16 | 17 | num_tasks = self.comm.Get_size() 18 | buf = np.zeros(sum(sizes), np.float32) 19 | 20 | def _collect_grads(flat_grad): 21 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 22 | np.divide(buf, float(num_tasks), out=buf) 23 | return buf 24 | 25 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 26 | avg_flat_grad.set_shape(flat_grad.shape) 27 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 28 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 29 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 30 | 31 | return avg_grads_and_vars 32 | -------------------------------------------------------------------------------- /baselines/common/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] = self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /baselines/common/vec_env/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for dealing with vectorized environments. 3 | """ 4 | 5 | from collections import OrderedDict 6 | 7 | import gym 8 | import numpy as np 9 | 10 | 11 | def copy_obs_dict(obs): 12 | """ 13 | Deep-copy an observation dict. 14 | """ 15 | return {k: np.copy(v) for k, v in obs.items()} 16 | 17 | 18 | def dict_to_obs(obs_dict): 19 | """ 20 | Convert an observation dict into a raw array if the 21 | original observation space was not a Dict space. 22 | """ 23 | if set(obs_dict.keys()) == {None}: 24 | return obs_dict[None] 25 | return obs_dict 26 | 27 | 28 | def obs_space_info(obs_space): 29 | """ 30 | Get dict-structured information about a gym.Space. 31 | 32 | Returns: 33 | A tuple (keys, shapes, dtypes): 34 | keys: a list of dict keys. 35 | shapes: a dict mapping keys to shapes. 
36 | dtypes: a dict mapping keys to dtypes. 37 | """ 38 | if isinstance(obs_space, gym.spaces.Dict): 39 | assert isinstance(obs_space.spaces, OrderedDict) 40 | subspaces = obs_space.spaces 41 | else: 42 | subspaces = {None: obs_space} 43 | keys = [] 44 | shapes = {} 45 | dtypes = {} 46 | for key, box in subspaces.items(): 47 | keys.append(key) 48 | shapes[key] = box.shape 49 | dtypes[key] = box.dtype 50 | return keys, shapes, dtypes 51 | 52 | 53 | def obs_to_dict(obs): 54 | """ 55 | Convert an observation into a dict. 56 | """ 57 | if isinstance(obs, dict): 58 | return obs 59 | return {None: obs} 60 | -------------------------------------------------------------------------------- /baselines/common/tests/test_doc_examples.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | try: 3 | import mujoco_py 4 | _mujoco_present = True 5 | except BaseException: 6 | mujoco_py = None 7 | _mujoco_present = False 8 | 9 | 10 | @pytest.mark.skipif( 11 | not _mujoco_present, 12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library' 13 | ) 14 | def test_lstm_example(): 15 | import tensorflow as tf 16 | from baselines.common import policies, models, cmd_util 17 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 18 | 19 | # create vectorized environment 20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)]) 21 | 22 | with tf.Session() as sess: 23 | # build policy based on lstm network with 128 units 24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1) 25 | 26 | # initialize tensorflow variables 27 | sess.run(tf.global_variables_initializer()) 28 | 29 | # prepare environment variables 30 | ob = venv.reset() 31 | state = policy.initial_state 32 | done = [False] 33 | step_counter = 0 34 | 35 | # run a single episode until the end (i.e. until done) 36 | while True: 37 | action, _, state, _ = policy.step(ob, S=state, M=done) 38 | ob, reward, done, _ = venv.step(action) 39 | step_counter += 1 40 | if done: 41 | break 42 | 43 | 44 | assert step_counter > 5 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | if sys.version_info.major != 3: 5 | print('This Python is only compatible with Python 3, but you are running ' 6 | 'Python {}. 
The installation will likely fail.'.format(sys.version_info.major)) 7 | 8 | 9 | extras = { 10 | 'test': [ 11 | 'filelock', 12 | 'pytest' 13 | ] 14 | } 15 | 16 | 17 | all_deps = [] 18 | for group_name in extras: 19 | all_deps += extras[group_name] 20 | 21 | extras['all'] = all_deps 22 | 23 | setup(name='baselines', 24 | packages=[package for package in find_packages() 25 | if package.startswith('baselines')], 26 | install_requires=[ 27 | 'gym[mujoco,atari,classic_control,robotics]', 28 | 'scipy', 29 | 'tqdm', 30 | 'joblib', 31 | 'dill', 32 | 'progressbar2', 33 | 'mpi4py', 34 | 'cloudpickle', 35 | 'click', 36 | 'opencv-python' 37 | ], 38 | extras_require=extras, 39 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms', 40 | author='OpenAI', 41 | url='https://github.com/openai/baselines', 42 | author_email='gym@openai.com', 43 | version='0.1.5') 44 | 45 | 46 | # ensure there is some tensorflow build with version above 1.4 47 | try: 48 | from distutils.version import StrictVersion 49 | import tensorflow 50 | assert StrictVersion(tensorflow.__version__) >= StrictVersion('1.4.0') 51 | except ImportError: 52 | assert False, "TensorFlow needed, of version above 1.4" 53 | -------------------------------------------------------------------------------- /baselines/common/tests/test_fixed_sequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv 3 | 4 | from baselines.common.tests.util import simple_test 5 | from baselines.run import get_learn_function 6 | 7 | common_kwargs = dict( 8 | seed=0, 9 | total_timesteps=50000, 10 | ) 11 | 12 | learn_kwargs = { 13 | 'a2c': {}, 14 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), 15 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) 16 | # github issue: https://github.com/openai/baselines/issues/188 17 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) 18 | } 19 | 20 | 21 | alg_list = learn_kwargs.keys() 22 | rnn_list = ['lstm'] 23 | 24 | @pytest.mark.slow 25 | @pytest.mark.parametrize("alg", alg_list) 26 | @pytest.mark.parametrize("rnn", rnn_list) 27 | def test_fixed_sequence(alg, rnn): 28 | ''' 29 | Test if the algorithm (with a given policy) 30 | can learn an identity transformation (i.e. return observation as an action) 31 | ''' 32 | 33 | kwargs = learn_kwargs[alg] 34 | kwargs.update(common_kwargs) 35 | 36 | episode_len = 5 37 | env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) 38 | learn = lambda e: get_learn_function(alg)( 39 | env=e, 40 | network=rnn, 41 | **kwargs 42 | ) 43 | 44 | simple_test(env_fn, learn, 0.7) 45 | 46 | 47 | if __name__ == '__main__': 48 | test_fixed_sequence('ppo2', 'lstm') 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | from baselines.common.running_mean_std import RunningMeanStd 3 | import numpy as np 4 | 5 | 6 | class VecNormalize(VecEnvWrapper): 7 | """ 8 | A vectorized wrapper that normalizes the observations 9 | and returns from an environment. 
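    Illustrative usage (a sketch added for clarity; `make_env` stands in for any
    zero-argument function returning a gym.Env, and `actions` is a batch of actions,
    one per environment):

        venv = VecNormalize(DummyVecEnv([make_env]))
        obs = venv.reset()                            # observations filtered by a running mean/std
        obs, rews, dones, infos = venv.step(actions)  # rewards scaled by the running std of returns

    Rewards are only rescaled (divided by the standard deviation of the discounted
    return estimate) and clipped; they are not mean-centered.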
10 | """ 11 | 12 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): 13 | VecEnvWrapper.__init__(self, venv) 14 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 15 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 16 | self.clipob = clipob 17 | self.cliprew = cliprew 18 | self.ret = np.zeros(self.num_envs) 19 | self.gamma = gamma 20 | self.epsilon = epsilon 21 | 22 | def step_wait(self): 23 | obs, rews, news, infos = self.venv.step_wait() 24 | self.ret = self.ret * self.gamma + rews 25 | obs = self._obfilt(obs) 26 | if self.ret_rms: 27 | self.ret_rms.update(self.ret) 28 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 29 | return obs, rews, news, infos 30 | 31 | def _obfilt(self, obs): 32 | if self.ob_rms: 33 | self.ob_rms.update(obs) 34 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 35 | return obs 36 | else: 37 | return obs 38 | 39 | def reset(self): 40 | obs = self.venv.reset() 41 | return self._obfilt(obs) 42 | -------------------------------------------------------------------------------- /baselines/her/README.md: -------------------------------------------------------------------------------- 1 | # Hindsight Experience Replay 2 | For details on Hindsight Experience Replay (HER), please read the [paper](https://arxiv.org/abs/1707.01495). 3 | 4 | ## How to use Hindsight Experience Replay 5 | 6 | ### Getting started 7 | Training an agent is very simple: 8 | ```bash 9 | python -m baselines.her.experiment.train 10 | ``` 11 | This will train a DDPG+HER agent on the `FetchReach` environment. 12 | You should see the success rate go up quickly to `1.0`, which means that the agent achieves the 13 | desired goal in 100% of the cases. 14 | The training script logs other diagnostics as well and pickles the best policy so far (w.r.t. to its test success rate), 15 | the latest policy, and, if enabled, a history of policies every K epochs. 16 | 17 | To inspect what the agent has learned, use the play script: 18 | ```bash 19 | python -m baselines.her.experiment.play /path/to/an/experiment/policy_best.pkl 20 | ``` 21 | You can try it right now with the results of the training step (the script prints out the path for you). 22 | This should visualize the current policy for 10 episodes and will also print statistics. 23 | 24 | 25 | ### Reproducing results 26 | In order to reproduce the results from [Plappert et al. (2018)](https://arxiv.org/abs/1802.09464), run the following command: 27 | ```bash 28 | python -m baselines.her.experiment.train --num_cpu 19 29 | ``` 30 | This will require a machine with sufficient amount of physical CPU cores. In our experiments, 31 | we used [Azure's D15v2 instances](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes), 32 | which have 20 physical cores. We only scheduled the experiment on 19 of those to leave some head-room on the system. 33 | -------------------------------------------------------------------------------- /baselines/deepq/README.md: -------------------------------------------------------------------------------- 1 | ## If you are curious. 2 | 3 | ##### Train a Cartpole agent and watch it play once it converges! 
4 | 5 | Here's a list of commands to run to quickly get a working example: 6 | 7 | 8 | 9 | 10 | ```bash 11 | # Train model and save the results to cartpole_model.pkl 12 | python -m baselines.run --alg=deepq --env=CartPole-v0 --save_path=./cartpole_model.pkl --num_timesteps=1e5 13 | # Load the model saved in cartpole_model.pkl and visualize the learned policy 14 | python -m baselines.run --alg=deepq --env=CartPole-v0 --load_path=./cartpole_model.pkl --num_timesteps=0 --play 15 | ``` 16 | 17 | ## If you wish to apply DQN to solve a problem. 18 | 19 | Check out our simple agent trained with one stop shop `deepq.learn` function. 20 | 21 | - [baselines/deepq/experiments/train_cartpole.py](experiments/train_cartpole.py) - train a Cartpole agent. 22 | 23 | In particular notice that once `deepq.learn` finishes training it returns `act` function which can be used to select actions in the environment. Once trained you can easily save it and load at later time. Complimentary file `enjoy_cartpole.py` loads and visualizes the learned policy. 24 | 25 | ## If you wish to experiment with the algorithm 26 | 27 | ##### Check out the examples 28 | 29 | - [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine grained control over the internals of DQN algorithm. 30 | - [baselines/deepq/defaults.py](defaults.py) - settings for training on atari. Run 31 | 32 | ```bash 33 | python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 34 | ``` 35 | to train on Atari Pong (see more in repo-wide [README.md](../../README.md#training-models)) 36 | 37 | 38 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/run_retro.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from baselines import deepq 4 | from baselines.common import set_global_seeds 5 | from baselines import bench 6 | from baselines import logger 7 | from baselines.common import retro_wrappers 8 | import retro 9 | 10 | 11 | def main(): 12 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 13 | parser.add_argument('--env', help='environment ID', default='SuperMarioBros-Nes') 14 | parser.add_argument('--gamestate', help='game state to load', default='Level1-1') 15 | parser.add_argument('--seed', help='seed', type=int, default=0) 16 | parser.add_argument('--num-timesteps', type=int, default=int(10e6)) 17 | args = parser.parse_args() 18 | logger.configure() 19 | set_global_seeds(args.seed) 20 | env = retro_wrappers.make_retro(game=args.env, state=args.gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE) 21 | env.seed(args.seed) 22 | env = bench.Monitor(env, logger.get_dir()) 23 | env = retro_wrappers.wrap_deepmind_retro(env) 24 | 25 | model = deepq.models.cnn_to_mlp( 26 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 27 | hiddens=[256], 28 | dueling=True 29 | ) 30 | act = deepq.learn( 31 | env, 32 | q_func=model, 33 | lr=1e-4, 34 | max_timesteps=args.num_timesteps, 35 | buffer_size=10000, 36 | exploration_fraction=0.1, 37 | exploration_final_eps=0.01, 38 | train_freq=4, 39 | learning_starts=10000, 40 | target_network_update_freq=1000, 41 | gamma=0.99, 42 | prioritized_replay=True 43 | ) 44 | act.save() 45 | env.close() 46 | 47 | 48 | if __name__ == '__main__': 49 | main() 50 | -------------------------------------------------------------------------------- /baselines/ppo1/run_atari.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from mpi4py import MPI 4 | from baselines.common import set_global_seeds 5 | from baselines import bench 6 | import os.path as osp 7 | from baselines import logger 8 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 9 | from baselines.common.cmd_util import atari_arg_parser 10 | 11 | def train(env_id, num_timesteps, seed): 12 | from baselines.ppo1 import pposgd_simple, cnn_policy 13 | import baselines.common.tf_util as U 14 | rank = MPI.COMM_WORLD.Get_rank() 15 | sess = U.single_threaded_session() 16 | sess.__enter__() 17 | if rank == 0: 18 | logger.configure() 19 | else: 20 | logger.configure(format_strs=[]) 21 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None 22 | set_global_seeds(workerseed) 23 | env = make_atari(env_id) 24 | def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 25 | return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) 26 | env = bench.Monitor(env, logger.get_dir() and 27 | osp.join(logger.get_dir(), str(rank))) 28 | env.seed(workerseed) 29 | 30 | env = wrap_deepmind(env) 31 | env.seed(workerseed) 32 | 33 | pposgd_simple.learn(env, policy_fn, 34 | max_timesteps=int(num_timesteps * 1.1), 35 | timesteps_per_actorbatch=256, 36 | clip_param=0.2, entcoeff=0.01, 37 | optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, 38 | gamma=0.99, lam=0.95, 39 | schedule='linear' 40 | ) 41 | env.close() 42 | 43 | def main(): 44 | args = atari_arg_parser().parse_args() 45 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /baselines/common/tests/test_mnist.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # from baselines.acer import acer_simple as acer 4 | from baselines.common.tests.envs.mnist_env import MnistEnv 5 | from baselines.common.tests.util import simple_test 6 | from baselines.run import get_learn_function 7 | 8 | 9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 10 | # GitHub issue https://github.com/openai/baselines/issues/189 11 | common_kwargs = { 12 | 'seed': 0, 13 | 'network':'cnn', 14 | 'gamma':0.9, 15 | 'pad':'SAME' 16 | } 17 | 18 | learn_args = { 19 | 'a2c': dict(total_timesteps=50000), 20 | # TODO need to resolve inference (step) API differences for acer; also slow 21 | # 'acer': dict(seed=0, total_timesteps=1000), 22 | 'deepq': dict(total_timesteps=5000), 23 | 'acktr': dict(total_timesteps=30000), 24 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), 25 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) 26 | } 27 | 28 | 29 | #tests pass, but are too slow on travis. Same algorithms are covered 30 | # by other tests with less compute-hungry nn's and by benchmarks 31 | @pytest.mark.skip 32 | @pytest.mark.slow 33 | @pytest.mark.parametrize("alg", learn_args.keys()) 34 | def test_mnist(alg): 35 | ''' 36 | Test if the algorithm can learn to classify MNIST digits. 37 | Uses CNN policy. 
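    (MnistEnv gives a reward of 1 for a correctly labelled image and 0 otherwise,
    so the 0.6 threshold passed to simple_test below corresponds roughly to 60%
    classification accuracy.)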
38 | ''' 39 | 40 | learn_kwargs = learn_args[alg] 41 | learn_kwargs.update(common_kwargs) 42 | 43 | learn = get_learn_function(alg) 44 | learn_fn = lambda e: learn(env=e, **learn_kwargs) 45 | env_fn = lambda: MnistEnv(seed=0, episode_len=100) 46 | 47 | simple_test(env_fn, learn_fn, 0.6) 48 | 49 | if __name__ == '__main__': 50 | test_mnist('deepq') 51 | -------------------------------------------------------------------------------- /baselines/common/tests/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv 3 | from baselines.run import get_learn_function 4 | from baselines.common.tests.util import simple_test 5 | 6 | common_kwargs = dict( 7 | total_timesteps=30000, 8 | network='mlp', 9 | gamma=0.9, 10 | seed=0, 11 | ) 12 | 13 | learn_kwargs = { 14 | 'a2c' : {}, 15 | 'acktr': {}, 16 | 'deepq': {}, 17 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), 18 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) 19 | } 20 | 21 | 22 | @pytest.mark.slow 23 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 24 | def test_discrete_identity(alg): 25 | ''' 26 | Test if the algorithm (with an mlp policy) 27 | can learn an identity transformation (i.e. return observation as an action) 28 | ''' 29 | 30 | kwargs = learn_kwargs[alg] 31 | kwargs.update(common_kwargs) 32 | 33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 34 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) 35 | simple_test(env_fn, learn_fn, 0.9) 36 | 37 | @pytest.mark.slow 38 | @pytest.mark.parametrize("alg", ['a2c', 'ppo2', 'trpo_mpi']) 39 | def test_continuous_identity(alg): 40 | ''' 41 | Test if the algorithm (with an mlp policy) 42 | can learn an identity transformation (i.e. return observation as an action) 43 | to a required precision 44 | ''' 45 | 46 | kwargs = learn_kwargs[alg] 47 | kwargs.update(common_kwargs) 48 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 49 | 50 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) 51 | simple_test(env_fn, learn_fn, -0.1) 52 | 53 | if __name__ == '__main__': 54 | test_continuous_identity('a2c') 55 | 56 | -------------------------------------------------------------------------------- /baselines/common/input.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from gym.spaces import Discrete, Box 3 | 4 | def observation_placeholder(ob_space, batch_size=None, name='Ob'): 5 | ''' 6 | Create placeholder to feed observations into of the size appropriate to the observation space 7 | 8 | Parameters: 9 | ---------- 10 | 11 | ob_space: gym.Space observation space 12 | 13 | batch_size: int size of the batch to be fed into input. Can be left None in most cases. 14 | 15 | name: str name of the placeholder 16 | 17 | Returns: 18 | ------- 19 | 20 | tensorflow placeholder tensor 21 | ''' 22 | 23 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \ 24 | 'Can only deal with Discrete and Box observation spaces for now' 25 | 26 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) 27 | 28 | 29 | def observation_input(ob_space, batch_size=None, name='Ob'): 30 | ''' 31 | Create placeholder to feed observations into of the size appropriate to the observation space, and add input 32 | encoder of the appropriate type. 
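    Illustrative example (a sketch; shapes assume a gym Discrete space, whose shape
    is the empty tuple):

        ob_ph, ob_enc = observation_input(Discrete(4))
        # ob_ph is an integer placeholder of shape (None,); ob_enc is its float32
        # one-hot encoding of shape (None, 4), ready to be fed into a network.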
33 | ''' 34 | 35 | placeholder = observation_placeholder(ob_space, batch_size, name) 36 | return placeholder, encode_observation(ob_space, placeholder) 37 | 38 | def encode_observation(ob_space, placeholder): 39 | ''' 40 | Encode input in the way that is appropriate to the observation space 41 | 42 | Parameters: 43 | ---------- 44 | 45 | ob_space: gym.Space observation space 46 | 47 | placeholder: tf.placeholder observation input placeholder 48 | ''' 49 | if isinstance(ob_space, Discrete): 50 | return tf.to_float(tf.one_hot(placeholder, ob_space.n)) 51 | 52 | elif isinstance(ob_space, Box): 53 | return tf.to_float(placeholder) 54 | else: 55 | raise NotImplementedError 56 | 57 | -------------------------------------------------------------------------------- /baselines/common/tests/envs/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import abstractmethod 3 | from gym import Env 4 | from gym.spaces import Discrete, Box 5 | 6 | 7 | class IdentityEnv(Env): 8 | def __init__( 9 | self, 10 | episode_len=None 11 | ): 12 | 13 | self.episode_len = episode_len 14 | self.time = 0 15 | self.reset() 16 | 17 | def reset(self): 18 | self._choose_next_state() 19 | self.time = 0 20 | self.observation_space = self.action_space 21 | 22 | return self.state 23 | 24 | def step(self, actions): 25 | rew = self._get_reward(actions) 26 | self._choose_next_state() 27 | done = False 28 | if self.episode_len and self.time >= self.episode_len: 29 | rew = 0 30 | done = True 31 | 32 | return self.state, rew, done, {} 33 | 34 | def _choose_next_state(self): 35 | self.state = self.action_space.sample() 36 | self.time += 1 37 | 38 | @abstractmethod 39 | def _get_reward(self, actions): 40 | raise NotImplementedError 41 | 42 | 43 | class DiscreteIdentityEnv(IdentityEnv): 44 | def __init__( 45 | self, 46 | dim, 47 | episode_len=None, 48 | ): 49 | 50 | self.action_space = Discrete(dim) 51 | super().__init__(episode_len=episode_len) 52 | 53 | def _get_reward(self, actions): 54 | return 1 if self.state == actions else 0 55 | 56 | 57 | class BoxIdentityEnv(IdentityEnv): 58 | def __init__( 59 | self, 60 | shape, 61 | episode_len=None, 62 | ): 63 | 64 | self.action_space = Box(low=-1.0, high=1.0, shape=shape) 65 | super().__init__(episode_len=episode_len) 66 | 67 | def _get_reward(self, actions): 68 | diff = actions - self.state 69 | diff = diff[:] 70 | return -0.5 * np.dot(diff, diff) 71 | -------------------------------------------------------------------------------- /baselines/gail/statistics.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This code is highly based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py 3 | ''' 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | import baselines.common.tf_util as U 9 | 10 | 11 | class stats(): 12 | 13 | def __init__(self, scalar_keys=[], histogram_keys=[]): 14 | self.scalar_keys = scalar_keys 15 | self.histogram_keys = histogram_keys 16 | self.scalar_summaries = [] 17 | self.scalar_summaries_ph = [] 18 | self.histogram_summaries_ph = [] 19 | self.histogram_summaries = [] 20 | with tf.variable_scope('summary'): 21 | for k in scalar_keys: 22 | ph = tf.placeholder('float32', None, name=k+'.scalar.summary') 23 | sm = tf.summary.scalar(k+'.scalar.summary', ph) 24 | self.scalar_summaries_ph.append(ph) 25 | self.scalar_summaries.append(sm) 26 | for k in histogram_keys: 27 | ph = tf.placeholder('float32', None, 
name=k+'.histogram.summary') 28 | sm = tf.summary.scalar(k+'.histogram.summary', ph) 29 | self.histogram_summaries_ph.append(ph) 30 | self.histogram_summaries.append(sm) 31 | 32 | self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries) 33 | 34 | def add_all_summary(self, writer, values, iter): 35 | # Note that the order of the incoming ```values``` should be the same as the that of the 36 | # ```scalar_keys``` given in ```__init__``` 37 | if np.sum(np.isnan(values)+0) != 0: 38 | return 39 | sess = U.get_session() 40 | keys = self.scalar_summaries_ph + self.histogram_summaries_ph 41 | feed_dict = {} 42 | for k, v in zip(keys, values): 43 | feed_dict.update({k: v}) 44 | summaries_str = sess.run(self.summaries, feed_dict) 45 | writer.add_summary(summaries_str, iter) 46 | -------------------------------------------------------------------------------- /baselines/her/experiment/play.py: -------------------------------------------------------------------------------- 1 | import click 2 | import numpy as np 3 | import pickle 4 | 5 | from baselines import logger 6 | from baselines.common import set_global_seeds 7 | import baselines.her.experiment.config as config 8 | from baselines.her.rollout import RolloutWorker 9 | 10 | 11 | @click.command() 12 | @click.argument('policy_file', type=str) 13 | @click.option('--seed', type=int, default=0) 14 | @click.option('--n_test_rollouts', type=int, default=10) 15 | @click.option('--render', type=int, default=1) 16 | def main(policy_file, seed, n_test_rollouts, render): 17 | set_global_seeds(seed) 18 | 19 | # Load policy. 20 | with open(policy_file, 'rb') as f: 21 | policy = pickle.load(f) 22 | env_name = policy.info['env_name'] 23 | 24 | # Prepare params. 25 | params = config.DEFAULT_PARAMS 26 | if env_name in config.DEFAULT_ENV_PARAMS: 27 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 28 | params['env_name'] = env_name 29 | params = config.prepare_params(params) 30 | config.log_params(params, logger=logger) 31 | 32 | dims = config.configure_dims(params) 33 | 34 | eval_params = { 35 | 'exploit': True, 36 | 'use_target_net': params['test_with_polyak'], 37 | 'compute_Q': True, 38 | 'rollout_batch_size': 1, 39 | 'render': bool(render), 40 | } 41 | 42 | for name in ['T', 'gamma', 'noise_eps', 'random_eps']: 43 | eval_params[name] = params[name] 44 | 45 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 46 | evaluator.seed(seed) 47 | 48 | # Run evaluation. 
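    # Clearing the history first means the statistics gathered by evaluator.logs('test')
    # below summarise only the n_test_rollouts episodes generated here.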
49 | evaluator.clear_history() 50 | for _ in range(n_test_rollouts): 51 | evaluator.generate_rollouts() 52 | 53 | # record logs 54 | for key, val in evaluator.logs('test'): 55 | logger.record_tabular(key, np.mean(val)) 56 | logger.dump_tabular() 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/run_atari.py: -------------------------------------------------------------------------------- 1 | from baselines import deepq 2 | from baselines.common import set_global_seeds 3 | from baselines import bench 4 | import argparse 5 | from baselines import logger 6 | from baselines.common.atari_wrappers import make_atari 7 | 8 | 9 | def main(): 10 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 11 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') 12 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 13 | parser.add_argument('--prioritized', type=int, default=1) 14 | parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6) 15 | parser.add_argument('--dueling', type=int, default=1) 16 | parser.add_argument('--num-timesteps', type=int, default=int(10e6)) 17 | parser.add_argument('--checkpoint-freq', type=int, default=10000) 18 | parser.add_argument('--checkpoint-path', type=str, default=None) 19 | 20 | args = parser.parse_args() 21 | logger.configure() 22 | set_global_seeds(args.seed) 23 | env = make_atari(args.env) 24 | env = bench.Monitor(env, logger.get_dir()) 25 | env = deepq.wrap_atari_dqn(env) 26 | 27 | deepq.learn( 28 | env, 29 | "conv_only", 30 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 31 | hiddens=[256], 32 | dueling=bool(args.dueling), 33 | lr=1e-4, 34 | total_timesteps=args.num_timesteps, 35 | buffer_size=10000, 36 | exploration_fraction=0.1, 37 | exploration_final_eps=0.01, 38 | train_freq=4, 39 | learning_starts=10000, 40 | target_network_update_freq=1000, 41 | gamma=0.99, 42 | prioritized_replay=bool(args.prioritized), 43 | prioritized_replay_alpha=args.prioritized_replay_alpha, 44 | checkpoint_freq=args.checkpoint_freq, 45 | checkpoint_path=args.checkpoint_path, 46 | ) 47 | 48 | env.close() 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /baselines/her/actor_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from baselines.her.util import store_args, nn 3 | 4 | 5 | class ActorCritic: 6 | @store_args 7 | def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, 8 | **kwargs): 9 | """The actor-critic network and related training code. 
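        Concretely (see the network construction below), the actor is a deterministic
        policy pi(o, g) = max_u * tanh(MLP([o, g])), and the critic Q(o, g, u) is an MLP
        over the normalized observation, the goal, and the action scaled by 1/max_u.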
10 | 11 | Args: 12 | inputs_tf (dict of tensors): all necessary inputs for the network: the 13 | observation (o), the goal (g), and the action (u) 14 | dimo (int): the dimension of the observations 15 | dimg (int): the dimension of the goals 16 | dimu (int): the dimension of the actions 17 | max_u (float): the maximum magnitude of actions; action outputs will be scaled 18 | accordingly 19 | o_stats (baselines.her.Normalizer): normalizer for observations 20 | g_stats (baselines.her.Normalizer): normalizer for goals 21 | hidden (int): number of hidden units that should be used in hidden layers 22 | layers (int): number of hidden layers 23 | """ 24 | self.o_tf = inputs_tf['o'] 25 | self.g_tf = inputs_tf['g'] 26 | self.u_tf = inputs_tf['u'] 27 | 28 | # Prepare inputs for actor and critic. 29 | o = self.o_stats.normalize(self.o_tf) 30 | g = self.g_stats.normalize(self.g_tf) 31 | input_pi = tf.concat(axis=1, values=[o, g]) # for actor 32 | 33 | # Networks. 34 | with tf.variable_scope('pi'): 35 | self.pi_tf = self.max_u * tf.tanh(nn( 36 | input_pi, [self.hidden] * self.layers + [self.dimu])) 37 | with tf.variable_scope('Q'): 38 | # for policy training 39 | input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u]) 40 | self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) 41 | # for critic training 42 | input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u]) 43 | self._input_Q = input_Q # exposed for tests 44 | self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) 45 | -------------------------------------------------------------------------------- /baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | globalsum = np.zeros_like(localsum) 16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 18 | 19 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 20 | x = np.asarray(x) 21 | assert x.ndim > 0 22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 23 | sqdiffs = np.square(x - mean) 24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 25 | assert count1 == count 26 | std = np.sqrt(meansqdiff) 27 | if not keepdims: 28 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 29 | mean = mean.reshape(newshape) 30 | std = std.reshape(newshape) 31 | return mean, std, count 32 | 33 | 34 | def test_runningmeanstd(): 35 | import subprocess 36 | subprocess.check_call(['mpirun', '-np', '3', 37 | 'python','-c', 38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 39 | 40 | def _helper_runningmeanstd(): 41 | comm = MPI.COMM_WORLD 42 | np.random.seed(0) 43 | for (triple,axis) in [ 44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 47 | ]: 48 | 49 | 50 | x = np.concatenate(triple, axis=axis) 51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 52 | 53 | 54 
| ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 55 | 56 | for (a1,a2) in zipsame(ms1, ms2): 57 | print(a1, a2) 58 | assert np.allclose(a1, a2) 59 | print("ok!") 60 | 61 | -------------------------------------------------------------------------------- /baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | import shlex 6 | import subprocess 7 | 8 | # ================================================================ 9 | # Misc 10 | # ================================================================ 11 | 12 | def fmt_row(width, row, header=False): 13 | out = " | ".join(fmt_item(x, width) for x in row) 14 | if header: out = out + "\n" + "-"*len(out) 15 | return out 16 | 17 | def fmt_item(x, l): 18 | if isinstance(x, np.ndarray): 19 | assert x.ndim==0 20 | x = x.item() 21 | if isinstance(x, (float, np.float32, np.float64)): 22 | v = abs(x) 23 | if (v < 1e-4 or v > 1e+4) and v > 0: 24 | rep = "%7.2e" % x 25 | else: 26 | rep = "%7.5f" % x 27 | else: rep = str(x) 28 | return " "*(l - len(rep)) + rep 29 | 30 | color2num = dict( 31 | gray=30, 32 | red=31, 33 | green=32, 34 | yellow=33, 35 | blue=34, 36 | magenta=35, 37 | cyan=36, 38 | white=37, 39 | crimson=38 40 | ) 41 | 42 | def colorize(string, color='green', bold=False, highlight=False): 43 | attr = [] 44 | num = color2num[color] 45 | if highlight: num += 10 46 | attr.append(str(num)) 47 | if bold: attr.append('1') 48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 49 | 50 | def print_cmd(cmd, dry=False): 51 | if isinstance(cmd, str): # for shell=True 52 | pass 53 | else: 54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd) 55 | print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd)) 56 | 57 | 58 | def get_git_commit(cwd=None): 59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd).decode('utf8') 60 | 61 | def ccap(cmd, dry=False, env=None, **kwargs): 62 | print_cmd(cmd, dry) 63 | if not dry: 64 | subprocess.check_call(cmd, env=env, **kwargs) 65 | 66 | 67 | MESSAGE_DEPTH = 0 68 | 69 | @contextmanager 70 | def timed(msg): 71 | global MESSAGE_DEPTH #pylint: disable=W0603 72 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 73 | tstart = time.time() 74 | MESSAGE_DEPTH += 1 75 | yield 76 | MESSAGE_DEPTH -= 1 77 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 78 | -------------------------------------------------------------------------------- /baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += 
cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /baselines/common/tests/envs/mnist_env.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import numpy as np 3 | import tempfile 4 | import filelock 5 | from gym import Env 6 | from gym.spaces import Discrete, Box 7 | 8 | 9 | 10 | class MnistEnv(Env): 11 | def __init__( 12 | self, 13 | seed=0, 14 | episode_len=None, 15 | no_images=None 16 | ): 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | # we could use temporary directory for this with a context manager and 19 | # TemporaryDirecotry, but then each test that uses mnist would re-download the data 20 | # this way the data is not cleaned up, but we only download it once per machine 21 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') 22 | with filelock.FileLock(mnist_path + '.lock'): 23 | self.mnist = input_data.read_data_sets(mnist_path) 24 | 25 | self.np_random = np.random.RandomState() 26 | self.np_random.seed(seed) 27 | 28 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) 29 | self.action_space = Discrete(10) 30 | self.episode_len = episode_len 31 | self.time = 0 32 | self.no_images = no_images 33 | 34 | self.train_mode() 35 | self.reset() 36 | 37 | def reset(self): 38 | self._choose_next_state() 39 | self.time = 0 40 | 41 | return self.state[0] 42 | 43 | def step(self, actions): 44 | rew = self._get_reward(actions) 45 | self._choose_next_state() 46 | done = False 47 | if self.episode_len and self.time >= self.episode_len: 48 | rew = 0 49 | done = True 50 | 51 | return self.state[0], rew, done, {} 52 | 53 | def train_mode(self): 54 | self.dataset = self.mnist.train 55 | 56 | def test_mode(self): 57 | self.dataset = self.mnist.test 58 | 59 | def _choose_next_state(self): 60 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 61 | index = self.np_random.randint(0, max_index) 62 | image = self.dataset.images[index].reshape(28,28,1)*255 63 | label = self.dataset.labels[index] 64 | self.state = (image, label) 65 | self.time += 1 66 | 67 | def _get_reward(self, actions): 68 | return 1 if self.state[1] == actions else 0 69 | 70 | 71 | 
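# Illustrative smoke test -- a sketch added for clarity, not part of the original file.
# It drives the environment with random actions; running it downloads the MNIST data
# via the TensorFlow tutorial loader used in __init__ above.
if __name__ == '__main__':
    env = MnistEnv(seed=0, episode_len=10)
    ob = env.reset()
    total_rew, done = 0, False
    while not done:
        ob, rew, done, _ = env.step(env.action_space.sample())
        total_rew += rew
    # With random guesses over 10 classes, total_rew is about 1 on average.
    print('episode reward with random actions:', total_rew)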
-------------------------------------------------------------------------------- /baselines/ddpg/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class AdaptiveParamNoiseSpec(object): 5 | def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01): 6 | self.initial_stddev = initial_stddev 7 | self.desired_action_stddev = desired_action_stddev 8 | self.adoption_coefficient = adoption_coefficient 9 | 10 | self.current_stddev = initial_stddev 11 | 12 | def adapt(self, distance): 13 | if distance > self.desired_action_stddev: 14 | # Decrease stddev. 15 | self.current_stddev /= self.adoption_coefficient 16 | else: 17 | # Increase stddev. 18 | self.current_stddev *= self.adoption_coefficient 19 | 20 | def get_stats(self): 21 | stats = { 22 | 'param_noise_stddev': self.current_stddev, 23 | } 24 | return stats 25 | 26 | def __repr__(self): 27 | fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})' 28 | return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient) 29 | 30 | 31 | class ActionNoise(object): 32 | def reset(self): 33 | pass 34 | 35 | 36 | class NormalActionNoise(ActionNoise): 37 | def __init__(self, mu, sigma): 38 | self.mu = mu 39 | self.sigma = sigma 40 | 41 | def __call__(self): 42 | return np.random.normal(self.mu, self.sigma) 43 | 44 | def __repr__(self): 45 | return 'NormalActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) 46 | 47 | 48 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 49 | class OrnsteinUhlenbeckActionNoise(ActionNoise): 50 | def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None): 51 | self.theta = theta 52 | self.mu = mu 53 | self.sigma = sigma 54 | self.dt = dt 55 | self.x0 = x0 56 | self.reset() 57 | 58 | def __call__(self): 59 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) 60 | self.x_prev = x 61 | return x 62 | 63 | def reset(self): 64 | self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) 65 | 66 | def __repr__(self): 67 | return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) 68 | -------------------------------------------------------------------------------- /baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 
28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) -------------------------------------------------------------------------------- /baselines/ppo1/cnn_policy.py: -------------------------------------------------------------------------------- 1 | import baselines.common.tf_util as U 2 | import tensorflow as tf 3 | import gym 4 | from baselines.common.distributions import make_pdtype 5 | 6 | class CnnPolicy(object): 7 | recurrent = False 8 | def __init__(self, name, ob_space, ac_space, kind='large'): 9 | with tf.variable_scope(name): 10 | self._init(ob_space, ac_space, kind) 11 | self.scope = tf.get_variable_scope().name 12 | 13 | def _init(self, ob_space, ac_space, kind): 14 | assert isinstance(ob_space, gym.spaces.Box) 15 | 16 | self.pdtype = pdtype = make_pdtype(ac_space) 17 | sequence_length = None 18 | 19 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 20 | 21 | x = ob / 255.0 22 | if kind == 'small': # from A3C paper 23 | x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) 24 | x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) 25 | x = U.flattenallbut0(x) 26 | x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) 27 | elif kind == 'large': # Nature DQN 28 | x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID")) 29 | x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID")) 30 | x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID")) 31 | x = U.flattenallbut0(x) 32 | x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0))) 33 | else: 34 | raise NotImplementedError 35 | 36 | logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01)) 37 | self.pd = pdtype.pdfromflat(logits) 38 | self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0] 39 | 40 | self.state_in = [] 41 | self.state_out = [] 42 | 43 | stochastic = 
tf.placeholder(dtype=tf.bool, shape=()) 44 | ac = self.pd.sample() # XXX 45 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 46 | 47 | def act(self, stochastic, ob): 48 | ac1, vpred1 = self._act(stochastic, ob[None]) 49 | return ac1[0], vpred1[0] 50 | def get_variables(self): 51 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 52 | def get_trainable_variables(self): 53 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 54 | def get_initial_state(self): 55 | return [] 56 | 57 | -------------------------------------------------------------------------------- /baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from . import VecEnv 4 | from .util import copy_obs_dict, dict_to_obs, obs_space_info 5 | 6 | class DummyVecEnv(VecEnv): 7 | def __init__(self, env_fns): 8 | self.envs = [fn() for fn in env_fns] 9 | env = self.envs[0] 10 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 11 | obs_space = env.observation_space 12 | 13 | self.keys, shapes, dtypes = obs_space_info(obs_space) 14 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 15 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 16 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 17 | self.buf_infos = [{} for _ in range(self.num_envs)] 18 | self.actions = None 19 | 20 | def step_async(self, actions): 21 | listify = True 22 | try: 23 | if len(actions) == self.num_envs: 24 | listify = False 25 | except TypeError: 26 | pass 27 | 28 | if not listify: 29 | self.actions = actions 30 | else: 31 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) 32 | self.actions = [actions] 33 | 34 | def step_wait(self): 35 | for e in range(self.num_envs): 36 | action = self.actions[e] 37 | if isinstance(self.envs[e].action_space, spaces.Discrete): 38 | action = int(action) 39 | 40 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) 41 | if self.buf_dones[e]: 42 | obs = self.envs[e].reset() 43 | self._save_obs(e, obs) 44 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), 45 | self.buf_infos.copy()) 46 | 47 | def reset(self): 48 | for e in range(self.num_envs): 49 | obs = self.envs[e].reset() 50 | self._save_obs(e, obs) 51 | return self._obs_from_buf() 52 | 53 | def close(self): 54 | return 55 | 56 | def _save_obs(self, e, obs): 57 | for k in self.keys: 58 | if k is None: 59 | self.buf_obs[k][e] = obs 60 | else: 61 | self.buf_obs[k][e] = obs[k] 62 | 63 | def _obs_from_buf(self): 64 | return dict_to_obs(copy_obs_dict(self.buf_obs)) 65 | 66 | def get_images(self): 67 | return [env.render(mode='rgb_array') for env in self.envs] 68 | 69 | -------------------------------------------------------------------------------- /baselines/ppo1/run_humanoid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 4 | from baselines.common import tf_util as U 5 | from baselines import logger 6 | 7 | import gym 8 | 9 | def train(num_timesteps, seed, model_path=None): 10 | env_id = 'Humanoid-v2' 11 | from baselines.ppo1 import mlp_policy, pposgd_simple 12 | 
U.make_session(num_cpu=1).__enter__() 13 | def policy_fn(name, ob_space, ac_space): 14 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 15 | hid_size=64, num_hid_layers=2) 16 | env = make_mujoco_env(env_id, seed) 17 | 18 | # parameters below were the best found in a simple random search 19 | # these are good enough to make humanoid walk, but whether those are 20 | # an absolute best or not is not certain 21 | env = RewScale(env, 0.1) 22 | pi = pposgd_simple.learn(env, policy_fn, 23 | max_timesteps=num_timesteps, 24 | timesteps_per_actorbatch=2048, 25 | clip_param=0.2, entcoeff=0.0, 26 | optim_epochs=10, 27 | optim_stepsize=3e-4, 28 | optim_batchsize=64, 29 | gamma=0.99, 30 | lam=0.95, 31 | schedule='linear', 32 | ) 33 | env.close() 34 | if model_path: 35 | U.save_state(model_path) 36 | 37 | return pi 38 | 39 | class RewScale(gym.RewardWrapper): 40 | def __init__(self, env, scale): 41 | gym.RewardWrapper.__init__(self, env) 42 | self.scale = scale 43 | def reward(self, r): 44 | return r * self.scale 45 | 46 | def main(): 47 | logger.configure() 48 | parser = mujoco_arg_parser() 49 | parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) 50 | parser.set_defaults(num_timesteps=int(2e7)) 51 | 52 | args = parser.parse_args() 53 | 54 | if not args.play: 55 | # train the model 56 | train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) 57 | else: 58 | # construct the model object, load pre-trained model and render 59 | pi = train(num_timesteps=1, seed=args.seed) 60 | U.load_state(args.model_path) 61 | env = make_mujoco_env('Humanoid-v2', seed=0) 62 | 63 | ob = env.reset() 64 | while True: 65 | action = pi.act(stochastic=False, ob=ob)[0] 66 | ob, _, done, _ = env.step(action) 67 | env.render() 68 | if done: 69 | ob = env.reset() 70 | 71 | 72 | 73 | 74 | if __name__ == '__main__': 75 | main() 76 | -------------------------------------------------------------------------------- /baselines/ddpg/models.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib as tc 3 | 4 | 5 | class Model(object): 6 | def __init__(self, name): 7 | self.name = name 8 | 9 | @property 10 | def vars(self): 11 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 12 | 13 | @property 14 | def trainable_vars(self): 15 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name) 16 | 17 | @property 18 | def perturbable_vars(self): 19 | return [var for var in self.trainable_vars if 'LayerNorm' not in var.name] 20 | 21 | 22 | class Actor(Model): 23 | def __init__(self, nb_actions, name='actor', layer_norm=True): 24 | super(Actor, self).__init__(name=name) 25 | self.nb_actions = nb_actions 26 | self.layer_norm = layer_norm 27 | 28 | def __call__(self, obs, reuse=False): 29 | with tf.variable_scope(self.name) as scope: 30 | if reuse: 31 | scope.reuse_variables() 32 | 33 | x = obs 34 | x = tf.layers.dense(x, 64) 35 | if self.layer_norm: 36 | x = tc.layers.layer_norm(x, center=True, scale=True) 37 | x = tf.nn.relu(x) 38 | 39 | x = tf.layers.dense(x, 64) 40 | if self.layer_norm: 41 | x = tc.layers.layer_norm(x, center=True, scale=True) 42 | x = tf.nn.relu(x) 43 | 44 | x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) 45 | x = tf.nn.tanh(x) 46 | return x 47 | 48 | 49 | class Critic(Model): 50 | def __init__(self, name='critic', layer_norm=True): 51 | 
super(Critic, self).__init__(name=name) 52 | self.layer_norm = layer_norm 53 | 54 | def __call__(self, obs, action, reuse=False): 55 | with tf.variable_scope(self.name) as scope: 56 | if reuse: 57 | scope.reuse_variables() 58 | 59 | x = obs 60 | x = tf.layers.dense(x, 64) 61 | if self.layer_norm: 62 | x = tc.layers.layer_norm(x, center=True, scale=True) 63 | x = tf.nn.relu(x) 64 | 65 | x = tf.concat([x, action], axis=-1) 66 | x = tf.layers.dense(x, 64) 67 | if self.layer_norm: 68 | x = tc.layers.layer_norm(x, center=True, scale=True) 69 | x = tf.nn.relu(x) 70 | 71 | x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) 72 | return x 73 | 74 | @property 75 | def output_vars(self): 76 | output_vars = [var for var in self.trainable_vars if 'output' in var.name] 77 | return output_vars 78 | -------------------------------------------------------------------------------- /baselines/gail/result/gail-result.md: -------------------------------------------------------------------------------- 1 | # Results of GAIL/BC on Mujoco 2 | 3 | Here's the extensive experimental results of applying GAIL/BC on Mujoco environments, including 4 | Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, HumanoidStandup-v1. Every imitator is evaluated with seed to be 0. 5 | 6 | ## Results 7 | 8 | ### Training through iterations 9 | 10 | - Hoppers-v1 11 | 12 | 13 | - HalfCheetah-v1 14 | 15 | 16 | - Walker2d-v1 17 | 18 | 19 | - Humanoid-v1 20 | 21 | 22 | - HumanoidStandup-v1 23 | 24 | 25 | For details (e.g., adversarial loss, discriminator accuracy, etc.) about GAIL training, please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing) 26 | 27 | ### Determinstic Polciy (Set std=0) 28 | | | Un-normalized | Normalized | 29 | |---|---|---| 30 | | Hopper-v1 | | | 31 | | HalfCheetah-v1 | | | 32 | | Walker2d-v1 | | | 33 | | Humanoid-v1 | | | 34 | | HumanoidStandup-v1 | | | 35 | 36 | ### Stochatic Policy 37 | | | Un-normalized | Normalized | 38 | |---|---|---| 39 | | Hopper-v1 | | | 40 | | HalfCheetah-v1 | | | 41 | | Walker2d-v1 | | | 42 | | Humanoid-v1 | | | 43 | | HumanoidStandup-v1 | | | 44 | 45 | ### details about GAIL imitator 46 | 47 | For all environments, the 48 | imitator is trained with 1, 5, 10, 50 trajectories, where each trajectory contains at most 49 | 1024 transitions, and seed 0, 1, 2, 3, respectively. 50 | 51 | ### details about the BC imitators 52 | 53 | All BC imitators are trained with seed 0. 
54 | -------------------------------------------------------------------------------- /baselines/acer/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from baselines.common.runners import AbstractEnvRunner 3 | 4 | class Runner(AbstractEnvRunner): 5 | 6 | def __init__(self, env, model, nsteps, nstack): 7 | super().__init__(env=env, model=model, nsteps=nsteps) 8 | self.nstack = nstack 9 | nh, nw, nc = env.observation_space.shape 10 | self.nc = nc # nc = 1 for atari, but just in case 11 | self.nact = env.action_space.n 12 | nenv = self.nenv 13 | self.nbatch = nenv * nsteps 14 | self.batch_ob_shape = (nenv*(nsteps+1), nh, nw, nc*nstack) 15 | self.obs = np.zeros((nenv, nh, nw, nc * nstack), dtype=np.uint8) 16 | obs = env.reset() 17 | self.update_obs(obs) 18 | 19 | def update_obs(self, obs, dones=None): 20 | #self.obs = obs 21 | if dones is not None: 22 | self.obs *= (1 - dones.astype(np.uint8))[:, None, None, None] 23 | self.obs = np.roll(self.obs, shift=-self.nc, axis=3) 24 | self.obs[:, :, :, -self.nc:] = obs[:, :, :, :] 25 | 26 | def run(self): 27 | enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps 28 | mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], [] 29 | for _ in range(self.nsteps): 30 | actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones) 31 | mb_obs.append(np.copy(self.obs)) 32 | mb_actions.append(actions) 33 | mb_mus.append(mus) 34 | mb_dones.append(self.dones) 35 | obs, rewards, dones, _ = self.env.step(actions) 36 | # states information for statefull models like LSTM 37 | self.states = states 38 | self.dones = dones 39 | self.update_obs(obs, dones) 40 | mb_rewards.append(rewards) 41 | enc_obs.append(obs) 42 | mb_obs.append(np.copy(self.obs)) 43 | mb_dones.append(self.dones) 44 | 45 | enc_obs = np.asarray(enc_obs, dtype=np.uint8).swapaxes(1, 0) 46 | mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0) 47 | mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0) 48 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 49 | mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0) 50 | 51 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 52 | 53 | mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done 54 | mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards 55 | 56 | # shapes are now [nenv, nsteps, []] 57 | # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy. 
58 | 59 | return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks 60 | 61 | -------------------------------------------------------------------------------- /baselines/acktr/value_functions.py: -------------------------------------------------------------------------------- 1 | from baselines import logger 2 | import numpy as np 3 | import baselines.common as common 4 | from baselines.common import tf_util as U 5 | import tensorflow as tf 6 | from baselines.acktr import kfac 7 | from baselines.acktr.utils import dense 8 | 9 | class NeuralNetValueFunction(object): 10 | def __init__(self, ob_dim, ac_dim): #pylint: disable=W0613 11 | X = tf.placeholder(tf.float32, shape=[None, ob_dim*2+ac_dim*2+2]) # batch of observations 12 | vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg') 13 | wd_dict = {} 14 | h1 = tf.nn.elu(dense(X, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) 15 | h2 = tf.nn.elu(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) 16 | vpred_n = dense(h2, 1, "hfinal", weight_init=U.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)[:,0] 17 | sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n)) 18 | wd_loss = tf.get_collection("vf_losses", None) 19 | loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss) 20 | loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n))) 21 | self._predict = U.function([X], vpred_n) 22 | optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001*(1-0.9), momentum=0.9, \ 23 | clip_kl=0.3, epsilon=0.1, stats_decay=0.95, \ 24 | async_=1, kfac_update=2, cold_iter=50, \ 25 | weight_decay_dict=wd_dict, max_grad_norm=None) 26 | vf_var_list = [] 27 | for var in tf.trainable_variables(): 28 | if "vf" in var.name: 29 | vf_var_list.append(var) 30 | 31 | update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list) 32 | self.do_update = U.function([X, vtarg_n], update_op) #pylint: disable=E1101 33 | U.initialize() # Initialize uninitialized TF variables 34 | def _preproc(self, path): 35 | l = pathlength(path) 36 | al = np.arange(l).reshape(-1,1)/10.0 37 | act = path["action_dist"].astype('float32') 38 | X = np.concatenate([path['observation'], act, al, np.ones((l, 1))], axis=1) 39 | return X 40 | def predict(self, path): 41 | return self._predict(self._preproc(path)) 42 | def fit(self, paths, targvals): 43 | X = np.concatenate([self._preproc(p) for p in paths]) 44 | y = np.concatenate(targvals) 45 | logger.record_tabular("EVBefore", common.explained_variance(self._predict(X), y)) 46 | for _ in range(25): self.do_update(X, y) 47 | logger.record_tabular("EVAfter", common.explained_variance(self._predict(X), y)) 48 | 49 | def pathlength(path): 50 | return path["reward"].shape[0] 51 | -------------------------------------------------------------------------------- /baselines/a2c/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from baselines.a2c.utils import discount_with_dones 3 | from baselines.common.runners import AbstractEnvRunner 4 | 5 | class Runner(AbstractEnvRunner): 6 | 7 | def __init__(self, env, model, nsteps=5, gamma=0.99): 8 | super().__init__(env=env, model=model, nsteps=nsteps) 9 | self.gamma = gamma 10 | self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()] 11 | self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype 12 | 13 | 
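    # Note on the return computation in run() below: when gamma > 0 and the rollout does
    # not end in a terminal state, rewards are discounted with a bootstrap from the value
    # function, e.g. for rewards [r0, r1] and bootstrap value v:
    #     R0 = r0 + gamma*r1 + gamma**2 * v
    #     R1 = r1 + gamma*v
    # which is what discount_with_dones(rewards + [value], dones + [0], gamma)[:-1] computes.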
def run(self): 14 | mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] 15 | mb_states = self.states 16 | for n in range(self.nsteps): 17 | actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones) 18 | mb_obs.append(np.copy(self.obs)) 19 | mb_actions.append(actions) 20 | mb_values.append(values) 21 | mb_dones.append(self.dones) 22 | obs, rewards, dones, _ = self.env.step(actions) 23 | self.states = states 24 | self.dones = dones 25 | for n, done in enumerate(dones): 26 | if done: 27 | self.obs[n] = self.obs[n]*0 28 | self.obs = obs 29 | mb_rewards.append(rewards) 30 | mb_dones.append(self.dones) 31 | #batch of steps to batch of rollouts 32 | 33 | mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape) 34 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 35 | mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) 36 | mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) 37 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 38 | mb_masks = mb_dones[:, :-1] 39 | mb_dones = mb_dones[:, 1:] 40 | 41 | 42 | if self.gamma > 0.0: 43 | #discount/bootstrap off value fn 44 | last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist() 45 | for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): 46 | rewards = rewards.tolist() 47 | dones = dones.tolist() 48 | if dones[-1] == 0: 49 | rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] 50 | else: 51 | rewards = discount_with_dones(rewards, dones, self.gamma) 52 | 53 | mb_rewards[n] = rewards 54 | 55 | mb_actions = mb_actions.reshape(self.batch_action_shape) 56 | 57 | mb_rewards = mb_rewards.flatten() 58 | mb_values = mb_values.flatten() 59 | mb_masks = mb_masks.flatten() 60 | return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values 61 | -------------------------------------------------------------------------------- /baselines/deepq/utils.py: -------------------------------------------------------------------------------- 1 | from baselines.common.input import observation_input 2 | from baselines.common.tf_util import adjust_shape 3 | 4 | import tensorflow as tf 5 | 6 | # ================================================================ 7 | # Placeholders 8 | # ================================================================ 9 | 10 | 11 | class TfInput(object): 12 | def __init__(self, name="(unnamed)"): 13 | """Generalized Tensorflow placeholder. The main differences are: 14 | - possibly uses multiple placeholders internally and returns multiple values 15 | - can apply light postprocessing to the value feed to placeholder. 16 | """ 17 | self.name = name 18 | 19 | def get(self): 20 | """Return the tf variable(s) representing the possibly postprocessed value 21 | of placeholder(s). 
22 | """ 23 | raise NotImplemented() 24 | 25 | def make_feed_dict(data): 26 | """Given data input it to the placeholder(s).""" 27 | raise NotImplemented() 28 | 29 | 30 | class PlaceholderTfInput(TfInput): 31 | def __init__(self, placeholder): 32 | """Wrapper for regular tensorflow placeholder.""" 33 | super().__init__(placeholder.name) 34 | self._placeholder = placeholder 35 | 36 | def get(self): 37 | return self._placeholder 38 | 39 | def make_feed_dict(self, data): 40 | return {self._placeholder: adjust_shape(self._placeholder, data)} 41 | 42 | 43 | class Uint8Input(PlaceholderTfInput): 44 | def __init__(self, shape, name=None): 45 | """Takes input in uint8 format which is cast to float32 and divided by 255 46 | before passing it to the model. 47 | 48 | On GPU this ensures lower data transfer times. 49 | 50 | Parameters 51 | ---------- 52 | shape: [int] 53 | shape of the tensor. 54 | name: str 55 | name of the underlying placeholder 56 | """ 57 | 58 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 59 | self._shape = shape 60 | self._output = tf.cast(super().get(), tf.float32) / 255.0 61 | 62 | def get(self): 63 | return self._output 64 | 65 | 66 | class ObservationInput(PlaceholderTfInput): 67 | def __init__(self, observation_space, name=None): 68 | """Creates an input placeholder tailored to a specific observation space 69 | 70 | Parameters 71 | ---------- 72 | 73 | observation_space: 74 | observation space of the environment. Should be one of the gym.spaces types 75 | name: str 76 | tensorflow name of the underlying placeholder 77 | """ 78 | inpt, self.processed_inpt = observation_input(observation_space, name=name) 79 | super().__init__(inpt) 80 | 81 | def get(self): 82 | return self.processed_inpt 83 | 84 | 85 | -------------------------------------------------------------------------------- /baselines/her/her.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): 5 | """Creates a sample function that can be used for HER experience replay. 6 | 7 | Args: 8 | replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none', 9 | regular DDPG experience replay is used 10 | replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times 11 | as many HER replays as regular replays are used) 12 | reward_fun (function): function to re-compute the reward with substituted goals 13 | """ 14 | if replay_strategy == 'future': 15 | future_p = 1 - (1. / (1 + replay_k)) 16 | else: # 'replay_strategy' == 'none' 17 | future_p = 0 18 | 19 | def _sample_her_transitions(episode_batch, batch_size_in_transitions): 20 | """episode_batch is {key: array(buffer_size x T x dim_key)} 21 | """ 22 | T = episode_batch['u'].shape[1] 23 | rollout_batch_size = episode_batch['u'].shape[0] 24 | batch_size = batch_size_in_transitions 25 | 26 | # Select which episodes and time steps to use. 27 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 28 | t_samples = np.random.randint(T, size=batch_size) 29 | transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() 30 | for key in episode_batch.keys()} 31 | 32 | # Select future time indexes proportional with probability future_p. These 33 | # will be used for HER replay by substituting in future goals. 
34 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 35 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples) 36 | future_offset = future_offset.astype(int) 37 | future_t = (t_samples + 1 + future_offset)[her_indexes] 38 | 39 | # Replace goal with achieved goal but only for the previously-selected 40 | # HER transitions (as defined by her_indexes). For the other transitions, 41 | # keep the original goal. 42 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 43 | transitions['g'][her_indexes] = future_ag 44 | 45 | # Reconstruct info dictionary for reward computation. 46 | info = {} 47 | for key, value in transitions.items(): 48 | if key.startswith('info_'): 49 | info[key.replace('info_', '')] = value 50 | 51 | # Re-compute reward since we may have substituted the goal. 52 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 53 | reward_params['info'] = info 54 | transitions['r'] = reward_fun(**reward_params) 55 | 56 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 57 | for k in transitions.keys()} 58 | 59 | assert(transitions['u'].shape[0] == batch_size_in_transitions) 60 | 61 | return transitions 62 | 63 | return _sample_her_transitions 64 | -------------------------------------------------------------------------------- /baselines/common/tests/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from gym.spaces import np_random 4 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 5 | 6 | N_TRIALS = 10000 7 | N_EPISODES = 100 8 | 9 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): 10 | np.random.seed(0) 11 | np_random.seed(0) 12 | 13 | env = DummyVecEnv([env_fn]) 14 | 15 | 16 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 17 | tf.set_random_seed(0) 18 | 19 | model = learn_fn(env) 20 | 21 | sum_rew = 0 22 | done = True 23 | 24 | for i in range(n_trials): 25 | if done: 26 | obs = env.reset() 27 | state = model.initial_state 28 | 29 | if state is not None: 30 | a, v, state, _ = model.step(obs, S=state, M=[False]) 31 | else: 32 | a, v, _, _ = model.step(obs) 33 | 34 | obs, rew, done, _ = env.step(a) 35 | sum_rew += float(rew) 36 | 37 | print("Reward in {} trials is {}".format(n_trials, sum_rew)) 38 | assert sum_rew > min_reward_fraction * n_trials, \ 39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) 40 | 41 | 42 | 43 | def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): 44 | env = DummyVecEnv([env_fn]) 45 | 46 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 47 | model = learn_fn(env) 48 | 49 | N_TRIALS = 100 50 | 51 | observations, actions, rewards = rollout(env, model, N_TRIALS) 52 | rewards = [sum(r) for r in rewards] 53 | 54 | avg_rew = sum(rewards) / N_TRIALS 55 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) 56 | assert avg_rew > min_avg_reward, \ 57 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) 58 | 59 | def rollout(env, model, n_trials): 60 | rewards = [] 61 | actions = [] 62 | observations = [] 63 | 64 | for i in range(n_trials): 65 | obs = env.reset() 66 | state = model.initial_state 67 | episode_rew = [] 68 | episode_actions = [] 69 | episode_obs = [] 70 | 71 | 
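# Step through one full episode, threading the recurrent state (S) and a dummy done mask (M)
# back into model.step for stateful (e.g. LSTM) policies.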
while True: 72 | if state is not None: 73 | a, v, state, _ = model.step(obs, S=state, M=[False]) 74 | else: 75 | a,v, _, _ = model.step(obs) 76 | 77 | obs, rew, done, _ = env.step(a) 78 | 79 | episode_rew.append(rew) 80 | episode_actions.append(a) 81 | episode_obs.append(obs) 82 | 83 | if done: 84 | break 85 | 86 | rewards.append(episode_rew) 87 | actions.append(episode_actions) 88 | observations.append(episode_obs) 89 | 90 | return observations, actions, rewards 91 | 92 | -------------------------------------------------------------------------------- /baselines/ddpg/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RingBuffer(object): 5 | def __init__(self, maxlen, shape, dtype='float32'): 6 | self.maxlen = maxlen 7 | self.start = 0 8 | self.length = 0 9 | self.data = np.zeros((maxlen,) + shape).astype(dtype) 10 | 11 | def __len__(self): 12 | return self.length 13 | 14 | def __getitem__(self, idx): 15 | if idx < 0 or idx >= self.length: 16 | raise KeyError() 17 | return self.data[(self.start + idx) % self.maxlen] 18 | 19 | def get_batch(self, idxs): 20 | return self.data[(self.start + idxs) % self.maxlen] 21 | 22 | def append(self, v): 23 | if self.length < self.maxlen: 24 | # We have space, simply increase the length. 25 | self.length += 1 26 | elif self.length == self.maxlen: 27 | # No space, "remove" the first item. 28 | self.start = (self.start + 1) % self.maxlen 29 | else: 30 | # This should never happen. 31 | raise RuntimeError() 32 | self.data[(self.start + self.length - 1) % self.maxlen] = v 33 | 34 | 35 | def array_min2d(x): 36 | x = np.array(x) 37 | if x.ndim >= 2: 38 | return x 39 | return x.reshape(-1, 1) 40 | 41 | 42 | class Memory(object): 43 | def __init__(self, limit, action_shape, observation_shape): 44 | self.limit = limit 45 | 46 | self.observations0 = RingBuffer(limit, shape=observation_shape) 47 | self.actions = RingBuffer(limit, shape=action_shape) 48 | self.rewards = RingBuffer(limit, shape=(1,)) 49 | self.terminals1 = RingBuffer(limit, shape=(1,)) 50 | self.observations1 = RingBuffer(limit, shape=observation_shape) 51 | 52 | def sample(self, batch_size): 53 | # Draw such that we always have a proceeding element. 
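# Note: np.random.random_integers(n, size=k) samples integers from [1, n] inclusive and is
# deprecated in newer NumPy releases; np.random.randint(1, n + 1, size=k) is the drop-in
# replacement. Here n = self.nb_entries - 2, which keeps idx + 1 a valid index.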
54 | batch_idxs = np.random.random_integers(self.nb_entries - 2, size=batch_size) 55 | 56 | obs0_batch = self.observations0.get_batch(batch_idxs) 57 | obs1_batch = self.observations1.get_batch(batch_idxs) 58 | action_batch = self.actions.get_batch(batch_idxs) 59 | reward_batch = self.rewards.get_batch(batch_idxs) 60 | terminal1_batch = self.terminals1.get_batch(batch_idxs) 61 | 62 | result = { 63 | 'obs0': array_min2d(obs0_batch), 64 | 'obs1': array_min2d(obs1_batch), 65 | 'rewards': array_min2d(reward_batch), 66 | 'actions': array_min2d(action_batch), 67 | 'terminals1': array_min2d(terminal1_batch), 68 | } 69 | return result 70 | 71 | def append(self, obs0, action, reward, obs1, terminal1, training=True): 72 | if not training: 73 | return 74 | 75 | self.observations0.append(obs0) 76 | self.actions.append(action) 77 | self.rewards.append(reward) 78 | self.observations1.append(obs1) 79 | self.terminals1.append(terminal1) 80 | 81 | @property 82 | def nb_entries(self): 83 | return len(self.observations0) 84 | -------------------------------------------------------------------------------- /baselines/ppo1/mlp_policy.py: -------------------------------------------------------------------------------- 1 | from baselines.common.mpi_running_mean_std import RunningMeanStd 2 | import baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import gym 5 | from baselines.common.distributions import make_pdtype 6 | 7 | class MlpPolicy(object): 8 | recurrent = False 9 | def __init__(self, name, *args, **kwargs): 10 | with tf.variable_scope(name): 11 | self._init(*args, **kwargs) 12 | self.scope = tf.get_variable_scope().name 13 | 14 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 15 | assert isinstance(ob_space, gym.spaces.Box) 16 | 17 | self.pdtype = pdtype = make_pdtype(ac_space) 18 | sequence_length = None 19 | 20 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 21 | 22 | with tf.variable_scope("obfilter"): 23 | self.ob_rms = RunningMeanStd(shape=ob_space.shape) 24 | 25 | with tf.variable_scope('vf'): 26 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 27 | last_out = obz 28 | for i in range(num_hid_layers): 29 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))) 30 | self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0] 31 | 32 | with tf.variable_scope('pol'): 33 | last_out = obz 34 | for i in range(num_hid_layers): 35 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))) 36 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 37 | mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01)) 38 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) 39 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 40 | else: 41 | pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) 42 | 43 | self.pd = pdtype.pdfromflat(pdparam) 44 | 45 | self.state_in = [] 46 | self.state_out = [] 47 | 48 | stochastic = tf.placeholder(dtype=tf.bool, shape=()) 49 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) 50 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 51 | 52 | def 
act(self, stochastic, ob): 53 | ac1, vpred1 = self._act(stochastic, ob[None]) 54 | return ac1[0], vpred1[0] 55 | def get_variables(self): 56 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 57 | def get_trainable_variables(self): 58 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 59 | def get_initial_state(self): 60 | return [] 61 | 62 | -------------------------------------------------------------------------------- /baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) -------------------------------------------------------------------------------- /baselines/acer/policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 
from baselines.common.policies import nature_cnn 4 | from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample 5 | 6 | 7 | class AcerCnnPolicy(object): 8 | 9 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False): 10 | nbatch = nenv * nsteps 11 | nh, nw, nc = ob_space.shape 12 | ob_shape = (nbatch, nh, nw, nc * nstack) 13 | nact = ac_space.n 14 | X = tf.placeholder(tf.uint8, ob_shape) # obs 15 | with tf.variable_scope("model", reuse=reuse): 16 | h = nature_cnn(X) 17 | pi_logits = fc(h, 'pi', nact, init_scale=0.01) 18 | pi = tf.nn.softmax(pi_logits) 19 | q = fc(h, 'q', nact) 20 | 21 | a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead 22 | self.initial_state = [] # not stateful 23 | self.X = X 24 | self.pi = pi # actual policy params now 25 | self.pi_logits = pi_logits 26 | self.q = q 27 | self.vf = q 28 | 29 | def step(ob, *args, **kwargs): 30 | # returns actions, mus, states 31 | a0, pi0 = sess.run([a, pi], {X: ob}) 32 | return a0, pi0, [] # dummy state 33 | 34 | def out(ob, *args, **kwargs): 35 | pi0, q0 = sess.run([pi, q], {X: ob}) 36 | return pi0, q0 37 | 38 | def act(ob, *args, **kwargs): 39 | return sess.run(a, {X: ob}) 40 | 41 | self.step = step 42 | self.out = out 43 | self.act = act 44 | 45 | class AcerLstmPolicy(object): 46 | 47 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256): 48 | nbatch = nenv * nsteps 49 | nh, nw, nc = ob_space.shape 50 | ob_shape = (nbatch, nh, nw, nc * nstack) 51 | nact = ac_space.n 52 | X = tf.placeholder(tf.uint8, ob_shape) # obs 53 | M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) 54 | S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states 55 | with tf.variable_scope("model", reuse=reuse): 56 | h = nature_cnn(X) 57 | 58 | # lstm 59 | xs = batch_to_seq(h, nenv, nsteps) 60 | ms = batch_to_seq(M, nenv, nsteps) 61 | h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) 62 | h5 = seq_to_batch(h5) 63 | 64 | pi_logits = fc(h5, 'pi', nact, init_scale=0.01) 65 | pi = tf.nn.softmax(pi_logits) 66 | q = fc(h5, 'q', nact) 67 | 68 | a = sample(pi_logits) # could change this to use self.pi instead 69 | self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) 70 | self.X = X 71 | self.M = M 72 | self.S = S 73 | self.pi = pi # actual policy params now 74 | self.q = q 75 | 76 | def step(ob, state, mask, *args, **kwargs): 77 | # returns actions, mus, states 78 | a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask}) 79 | return a0, pi0, s 80 | 81 | self.step = step 82 | -------------------------------------------------------------------------------- /baselines/common/filters.py: -------------------------------------------------------------------------------- 1 | from .running_stat import RunningStat 2 | from collections import deque 3 | import numpy as np 4 | 5 | class Filter(object): 6 | def __call__(self, x, update=True): 7 | raise NotImplementedError 8 | def reset(self): 9 | pass 10 | 11 | class IdentityFilter(Filter): 12 | def __call__(self, x, update=True): 13 | return x 14 | 15 | class CompositionFilter(Filter): 16 | def __init__(self, fs): 17 | self.fs = fs 18 | def __call__(self, x, update=True): 19 | for f in self.fs: 20 | x = f(x) 21 | return x 22 | def output_shape(self, input_space): 23 | out = input_space.shape 24 | for f in self.fs: 25 | out = f.output_shape(out) 26 | return out 27 | 28 | class ZFilter(Filter): 29 | """ 30 | y = (x-mean)/std 31 | using running estimates of mean,std 32 | """ 33 | 34 | def __init__(self, 
shape, demean=True, destd=True, clip=10.0): 35 | self.demean = demean 36 | self.destd = destd 37 | self.clip = clip 38 | 39 | self.rs = RunningStat(shape) 40 | 41 | def __call__(self, x, update=True): 42 | if update: self.rs.push(x) 43 | if self.demean: 44 | x = x - self.rs.mean 45 | if self.destd: 46 | x = x / (self.rs.std+1e-8) 47 | if self.clip: 48 | x = np.clip(x, -self.clip, self.clip) 49 | return x 50 | def output_shape(self, input_space): 51 | return input_space.shape 52 | 53 | class AddClock(Filter): 54 | def __init__(self): 55 | self.count = 0 56 | def reset(self): 57 | self.count = 0 58 | def __call__(self, x, update=True): 59 | return np.append(x, self.count/100.0) 60 | def output_shape(self, input_space): 61 | return (input_space.shape[0]+1,) 62 | 63 | class FlattenFilter(Filter): 64 | def __call__(self, x, update=True): 65 | return x.ravel() 66 | def output_shape(self, input_space): 67 | return (int(np.prod(input_space.shape)),) 68 | 69 | class Ind2OneHotFilter(Filter): 70 | def __init__(self, n): 71 | self.n = n 72 | def __call__(self, x, update=True): 73 | out = np.zeros(self.n) 74 | out[x] = 1 75 | return out 76 | def output_shape(self, input_space): 77 | return (input_space.n,) 78 | 79 | class DivFilter(Filter): 80 | def __init__(self, divisor): 81 | self.divisor = divisor 82 | def __call__(self, x, update=True): 83 | return x / self.divisor 84 | def output_shape(self, input_space): 85 | return input_space.shape 86 | 87 | class StackFilter(Filter): 88 | def __init__(self, length): 89 | self.stack = deque(maxlen=length) 90 | def reset(self): 91 | self.stack.clear() 92 | def __call__(self, x, update=True): 93 | self.stack.append(x) 94 | while len(self.stack) < self.stack.maxlen: 95 | self.stack.append(x) 96 | return np.concatenate(self.stack, axis=-1) 97 | def output_shape(self, input_space): 98 | return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) 99 | -------------------------------------------------------------------------------- /baselines/gail/mlp_policy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | from baselines/ppo1/mlp_policy.py and add simple modification 3 | (1) add reuse argument 4 | (2) cache the `stochastic` placeholder 5 | ''' 6 | import tensorflow as tf 7 | import gym 8 | 9 | import baselines.common.tf_util as U 10 | from baselines.common.mpi_running_mean_std import RunningMeanStd 11 | from baselines.common.distributions import make_pdtype 12 | from baselines.acktr.utils import dense 13 | 14 | 15 | class MlpPolicy(object): 16 | recurrent = False 17 | 18 | def __init__(self, name, reuse=False, *args, **kwargs): 19 | with tf.variable_scope(name): 20 | if reuse: 21 | tf.get_variable_scope().reuse_variables() 22 | self._init(*args, **kwargs) 23 | self.scope = tf.get_variable_scope().name 24 | 25 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 26 | assert isinstance(ob_space, gym.spaces.Box) 27 | 28 | self.pdtype = pdtype = make_pdtype(ac_space) 29 | sequence_length = None 30 | 31 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 32 | 33 | with tf.variable_scope("obfilter"): 34 | self.ob_rms = RunningMeanStd(shape=ob_space.shape) 35 | 36 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 37 | last_out = obz 38 | for i in range(num_hid_layers): 39 | last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0))) 40 | self.vpred = 
dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] 41 | 42 | last_out = obz 43 | for i in range(num_hid_layers): 44 | last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0))) 45 | 46 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 47 | mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) 48 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) 49 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 50 | else: 51 | pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) 52 | 53 | self.pd = pdtype.pdfromflat(pdparam) 54 | 55 | self.state_in = [] 56 | self.state_out = [] 57 | 58 | # change for BC 59 | stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) 60 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) 61 | self.ac = ac 62 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 63 | 64 | def act(self, stochastic, ob): 65 | ac1, vpred1 = self._act(stochastic, ob[None]) 66 | return ac1[0], vpred1[0] 67 | 68 | def get_variables(self): 69 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 70 | 71 | def get_trainable_variables(self): 72 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 73 | 74 | def get_initial_state(self): 75 | return [] 76 | -------------------------------------------------------------------------------- /baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 
72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /baselines/common/vec_env/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiprocessing import Process, Pipe 3 | from . import VecEnv, CloudpickleWrapper 4 | 5 | def worker(remote, parent_remote, env_fn_wrapper): 6 | parent_remote.close() 7 | env = env_fn_wrapper.x() 8 | try: 9 | while True: 10 | cmd, data = remote.recv() 11 | if cmd == 'step': 12 | ob, reward, done, info = env.step(data) 13 | if done: 14 | ob = env.reset() 15 | remote.send((ob, reward, done, info)) 16 | elif cmd == 'reset': 17 | ob = env.reset() 18 | remote.send(ob) 19 | elif cmd == 'render': 20 | remote.send(env.render(mode='rgb_array')) 21 | elif cmd == 'close': 22 | remote.close() 23 | break 24 | elif cmd == 'get_spaces': 25 | remote.send((env.observation_space, env.action_space)) 26 | else: 27 | raise NotImplementedError 28 | except KeyboardInterrupt: 29 | print('SubprocVecEnv worker: got KeyboardInterrupt') 30 | finally: 31 | env.close() 32 | 33 | 34 | class SubprocVecEnv(VecEnv): 35 | def __init__(self, env_fns, spaces=None): 36 | """ 37 | envs: list of gym environments to run in subprocesses 38 | """ 39 | self.waiting = False 40 | nenvs = len(env_fns) 41 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 42 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 43 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 44 | for p in self.ps: 45 | p.daemon = True # if the main process crashes, we should not cause things to hang 46 | p.start() 47 | for remote in self.work_remotes: 48 | remote.close() 49 | 50 | self.remotes[0].send(('get_spaces', None)) 51 | observation_space, action_space = self.remotes[0].recv() 52 | self.viewer = None 53 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 54 | 55 | def step_async(self, actions): 56 | for remote, action in zip(self.remotes, actions): 57 | remote.send(('step', action)) 58 | self.waiting = True 59 | 60 | def step_wait(self): 61 | results = [remote.recv() for remote in self.remotes] 62 | self.waiting = False 63 | obs, rews, dones, infos = zip(*results) 64 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 65 | 66 | def reset(self): 67 | for remote in self.remotes: 68 | remote.send(('reset', None)) 69 | return np.stack([remote.recv() for remote in self.remotes]) 70 | 71 | def 
close_extras(self): 72 | if self.waiting: 73 | for remote in self.remotes: 74 | remote.recv() 75 | for remote in self.remotes: 76 | remote.send(('close', None)) 77 | for p in self.ps: 78 | p.join() 79 | 80 | def get_images(self): 81 | for pipe in self.remotes: 82 | pipe.send(('render', None)) 83 | imgs = [pipe.recv() for pipe in self.remotes] 84 | return imgs 85 | -------------------------------------------------------------------------------- /baselines/common/tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import pytest 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from baselines.common.tests.envs.mnist_env import MnistEnv 8 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 9 | from baselines.run import get_learn_function 10 | from baselines.common.tf_util import make_session, get_session 11 | 12 | from functools import partial 13 | 14 | 15 | learn_kwargs = { 16 | 'deepq': {}, 17 | 'a2c': {}, 18 | 'acktr': {}, 19 | 'ppo2': {'nminibatches': 1, 'nsteps': 10}, 20 | 'trpo_mpi': {}, 21 | } 22 | 23 | network_kwargs = { 24 | 'mlp': {}, 25 | 'cnn': {'pad': 'SAME'}, 26 | 'lstm': {}, 27 | 'cnn_lnlstm': {'pad': 'SAME'} 28 | } 29 | 30 | 31 | @pytest.mark.parametrize("learn_fn", learn_kwargs.keys()) 32 | @pytest.mark.parametrize("network_fn", network_kwargs.keys()) 33 | def test_serialization(learn_fn, network_fn): 34 | ''' 35 | Test if the trained model can be serialized 36 | ''' 37 | 38 | 39 | if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: 40 | # TODO make acktr work with recurrent policies 41 | # and test 42 | # github issue: https://github.com/openai/baselines/issues/194 43 | return 44 | 45 | env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)]) 46 | ob = env.reset().copy() 47 | learn = get_learn_function(learn_fn) 48 | 49 | kwargs = {} 50 | kwargs.update(network_kwargs[network_fn]) 51 | kwargs.update(learn_kwargs[learn_fn]) 52 | 53 | 54 | learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs) 55 | 56 | with tempfile.TemporaryDirectory() as td: 57 | model_path = os.path.join(td, 'serialization_test_model') 58 | 59 | with tf.Graph().as_default(), make_session().as_default(): 60 | model = learn(total_timesteps=100) 61 | model.save(model_path) 62 | mean1, std1 = _get_action_stats(model, ob) 63 | variables_dict1 = _serialize_variables() 64 | 65 | with tf.Graph().as_default(), make_session().as_default(): 66 | model = learn(total_timesteps=0, load_path=model_path) 67 | mean2, std2 = _get_action_stats(model, ob) 68 | variables_dict2 = _serialize_variables() 69 | 70 | for k, v in variables_dict1.items(): 71 | np.testing.assert_allclose(v, variables_dict2[k], atol=0.01, 72 | err_msg='saved and loaded variable {} value mismatch'.format(k)) 73 | 74 | np.testing.assert_allclose(mean1, mean2, atol=0.5) 75 | np.testing.assert_allclose(std1, std2, atol=0.5) 76 | 77 | 78 | 79 | def _serialize_variables(): 80 | sess = get_session() 81 | variables = tf.trainable_variables() 82 | values = sess.run(variables) 83 | return {var.name: value for var, value in zip(variables, values)} 84 | 85 | 86 | def _get_action_stats(model, ob): 87 | ntrials = 1000 88 | if model.initial_state is None or model.initial_state == []: 89 | actions = np.array([model.step(ob)[0] for _ in range(ntrials)]) 90 | else: 91 | actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)]) 92 | 93 | mean = np.mean(actions, axis=0) 94 | 
std = np.std(actions, axis=0) 95 | 96 | return mean, std 97 | 98 | -------------------------------------------------------------------------------- /baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 4 | 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['svg.fonttype'] = 'none' 7 | 8 | from baselines.bench.monitor import load_results 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 14 | EPISODES_WINDOW = 100 15 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 16 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 17 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 18 | 19 | def rolling_window(a, window): 20 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 21 | strides = a.strides + (a.strides[-1],) 22 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 23 | 24 | def window_func(x, y, window, func): 25 | yw = rolling_window(y, window) 26 | yw_func = func(yw, axis=-1) 27 | return x[window-1:], yw_func 28 | 29 | def ts2xy(ts, xaxis): 30 | if xaxis == X_TIMESTEPS: 31 | x = np.cumsum(ts.l.values) 32 | y = ts.r.values 33 | elif xaxis == X_EPISODES: 34 | x = np.arange(len(ts)) 35 | y = ts.r.values 36 | elif xaxis == X_WALLTIME: 37 | x = ts.t.values / 3600. 38 | y = ts.r.values 39 | else: 40 | raise NotImplementedError 41 | return x, y 42 | 43 | def plot_curves(xy_list, xaxis, title): 44 | plt.figure(figsize=(8,2)) 45 | maxx = max(xy[0][-1] for xy in xy_list) 46 | minx = 0 47 | for (i, (x, y)) in enumerate(xy_list): 48 | color = COLORS[i] 49 | plt.scatter(x, y, s=2) 50 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 51 | plt.plot(x, y_mean, color=color) 52 | plt.xlim(minx, maxx) 53 | plt.title(title) 54 | plt.xlabel(xaxis) 55 | plt.ylabel("Episode Rewards") 56 | plt.tight_layout() 57 | 58 | def plot_results(dirs, num_timesteps, xaxis, task_name): 59 | tslist = [] 60 | for dir in dirs: 61 | ts = load_results(dir) 62 | ts = ts[ts.l.cumsum() <= num_timesteps] 63 | tslist.append(ts) 64 | xy_list = [ts2xy(ts, xaxis) for ts in tslist] 65 | plot_curves(xy_list, xaxis, task_name) 66 | 67 | # Example usage in jupyter-notebook 68 | # from baselines import log_viewer 69 | # %matplotlib inline 70 | # log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") 71 | # Here ./log is a directory containing the monitor.csv files 72 | 73 | def main(): 74 | import argparse 75 | import os 76 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 77 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log']) 78 | parser.add_argument('--num_timesteps', type=int, default=int(10e6)) 79 | parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) 80 | parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout') 81 | args = parser.parse_args() 82 | args.dirs = [os.path.abspath(dir) for dir in args.dirs] 83 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name) 84 | plt.show() 85 | 86 | if __name__ == '__main__': 87 | main() 
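The stride-trick smoothing above is easiest to see on a toy array. The following standalone snippet (an illustrative check, not part of the repository; the two helpers are copied verbatim from the file above) shows how rolling_window and window_func produce a trailing moving average:

import numpy as np

def rolling_window(a, window):
    # view of `a` as overlapping windows of the given length (no copy)
    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
    strides = a.strides + (a.strides[-1],)
    return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)

def window_func(x, y, window, func):
    # apply `func` over each trailing window of y, trimming x to match
    yw = rolling_window(y, window)
    yw_func = func(yw, axis=-1)
    return x[window-1:], yw_func

x = np.arange(6)
y = np.array([0., 1., 2., 3., 4., 5.])
print(rolling_window(y, 3))          # [[0. 1. 2.] [1. 2. 3.] [2. 3. 4.] [3. 4. 5.]]
xs, y_mean = window_func(x, y, 3, np.mean)
print(xs)                            # [2 3 4 5]
print(y_mean)                        # [1. 2. 3. 4.]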
-------------------------------------------------------------------------------- /baselines/common/mpi_util.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from mpi4py import MPI 3 | import os, numpy as np 4 | import platform 5 | import shutil 6 | import subprocess 7 | 8 | def sync_from_root(sess, variables, comm=None): 9 | """ 10 | Send the root node's parameters to every worker. 11 | Arguments: 12 | sess: the TensorFlow session. 13 | variables: all parameter variables including optimizer's 14 | """ 15 | if comm is None: comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | for var in variables: 18 | if rank == 0: 19 | comm.Bcast(sess.run(var)) 20 | else: 21 | import tensorflow as tf 22 | returned_var = np.empty(var.shape, dtype='float32') 23 | comm.Bcast(returned_var) 24 | sess.run(tf.assign(var, returned_var)) 25 | 26 | def gpu_count(): 27 | """ 28 | Count the GPUs on this machine. 29 | """ 30 | if shutil.which('nvidia-smi') is None: 31 | return 0 32 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) 33 | return max(0, len(output.split(b'\n')) - 2) 34 | 35 | def setup_mpi_gpus(): 36 | """ 37 | Set CUDA_VISIBLE_DEVICES using MPI. 38 | """ 39 | num_gpus = gpu_count() 40 | if num_gpus == 0: 41 | return 42 | local_rank, _ = get_local_rank_size(MPI.COMM_WORLD) 43 | os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus) 44 | 45 | def get_local_rank_size(comm): 46 | """ 47 | Returns the rank of each process on its machine 48 | The processes on a given machine will be assigned ranks 49 | 0, 1, 2, ..., N-1, 50 | where N is the number of processes on this machine. 51 | 52 | Useful if you want to assign one gpu per machine 53 | """ 54 | this_node = platform.node() 55 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node)) 56 | node2rankssofar = defaultdict(int) 57 | local_rank = None 58 | for (rank, node) in ranks_nodes: 59 | if rank == comm.Get_rank(): 60 | local_rank = node2rankssofar[node] 61 | node2rankssofar[node] += 1 62 | assert local_rank is not None 63 | return local_rank, node2rankssofar[this_node] 64 | 65 | def share_file(comm, path): 66 | """ 67 | Copies the file from rank 0 to all other ranks 68 | Puts it in the same place on all machines 69 | """ 70 | localrank, _ = get_local_rank_size(comm) 71 | if comm.Get_rank() == 0: 72 | with open(path, 'rb') as fh: 73 | data = fh.read() 74 | comm.bcast(data) 75 | else: 76 | data = comm.bcast(None) 77 | if localrank == 0: 78 | os.makedirs(os.path.dirname(path), exist_ok=True) 79 | with open(path, 'wb') as fh: 80 | fh.write(data) 81 | comm.Barrier() 82 | 83 | def dict_gather(comm, d, op='mean', assert_all_have_data=True): 84 | if comm is None: return d 85 | alldicts = comm.allgather(d) 86 | size = comm.size 87 | k2li = defaultdict(list) 88 | for d in alldicts: 89 | for (k,v) in d.items(): 90 | k2li[k].append(v) 91 | result = {} 92 | for (k,li) in k2li.items(): 93 | if assert_all_have_data: 94 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k) 95 | if op=='mean': 96 | result[k] = np.mean(li, axis=0) 97 | elif op=='sum': 98 | result[k] = np.sum(li, axis=0) 99 | else: 100 | assert 0, op 101 | return result 102 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/custom_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import itertools 3 | import numpy as 
np 4 | import tensorflow as tf 5 | import tensorflow.contrib.layers as layers 6 | 7 | import baselines.common.tf_util as U 8 | 9 | from baselines import logger 10 | from baselines import deepq 11 | from baselines.deepq.replay_buffer import ReplayBuffer 12 | from baselines.deepq.utils import ObservationInput 13 | from baselines.common.schedules import LinearSchedule 14 | 15 | 16 | def model(inpt, num_actions, scope, reuse=False): 17 | """This model takes as input an observation and returns values of all actions.""" 18 | with tf.variable_scope(scope, reuse=reuse): 19 | out = inpt 20 | out = layers.fully_connected(out, num_outputs=64, activation_fn=tf.nn.tanh) 21 | out = layers.fully_connected(out, num_outputs=num_actions, activation_fn=None) 22 | return out 23 | 24 | 25 | if __name__ == '__main__': 26 | with U.make_session(8): 27 | # Create the environment 28 | env = gym.make("CartPole-v0") 29 | # Create all the functions necessary to train the model 30 | act, train, update_target, debug = deepq.build_train( 31 | make_obs_ph=lambda name: ObservationInput(env.observation_space, name=name), 32 | q_func=model, 33 | num_actions=env.action_space.n, 34 | optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), 35 | ) 36 | # Create the replay buffer 37 | replay_buffer = ReplayBuffer(50000) 38 | # Create the schedule for exploration starting from 1 (every action is random) down to 39 | # 0.02 (98% of actions are selected according to values predicted by the model). 40 | exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) 41 | 42 | # Initialize the parameters and copy them to the target network. 43 | U.initialize() 44 | update_target() 45 | 46 | episode_rewards = [0.0] 47 | obs = env.reset() 48 | for t in itertools.count(): 49 | # Take action and update exploration to the newest value 50 | action = act(obs[None], update_eps=exploration.value(t))[0] 51 | new_obs, rew, done, _ = env.step(action) 52 | # Store transition in the replay buffer. 53 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 54 | obs = new_obs 55 | 56 | episode_rewards[-1] += rew 57 | if done: 58 | obs = env.reset() 59 | episode_rewards.append(0) 60 | 61 | is_solved = t > 100 and np.mean(episode_rewards[-101:-1]) >= 200 62 | if is_solved: 63 | # Show off the result 64 | env.render() 65 | else: 66 | # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 67 | if t > 1000: 68 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) 69 | train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) 70 | # Update target network periodically. 
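# update_target() below copies the online network's weights into the target network used for the
# max_a' Q_target(s', a') term of the Bellman backup; refreshing it only every 1000 steps keeps
# the regression target stable between updates.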
71 | if t % 1000 == 0: 72 | update_target() 73 | 74 | if done and len(episode_rewards) % 10 == 0: 75 | logger.record_tabular("steps", t) 76 | logger.record_tabular("episodes", len(episode_rewards)) 77 | logger.record_tabular("mean episode reward", round(np.mean(episode_rewards[-101:-1]), 1)) 78 | logger.record_tabular("% time spent exploring", int(100 * exploration.value(t))) 79 | logger.dump_tabular() 80 | -------------------------------------------------------------------------------- /baselines/acktr/kfac_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def gmatmul(a, b, transpose_a=False, transpose_b=False, reduce_dim=None): 4 | assert reduce_dim is not None 5 | 6 | # weird batch matmul 7 | if len(a.get_shape()) == 2 and len(b.get_shape()) > 2: 8 | # reshape reduce_dim to the left most dim in b 9 | b_shape = b.get_shape() 10 | if reduce_dim != 0: 11 | b_dims = list(range(len(b_shape))) 12 | b_dims.remove(reduce_dim) 13 | b_dims.insert(0, reduce_dim) 14 | b = tf.transpose(b, b_dims) 15 | b_t_shape = b.get_shape() 16 | b = tf.reshape(b, [int(b_shape[reduce_dim]), -1]) 17 | result = tf.matmul(a, b, transpose_a=transpose_a, 18 | transpose_b=transpose_b) 19 | result = tf.reshape(result, b_t_shape) 20 | if reduce_dim != 0: 21 | b_dims = list(range(len(b_shape))) 22 | b_dims.remove(0) 23 | b_dims.insert(reduce_dim, 0) 24 | result = tf.transpose(result, b_dims) 25 | return result 26 | 27 | elif len(a.get_shape()) > 2 and len(b.get_shape()) == 2: 28 | # reshape reduce_dim to the right most dim in a 29 | a_shape = a.get_shape() 30 | outter_dim = len(a_shape) - 1 31 | reduce_dim = len(a_shape) - reduce_dim - 1 32 | if reduce_dim != outter_dim: 33 | a_dims = list(range(len(a_shape))) 34 | a_dims.remove(reduce_dim) 35 | a_dims.insert(outter_dim, reduce_dim) 36 | a = tf.transpose(a, a_dims) 37 | a_t_shape = a.get_shape() 38 | a = tf.reshape(a, [-1, int(a_shape[reduce_dim])]) 39 | result = tf.matmul(a, b, transpose_a=transpose_a, 40 | transpose_b=transpose_b) 41 | result = tf.reshape(result, a_t_shape) 42 | if reduce_dim != outter_dim: 43 | a_dims = list(range(len(a_shape))) 44 | a_dims.remove(outter_dim) 45 | a_dims.insert(reduce_dim, outter_dim) 46 | result = tf.transpose(result, a_dims) 47 | return result 48 | 49 | elif len(a.get_shape()) == 2 and len(b.get_shape()) == 2: 50 | return tf.matmul(a, b, transpose_a=transpose_a, transpose_b=transpose_b) 51 | 52 | assert False, 'something went wrong' 53 | 54 | 55 | def clipoutNeg(vec, threshold=1e-6): 56 | mask = tf.cast(vec > threshold, tf.float32) 57 | return mask * vec 58 | 59 | 60 | def detectMinVal(input_mat, var, threshold=1e-6, name='', debug=False): 61 | eigen_min = tf.reduce_min(input_mat) 62 | eigen_max = tf.reduce_max(input_mat) 63 | eigen_ratio = eigen_max / eigen_min 64 | input_mat_clipped = clipoutNeg(input_mat, threshold) 65 | 66 | if debug: 67 | input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), lambda: input_mat_clipped, lambda: tf.Print( 68 | input_mat_clipped, [tf.convert_to_tensor('screwed ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), eigen_min, eigen_max, eigen_ratio])) 69 | 70 | return input_mat_clipped 71 | 72 | 73 | def factorReshape(Q, e, grad, facIndx=0, ftype='act'): 74 | grad_shape = grad.get_shape() 75 | if ftype == 'act': 76 | assert e.get_shape()[0] == grad_shape[facIndx] 77 | expanded_shape = [1, ] * len(grad_shape) 78 | expanded_shape[facIndx] = -1 79 | e = tf.reshape(e, 
expanded_shape) 80 | if ftype == 'grad': 81 | assert e.get_shape()[0] == grad_shape[len(grad_shape) - facIndx - 1] 82 | expanded_shape = [1, ] * len(grad_shape) 83 | expanded_shape[len(grad_shape) - facIndx - 1] = -1 84 | e = tf.reshape(e, expanded_shape) 85 | 86 | return Q, e 87 | -------------------------------------------------------------------------------- /baselines/acktr/policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from baselines.acktr.utils import dense, kl_div 4 | import baselines.common.tf_util as U 5 | 6 | class GaussianMlpPolicy(object): 7 | def __init__(self, ob_dim, ac_dim): 8 | # Here we'll construct a bunch of expressions, which will be used in two places: 9 | # (1) When sampling actions 10 | # (2) When computing loss functions, for the policy update 11 | # Variables specific to (1) have the word "sampled" in them, 12 | # whereas variables specific to (2) have the word "old" in them 13 | ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim*2], name="ob") # batch of observations 14 | oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions 15 | oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim*2], name="oldac_dist") # batch of actions previous action distributions 16 | adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate 17 | wd_dict = {} 18 | h1 = tf.nn.tanh(dense(ob_no, 64, "h1", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) 19 | h2 = tf.nn.tanh(dense(h1, 64, "h2", weight_init=U.normc_initializer(1.0), bias_init=0.0, weight_loss_dict=wd_dict)) 20 | mean_na = dense(h2, ac_dim, "mean", weight_init=U.normc_initializer(0.1), bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output 21 | self.wd_dict = wd_dict 22 | self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) # Variance on outputs 23 | logstd_1a = tf.expand_dims(logstd_1a, 0) 24 | std_1a = tf.exp(logstd_1a) 25 | std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) 26 | ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1) 27 | sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:,ac_dim:])) * ac_dist[:,ac_dim:] + ac_dist[:,:ac_dim] # This is the sampled action we'll perform. 
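# The next two lines evaluate the log-density of a diagonal Gaussian,
#   log N(a | mu, sigma) = -sum_i log(sigma_i) - (d/2) * log(2*pi) - 0.5 * sum_i ((a_i - mu_i) / sigma_i)^2,
# once for the freshly sampled action (logprobsampled_n) and once for the previously taken
# actions (logprob_n), with mu = ac_dist[:, :ac_dim] and sigma = ac_dist[:, ac_dim:].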
28 | logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of sampled action 29 | logprob_n = - tf.reduce_sum(tf.log(ac_dist[:,ac_dim:]), axis=1) - 0.5 * tf.log(2.0*np.pi)*ac_dim - 0.5 * tf.reduce_sum(tf.square(ac_dist[:,:ac_dim] - oldac_na) / (tf.square(ac_dist[:,ac_dim:])), axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) 30 | kl = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) 31 | #kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) # Approximation of KL divergence between old policy used to generate actions, and new policy used to compute logprob_n 32 | surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient 33 | surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy 34 | self._act = U.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) # Generate a new action and its logprob 35 | #self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) # Compute (approximate) KL divergence between old policy and new policy 36 | self.compute_kl = U.function([ob_no, oldac_dist], kl) 37 | self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) # Input and output variables needed for computing loss 38 | U.initialize() # Initialize uninitialized TF variables 39 | 40 | def act(self, ob): 41 | ac, ac_dist, logp = self._act(ob[None]) 42 | return ac[0], ac_dist[0], logp[0] 43 | -------------------------------------------------------------------------------- /baselines/common/vec_env/test_vec_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for asynchronous vectorized environments. 3 | """ 4 | 5 | import gym 6 | import numpy as np 7 | import pytest 8 | from .dummy_vec_env import DummyVecEnv 9 | from .shmem_vec_env import ShmemVecEnv 10 | from .subproc_vec_env import SubprocVecEnv 11 | 12 | 13 | def assert_envs_equal(env1, env2, num_steps): 14 | """ 15 | Compare two environments over num_steps steps and make sure 16 | that the observations produced by each are the same when given 17 | the same actions. 
18 | """ 19 | assert env1.num_envs == env2.num_envs 20 | assert env1.action_space.shape == env2.action_space.shape 21 | assert env1.action_space.dtype == env2.action_space.dtype 22 | joint_shape = (env1.num_envs,) + env1.action_space.shape 23 | 24 | try: 25 | obs1, obs2 = env1.reset(), env2.reset() 26 | assert np.array(obs1).shape == np.array(obs2).shape 27 | assert np.array(obs1).shape == joint_shape 28 | assert np.allclose(obs1, obs2) 29 | np.random.seed(1337) 30 | for _ in range(num_steps): 31 | actions = np.array(np.random.randint(0, 0x100, size=joint_shape), 32 | dtype=env1.action_space.dtype) 33 | for env in [env1, env2]: 34 | env.step_async(actions) 35 | outs1 = env1.step_wait() 36 | outs2 = env2.step_wait() 37 | for out1, out2 in zip(outs1[:3], outs2[:3]): 38 | assert np.array(out1).shape == np.array(out2).shape 39 | assert np.allclose(out1, out2) 40 | assert list(outs1[3]) == list(outs2[3]) 41 | finally: 42 | env1.close() 43 | env2.close() 44 | 45 | 46 | @pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv)) 47 | @pytest.mark.parametrize('dtype', ('uint8', 'float32')) 48 | def test_vec_env(klass, dtype): # pylint: disable=R0914 49 | """ 50 | Test that a vectorized environment is equivalent to 51 | DummyVecEnv, since DummyVecEnv is less likely to be 52 | error prone. 53 | """ 54 | num_envs = 3 55 | num_steps = 100 56 | shape = (3, 8) 57 | 58 | def make_fn(seed): 59 | """ 60 | Get an environment constructor with a seed. 61 | """ 62 | return lambda: SimpleEnv(seed, shape, dtype) 63 | fns = [make_fn(i) for i in range(num_envs)] 64 | env1 = DummyVecEnv(fns) 65 | env2 = klass(fns) 66 | assert_envs_equal(env1, env2, num_steps=num_steps) 67 | 68 | 69 | class SimpleEnv(gym.Env): 70 | """ 71 | An environment with a pre-determined observation space 72 | and RNG seed. 
73 | """ 74 | 75 | def __init__(self, seed, shape, dtype): 76 | np.random.seed(seed) 77 | self._dtype = dtype 78 | self._start_obs = np.array(np.random.randint(0, 0x100, size=shape), 79 | dtype=dtype) 80 | self._max_steps = seed + 1 81 | self._cur_obs = None 82 | self._cur_step = 0 83 | # this is 0xFF instead of 0x100 because the Box space includes 84 | # the high end, while randint does not 85 | self.action_space = gym.spaces.Box(low=0, high=0xFF, shape=shape, dtype=dtype) 86 | self.observation_space = self.action_space 87 | 88 | def step(self, action): 89 | self._cur_obs += np.array(action, dtype=self._dtype) 90 | self._cur_step += 1 91 | done = self._cur_step >= self._max_steps 92 | reward = self._cur_step / self._max_steps 93 | return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)} 94 | 95 | def reset(self): 96 | self._cur_obs = self._start_obs 97 | self._cur_step = 0 98 | return self._cur_obs 99 | 100 | def render(self, mode=None): 101 | raise NotImplementedError 102 | -------------------------------------------------------------------------------- /baselines/her/experiment/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import json 5 | import seaborn as sns; sns.set() 6 | import glob2 7 | import argparse 8 | 9 | 10 | def smooth_reward_curve(x, y): 11 | halfwidth = int(np.ceil(len(x) / 60)) # Halfwidth of our smoothing convolution 12 | k = halfwidth 13 | xsmoo = x 14 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1), 15 | mode='same') 16 | return xsmoo, ysmoo 17 | 18 | 19 | def load_results(file): 20 | if not os.path.exists(file): 21 | return None 22 | with open(file, 'r') as f: 23 | lines = [line for line in f] 24 | if len(lines) < 2: 25 | return None 26 | keys = [name.strip() for name in lines[0].split(',')] 27 | data = np.genfromtxt(file, delimiter=',', skip_header=1, filling_values=0.) 28 | if data.ndim == 1: 29 | data = data.reshape(1, -1) 30 | assert data.ndim == 2 31 | assert data.shape[-1] == len(keys) 32 | result = {} 33 | for idx, key in enumerate(keys): 34 | result[key] = data[:, idx] 35 | return result 36 | 37 | 38 | def pad(xs, value=np.nan): 39 | maxlen = np.max([len(x) for x in xs]) 40 | 41 | padded_xs = [] 42 | for x in xs: 43 | if x.shape[0] >= maxlen: 44 | padded_xs.append(x) 45 | 46 | padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value 47 | x_padded = np.concatenate([x, padding], axis=0) 48 | assert x_padded.shape[1:] == x.shape[1:] 49 | assert x_padded.shape[0] == maxlen 50 | padded_xs.append(x_padded) 51 | return np.array(padded_xs) 52 | 53 | 54 | parser = argparse.ArgumentParser() 55 | parser.add_argument('dir', type=str) 56 | parser.add_argument('--smooth', type=int, default=1) 57 | args = parser.parse_args() 58 | 59 | # Load all data. 
60 | data = {} 61 | paths = [os.path.abspath(os.path.join(path, '..')) for path in glob2.glob(os.path.join(args.dir, '**', 'progress.csv'))] 62 | for curr_path in paths: 63 | if not os.path.isdir(curr_path): 64 | continue 65 | results = load_results(os.path.join(curr_path, 'progress.csv')) 66 | if not results: 67 | print('skipping {}'.format(curr_path)) 68 | continue 69 | print('loading {} ({})'.format(curr_path, len(results['epoch']))) 70 | with open(os.path.join(curr_path, 'params.json'), 'r') as f: 71 | params = json.load(f) 72 | 73 | success_rate = np.array(results['test/success_rate']) 74 | epoch = np.array(results['epoch']) + 1 75 | env_id = params['env_name'] 76 | replay_strategy = params['replay_strategy'] 77 | 78 | if replay_strategy == 'future': 79 | config = 'her' 80 | else: 81 | config = 'ddpg' 82 | if 'Dense' in env_id: 83 | config += '-dense' 84 | else: 85 | config += '-sparse' 86 | env_id = env_id.replace('Dense', '') 87 | 88 | # Process and smooth data. 89 | assert success_rate.shape == epoch.shape 90 | x = epoch 91 | y = success_rate 92 | if args.smooth: 93 | x, y = smooth_reward_curve(epoch, success_rate) 94 | assert x.shape == y.shape 95 | 96 | if env_id not in data: 97 | data[env_id] = {} 98 | if config not in data[env_id]: 99 | data[env_id][config] = [] 100 | data[env_id][config].append((x, y)) 101 | 102 | # Plot data. 103 | for env_id in sorted(data.keys()): 104 | print('exporting {}'.format(env_id)) 105 | plt.clf() 106 | 107 | for config in sorted(data[env_id].keys()): 108 | xs, ys = zip(*data[env_id][config]) 109 | xs, ys = pad(xs), pad(ys) 110 | assert xs.shape == ys.shape 111 | 112 | plt.plot(xs[0], np.nanmedian(ys, axis=0), label=config) 113 | plt.fill_between(xs[0], np.nanpercentile(ys, 25, axis=0), np.nanpercentile(ys, 75, axis=0), alpha=0.25) 114 | plt.title(env_id) 115 | plt.xlabel('Epoch') 116 | plt.ylabel('Median Success Rate') 117 | plt.legend() 118 | plt.savefig(os.path.join(args.dir, 'fig_{}.png'.format(env_id))) 119 | -------------------------------------------------------------------------------- /baselines/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import tensorflow as tf, baselines.common.tf_util as U, numpy as np 3 | 4 | class RunningMeanStd(object): 5 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 6 | def __init__(self, epsilon=1e-2, shape=()): 7 | 8 | self._sum = tf.get_variable( 9 | dtype=tf.float64, 10 | shape=shape, 11 | initializer=tf.constant_initializer(0.0), 12 | name="runningsum", trainable=False) 13 | self._sumsq = tf.get_variable( 14 | dtype=tf.float64, 15 | shape=shape, 16 | initializer=tf.constant_initializer(epsilon), 17 | name="runningsumsq", trainable=False) 18 | self._count = tf.get_variable( 19 | dtype=tf.float64, 20 | shape=(), 21 | initializer=tf.constant_initializer(epsilon), 22 | name="count", trainable=False) 23 | self.shape = shape 24 | 25 | self.mean = tf.to_float(self._sum / self._count) 26 | self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) 27 | 28 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 29 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 30 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 31 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [], 32 | updates=[tf.assign_add(self._sum, newsum), 33 | tf.assign_add(self._sumsq, newsumsq), 
34 | tf.assign_add(self._count, newcount)]) 35 | 36 | 37 | def update(self, x): 38 | x = x.astype('float64') 39 | n = int(np.prod(self.shape)) 40 | totalvec = np.zeros(n*2+1, 'float64') 41 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) 42 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 43 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) 44 | 45 | @U.in_session 46 | def test_runningmeanstd(): 47 | for (x1, x2, x3) in [ 48 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 49 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 50 | ]: 51 | 52 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 53 | U.initialize() 54 | 55 | x = np.concatenate([x1, x2, x3], axis=0) 56 | ms1 = [x.mean(axis=0), x.std(axis=0)] 57 | rms.update(x1) 58 | rms.update(x2) 59 | rms.update(x3) 60 | ms2 = [rms.mean.eval(), rms.std.eval()] 61 | 62 | assert np.allclose(ms1, ms2) 63 | 64 | @U.in_session 65 | def test_dist(): 66 | np.random.seed(0) 67 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) 68 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) 69 | 70 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) 71 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) 72 | 73 | comm = MPI.COMM_WORLD 74 | assert comm.Get_size()==2 75 | if comm.Get_rank()==0: 76 | x1,x2,x3 = p1,p2,p3 77 | elif comm.Get_rank()==1: 78 | x1,x2,x3 = q1,q2,q3 79 | else: 80 | assert False 81 | 82 | rms = RunningMeanStd(epsilon=0.0, shape=(1,)) 83 | U.initialize() 84 | 85 | rms.update(x1) 86 | rms.update(x2) 87 | rms.update(x3) 88 | 89 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) 90 | 91 | def checkallclose(x,y): 92 | print(x,y) 93 | return np.allclose(x,y) 94 | 95 | assert checkallclose( 96 | bigvec.mean(axis=0), 97 | rms.mean.eval(), 98 | ) 99 | assert checkallclose( 100 | bigvec.std(axis=0), 101 | rms.std.eval(), 102 | ) 103 | 104 | 105 | if __name__ == "__main__": 106 | # Run with mpirun -np 2 python 107 | test_dist() 108 | -------------------------------------------------------------------------------- /baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 
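        For example, PiecewiseSchedule([(0, 1.0), (100, 0.1)]) with the default
        linear interpolation gives value(0) == 1.0, value(50) == 0.55 and, for
        t >= 100, returns `outside_value` (which must then be provided).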
41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /baselines/her/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | 5 | 6 | class ReplayBuffer: 7 | def __init__(self, buffer_shapes, size_in_transitions, T, sample_transitions): 8 | """Creates a replay buffer. 
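        Episodes are stored as complete T-step rollouts, so the buffer holds at
        most size_in_transitions // T episodes and overwrites old episodes at
        random once it is full.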
9 | 10 | Args: 11 | buffer_shapes (dict of ints): the shape for all buffers that are used in the replay 12 | buffer 13 | size_in_transitions (int): the size of the buffer, measured in transitions 14 | T (int): the time horizon for episodes 15 | sample_transitions (function): a function that samples from the replay buffer 16 | """ 17 | self.buffer_shapes = buffer_shapes 18 | self.size = size_in_transitions // T 19 | self.T = T 20 | self.sample_transitions = sample_transitions 21 | 22 | # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} 23 | self.buffers = {key: np.empty([self.size, *shape]) 24 | for key, shape in buffer_shapes.items()} 25 | 26 | # memory management 27 | self.current_size = 0 28 | self.n_transitions_stored = 0 29 | 30 | self.lock = threading.Lock() 31 | 32 | @property 33 | def full(self): 34 | with self.lock: 35 | return self.current_size == self.size 36 | 37 | def sample(self, batch_size): 38 | """Returns a dict {key: array(batch_size x shapes[key])} 39 | """ 40 | buffers = {} 41 | 42 | with self.lock: 43 | assert self.current_size > 0 44 | for key in self.buffers.keys(): 45 | buffers[key] = self.buffers[key][:self.current_size] 46 | 47 | buffers['o_2'] = buffers['o'][:, 1:, :] 48 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 49 | 50 | transitions = self.sample_transitions(buffers, batch_size) 51 | 52 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 53 | assert key in transitions, "key %s missing from transitions" % key 54 | 55 | return transitions 56 | 57 | def store_episode(self, episode_batch): 58 | """episode_batch: array(batch_size x (T or T+1) x dim_key) 59 | """ 60 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 61 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 62 | batch_size = batch_sizes[0] 63 | 64 | with self.lock: 65 | idxs = self._get_storage_idx(batch_size) 66 | 67 | # load inputs into buffers 68 | for key in self.buffers.keys(): 69 | self.buffers[key][idxs] = episode_batch[key] 70 | 71 | self.n_transitions_stored += batch_size * self.T 72 | 73 | def get_current_episode_size(self): 74 | with self.lock: 75 | return self.current_size 76 | 77 | def get_current_size(self): 78 | with self.lock: 79 | return self.current_size * self.T 80 | 81 | def get_transitions_stored(self): 82 | with self.lock: 83 | return self.n_transitions_stored 84 | 85 | def clear_buffer(self): 86 | with self.lock: 87 | self.current_size = 0 88 | 89 | def _get_storage_idx(self, inc=None): 90 | inc = inc or 1 # size increment 91 | assert inc <= self.size, "Batch committed to replay is too large!" 92 | # go consecutively until you hit the end, and then go randomly. 
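        # e.g. with size=10, current_size=8 and inc=4: the first 2 episodes fill
        # the free slots 8 and 9, and the remaining 2 overwrite randomly chosen
        # slots among the 8 already stored episodes.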
93 | if self.current_size+inc <= self.size: 94 | idx = np.arange(self.current_size, self.current_size+inc) 95 | elif self.current_size < self.size: 96 | overflow = inc - (self.size - self.current_size) 97 | idx_a = np.arange(self.current_size, self.size) 98 | idx_b = np.random.randint(0, self.current_size, overflow) 99 | idx = np.concatenate([idx_a, idx_b]) 100 | else: 101 | idx = np.random.randint(0, self.size, inc) 102 | 103 | # update replay size 104 | self.current_size = min(self.size, self.current_size+inc) 105 | 106 | if inc == 1: 107 | idx = idx[0] 108 | return idx 109 | -------------------------------------------------------------------------------- /baselines/her/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import importlib 5 | import inspect 6 | import functools 7 | 8 | import tensorflow as tf 9 | import numpy as np 10 | 11 | from baselines.common import tf_util as U 12 | 13 | 14 | def store_args(method): 15 | """Stores provided method args as instance attributes. 16 | """ 17 | argspec = inspect.getfullargspec(method) 18 | defaults = {} 19 | if argspec.defaults is not None: 20 | defaults = dict( 21 | zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) 22 | if argspec.kwonlydefaults is not None: 23 | defaults.update(argspec.kwonlydefaults) 24 | arg_names = argspec.args[1:] 25 | 26 | @functools.wraps(method) 27 | def wrapper(*positional_args, **keyword_args): 28 | self = positional_args[0] 29 | # Get default arg values 30 | args = defaults.copy() 31 | # Add provided arg values 32 | for name, value in zip(arg_names, positional_args[1:]): 33 | args[name] = value 34 | args.update(keyword_args) 35 | self.__dict__.update(args) 36 | return method(*positional_args, **keyword_args) 37 | 38 | return wrapper 39 | 40 | 41 | def import_function(spec): 42 | """Import a function identified by a string like "pkg.module:fn_name". 43 | """ 44 | mod_name, fn_name = spec.split(':') 45 | module = importlib.import_module(mod_name) 46 | fn = getattr(module, fn_name) 47 | return fn 48 | 49 | 50 | def flatten_grads(var_list, grads): 51 | """Flattens a variables and their gradients. 
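    Each gradient is reshaped to the flat size of its variable and the results
    are concatenated into a single 1-D tensor.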
52 | """ 53 | return tf.concat([tf.reshape(grad, [U.numel(v)]) 54 | for (v, grad) in zip(var_list, grads)], 0) 55 | 56 | 57 | def nn(input, layers_sizes, reuse=None, flatten=False, name=""): 58 | """Creates a simple neural network 59 | """ 60 | for i, size in enumerate(layers_sizes): 61 | activation = tf.nn.relu if i < len(layers_sizes) - 1 else None 62 | input = tf.layers.dense(inputs=input, 63 | units=size, 64 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 65 | reuse=reuse, 66 | name=name + '_' + str(i)) 67 | if activation: 68 | input = activation(input) 69 | if flatten: 70 | assert layers_sizes[-1] == 1 71 | input = tf.reshape(input, [-1]) 72 | return input 73 | 74 | 75 | def install_mpi_excepthook(): 76 | import sys 77 | from mpi4py import MPI 78 | old_hook = sys.excepthook 79 | 80 | def new_hook(a, b, c): 81 | old_hook(a, b, c) 82 | sys.stdout.flush() 83 | sys.stderr.flush() 84 | MPI.COMM_WORLD.Abort() 85 | sys.excepthook = new_hook 86 | 87 | 88 | def mpi_fork(n, extra_mpi_args=[]): 89 | """Re-launches the current script with workers 90 | Returns "parent" for original parent, "child" for MPI children 91 | """ 92 | if n <= 1: 93 | return "child" 94 | if os.getenv("IN_MPI") is None: 95 | env = os.environ.copy() 96 | env.update( 97 | MKL_NUM_THREADS="1", 98 | OMP_NUM_THREADS="1", 99 | IN_MPI="1" 100 | ) 101 | # "-bind-to core" is crucial for good performance 102 | args = ["mpirun", "-np", str(n)] + \ 103 | extra_mpi_args + \ 104 | [sys.executable] 105 | 106 | args += sys.argv 107 | subprocess.check_call(args, env=env) 108 | return "parent" 109 | else: 110 | install_mpi_excepthook() 111 | return "child" 112 | 113 | 114 | def convert_episode_to_batch_major(episode): 115 | """Converts an episode to have the batch dimension in the major (first) 116 | dimension. 117 | """ 118 | episode_batch = {} 119 | for key in episode.keys(): 120 | val = np.array(episode[key]).copy() 121 | # make inputs batch-major instead of time-major 122 | episode_batch[key] = val.swapaxes(0, 1) 123 | 124 | return episode_batch 125 | 126 | 127 | def transitions_in_episode_batch(episode_batch): 128 | """Number of transitions in a given episode batch. 129 | """ 130 | shape = episode_batch['u'].shape 131 | return shape[0] * shape[1] 132 | 133 | 134 | def reshape_for_broadcasting(source, target): 135 | """Reshapes a tensor (source) to have the correct shape and dtype of the target 136 | before broadcasting it with MPI. 
137 | """ 138 | dim = len(target.get_shape()) 139 | shape = ([1] * (dim - 1)) + [-1] 140 | return tf.reshape(tf.cast(source, target.dtype), shape) 141 | -------------------------------------------------------------------------------- /baselines/gail/dataset/mujoco_dset.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Data structure of the input .npz: 3 | the data is save in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs' 4 | the values of each item is a list storing the expert trajectory sequentially 5 | a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and get reward data['rews'][t] 6 | ''' 7 | 8 | from baselines import logger 9 | import numpy as np 10 | 11 | 12 | class Dset(object): 13 | def __init__(self, inputs, labels, randomize): 14 | self.inputs = inputs 15 | self.labels = labels 16 | assert len(self.inputs) == len(self.labels) 17 | self.randomize = randomize 18 | self.num_pairs = len(inputs) 19 | self.init_pointer() 20 | 21 | def init_pointer(self): 22 | self.pointer = 0 23 | if self.randomize: 24 | idx = np.arange(self.num_pairs) 25 | np.random.shuffle(idx) 26 | self.inputs = self.inputs[idx, :] 27 | self.labels = self.labels[idx, :] 28 | 29 | def get_next_batch(self, batch_size): 30 | # if batch_size is negative -> return all 31 | if batch_size < 0: 32 | return self.inputs, self.labels 33 | if self.pointer + batch_size >= self.num_pairs: 34 | self.init_pointer() 35 | end = self.pointer + batch_size 36 | inputs = self.inputs[self.pointer:end, :] 37 | labels = self.labels[self.pointer:end, :] 38 | self.pointer = end 39 | return inputs, labels 40 | 41 | 42 | class Mujoco_Dset(object): 43 | def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True): 44 | traj_data = np.load(expert_path) 45 | if traj_limitation < 0: 46 | traj_limitation = len(traj_data['obs']) 47 | obs = traj_data['obs'][:traj_limitation] 48 | acs = traj_data['acs'][:traj_limitation] 49 | 50 | # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length 51 | # and S is the environment observation/action space. 
52 | # Flatten to (N * L, prod(S)) 53 | self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])]) 54 | self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])]) 55 | 56 | self.rets = traj_data['ep_rets'][:traj_limitation] 57 | self.avg_ret = sum(self.rets)/len(self.rets) 58 | self.std_ret = np.std(np.array(self.rets)) 59 | if len(self.acs) > 2: 60 | self.acs = np.squeeze(self.acs) 61 | assert len(self.obs) == len(self.acs) 62 | self.num_traj = min(traj_limitation, len(traj_data['obs'])) 63 | self.num_transition = len(self.obs) 64 | self.randomize = randomize 65 | self.dset = Dset(self.obs, self.acs, self.randomize) 66 | # for behavior cloning 67 | self.train_set = Dset(self.obs[:int(self.num_transition*train_fraction), :], 68 | self.acs[:int(self.num_transition*train_fraction), :], 69 | self.randomize) 70 | self.val_set = Dset(self.obs[int(self.num_transition*train_fraction):, :], 71 | self.acs[int(self.num_transition*train_fraction):, :], 72 | self.randomize) 73 | self.log_info() 74 | 75 | def log_info(self): 76 | logger.log("Total trajectorues: %d" % self.num_traj) 77 | logger.log("Total transitions: %d" % self.num_transition) 78 | logger.log("Average returns: %f" % self.avg_ret) 79 | logger.log("Std for returns: %f" % self.std_ret) 80 | 81 | def get_next_batch(self, batch_size, split=None): 82 | if split is None: 83 | return self.dset.get_next_batch(batch_size) 84 | elif split == 'train': 85 | return self.train_set.get_next_batch(batch_size) 86 | elif split == 'val': 87 | return self.val_set.get_next_batch(batch_size) 88 | else: 89 | raise NotImplementedError 90 | 91 | def plot(self): 92 | import matplotlib.pyplot as plt 93 | plt.hist(self.rets) 94 | plt.savefig("histogram_rets.png") 95 | plt.close() 96 | 97 | 98 | def test(expert_path, traj_limitation, plot): 99 | dset = Mujoco_Dset(expert_path, traj_limitation=traj_limitation) 100 | if plot: 101 | dset.plot() 102 | 103 | if __name__ == '__main__': 104 | import argparse 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument("--expert_path", type=str, default="../data/deterministic.trpo.Hopper.0.00.npz") 107 | parser.add_argument("--traj_limitation", type=int, default=None) 108 | parser.add_argument("--plot", type=bool, default=False) 109 | args = parser.parse_args() 110 | test(args.expert_path, args.traj_limitation, args.plot) 111 | -------------------------------------------------------------------------------- /baselines/acer/buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Buffer(object): 4 | # gets obs, actions, rewards, mu's, (states, masks), dones 5 | def __init__(self, env, nsteps, nstack, size=50000): 6 | self.nenv = env.num_envs 7 | self.nsteps = nsteps 8 | self.nh, self.nw, self.nc = env.observation_space.shape 9 | self.nstack = nstack 10 | self.nbatch = self.nenv * self.nsteps 11 | self.size = size // (self.nsteps) # Each loc contains nenv * nsteps frames, thus total buffer is nenv * size frames 12 | 13 | # Memory 14 | self.enc_obs = None 15 | self.actions = None 16 | self.rewards = None 17 | self.mus = None 18 | self.dones = None 19 | self.masks = None 20 | 21 | # Size indexes 22 | self.next_idx = 0 23 | self.num_in_buffer = 0 24 | 25 | def has_atleast(self, frames): 26 | # Frames per env, so total (nenv * frames) Frames needed 27 | # Each buffer loc has nenv * nsteps frames 28 | return self.num_in_buffer >= (frames // self.nsteps) 29 | 30 | def can_sample(self): 31 | return self.num_in_buffer > 0 32 | 33 | # Generate stacked frames 34 
| def decode(self, enc_obs, dones): 35 | # enc_obs has shape [nenvs, nsteps + nstack, nh, nw, nc] 36 | # dones has shape [nenvs, nsteps, nh, nw, nc] 37 | # returns stacked obs of shape [nenv, (nsteps + 1), nh, nw, nstack*nc] 38 | nstack, nenv, nsteps, nh, nw, nc = self.nstack, self.nenv, self.nsteps, self.nh, self.nw, self.nc 39 | y = np.empty([nsteps + nstack - 1, nenv, 1, 1, 1], dtype=np.float32) 40 | obs = np.zeros([nstack, nsteps + nstack, nenv, nh, nw, nc], dtype=np.uint8) 41 | x = np.reshape(enc_obs, [nenv, nsteps + nstack, nh, nw, nc]).swapaxes(1, 42 | 0) # [nsteps + nstack, nenv, nh, nw, nc] 43 | y[3:] = np.reshape(1.0 - dones, [nenv, nsteps, 1, 1, 1]).swapaxes(1, 0) # keep 44 | y[:3] = 1.0 45 | # y = np.reshape(1 - dones, [nenvs, nsteps, 1, 1, 1]) 46 | for i in range(nstack): 47 | obs[-(i + 1), i:] = x 48 | # obs[:,i:,:,:,-(i+1),:] = x 49 | x = x[:-1] * y 50 | y = y[1:] 51 | return np.reshape(obs[:, 3:].transpose((2, 1, 3, 4, 0, 5)), [nenv, (nsteps + 1), nh, nw, nstack * nc]) 52 | 53 | def put(self, enc_obs, actions, rewards, mus, dones, masks): 54 | # enc_obs [nenv, (nsteps + nstack), nh, nw, nc] 55 | # actions, rewards, dones [nenv, nsteps] 56 | # mus [nenv, nsteps, nact] 57 | 58 | if self.enc_obs is None: 59 | self.enc_obs = np.empty([self.size] + list(enc_obs.shape), dtype=np.uint8) 60 | self.actions = np.empty([self.size] + list(actions.shape), dtype=np.int32) 61 | self.rewards = np.empty([self.size] + list(rewards.shape), dtype=np.float32) 62 | self.mus = np.empty([self.size] + list(mus.shape), dtype=np.float32) 63 | self.dones = np.empty([self.size] + list(dones.shape), dtype=np.bool) 64 | self.masks = np.empty([self.size] + list(masks.shape), dtype=np.bool) 65 | 66 | self.enc_obs[self.next_idx] = enc_obs 67 | self.actions[self.next_idx] = actions 68 | self.rewards[self.next_idx] = rewards 69 | self.mus[self.next_idx] = mus 70 | self.dones[self.next_idx] = dones 71 | self.masks[self.next_idx] = masks 72 | 73 | self.next_idx = (self.next_idx + 1) % self.size 74 | self.num_in_buffer = min(self.size, self.num_in_buffer + 1) 75 | 76 | def take(self, x, idx, envx): 77 | nenv = self.nenv 78 | out = np.empty([nenv] + list(x.shape[2:]), dtype=x.dtype) 79 | for i in range(nenv): 80 | out[i] = x[idx[i], envx[i]] 81 | return out 82 | 83 | def get(self): 84 | # returns 85 | # obs [nenv, (nsteps + 1), nh, nw, nstack*nc] 86 | # actions, rewards, dones [nenv, nsteps] 87 | # mus [nenv, nsteps, nact] 88 | nenv = self.nenv 89 | assert self.can_sample() 90 | 91 | # Sample exactly one id per env. If you sample across envs, then higher correlation in samples from same env. 
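        # e.g. with nenv=4 and num_in_buffer=50, idx might be [12, 3, 47, 30] and
        # envx is [0, 1, 2, 3], so env i replays the trajectory stored at buffer
        # slot idx[i] for that same env.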
92 | idx = np.random.randint(0, self.num_in_buffer, nenv) 93 | envx = np.arange(nenv) 94 | 95 | take = lambda x: self.take(x, idx, envx) # for i in range(nenv)], axis = 0) 96 | dones = take(self.dones) 97 | enc_obs = take(self.enc_obs) 98 | obs = self.decode(enc_obs, dones) 99 | actions = take(self.actions) 100 | rewards = take(self.rewards) 101 | mus = take(self.mus) 102 | masks = take(self.masks) 103 | return obs, actions, rewards, mus, dones, masks 104 | -------------------------------------------------------------------------------- /baselines/gail/adversary.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Reference: https://github.com/openai/imitation 3 | I follow the architecture from the official repository 4 | ''' 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | from baselines.common.mpi_running_mean_std import RunningMeanStd 9 | from baselines.common import tf_util as U 10 | 11 | def logsigmoid(a): 12 | '''Equivalent to tf.log(tf.sigmoid(a))''' 13 | return -tf.nn.softplus(-a) 14 | 15 | """ Reference: https://github.com/openai/imitation/blob/99fbccf3e060b6e6c739bdf209758620fcdefd3c/policyopt/thutil.py#L48-L51""" 16 | def logit_bernoulli_entropy(logits): 17 | ent = (1.-tf.nn.sigmoid(logits))*logits - logsigmoid(logits) 18 | return ent 19 | 20 | class TransitionClassifier(object): 21 | def __init__(self, env, hidden_size, entcoeff=0.001, lr_rate=1e-3, scope="adversary"): 22 | self.scope = scope 23 | self.observation_shape = env.observation_space.shape 24 | self.actions_shape = env.action_space.shape 25 | self.input_shape = tuple([o+a for o, a in zip(self.observation_shape, self.actions_shape)]) 26 | self.num_actions = env.action_space.shape[0] 27 | self.hidden_size = hidden_size 28 | self.build_ph() 29 | # Build grpah 30 | generator_logits = self.build_graph(self.generator_obs_ph, self.generator_acs_ph, reuse=False) 31 | expert_logits = self.build_graph(self.expert_obs_ph, self.expert_acs_ph, reuse=True) 32 | # Build accuracy 33 | generator_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(generator_logits) < 0.5)) 34 | expert_acc = tf.reduce_mean(tf.to_float(tf.nn.sigmoid(expert_logits) > 0.5)) 35 | # Build regression loss 36 | # let x = logits, z = targets. 
37 | # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) 38 | generator_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=generator_logits, labels=tf.zeros_like(generator_logits)) 39 | generator_loss = tf.reduce_mean(generator_loss) 40 | expert_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=expert_logits, labels=tf.ones_like(expert_logits)) 41 | expert_loss = tf.reduce_mean(expert_loss) 42 | # Build entropy loss 43 | logits = tf.concat([generator_logits, expert_logits], 0) 44 | entropy = tf.reduce_mean(logit_bernoulli_entropy(logits)) 45 | entropy_loss = -entcoeff*entropy 46 | # Loss + Accuracy terms 47 | self.losses = [generator_loss, expert_loss, entropy, entropy_loss, generator_acc, expert_acc] 48 | self.loss_name = ["generator_loss", "expert_loss", "entropy", "entropy_loss", "generator_acc", "expert_acc"] 49 | self.total_loss = generator_loss + expert_loss + entropy_loss 50 | # Build Reward for policy 51 | self.reward_op = -tf.log(1-tf.nn.sigmoid(generator_logits)+1e-8) 52 | var_list = self.get_trainable_variables() 53 | self.lossandgrad = U.function([self.generator_obs_ph, self.generator_acs_ph, self.expert_obs_ph, self.expert_acs_ph], 54 | self.losses + [U.flatgrad(self.total_loss, var_list)]) 55 | 56 | def build_ph(self): 57 | self.generator_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="observations_ph") 58 | self.generator_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="actions_ph") 59 | self.expert_obs_ph = tf.placeholder(tf.float32, (None, ) + self.observation_shape, name="expert_observations_ph") 60 | self.expert_acs_ph = tf.placeholder(tf.float32, (None, ) + self.actions_shape, name="expert_actions_ph") 61 | 62 | def build_graph(self, obs_ph, acs_ph, reuse=False): 63 | with tf.variable_scope(self.scope): 64 | if reuse: 65 | tf.get_variable_scope().reuse_variables() 66 | 67 | with tf.variable_scope("obfilter"): 68 | self.obs_rms = RunningMeanStd(shape=self.observation_shape) 69 | obs = (obs_ph - self.obs_rms.mean / self.obs_rms.std) 70 | _input = tf.concat([obs, acs_ph], axis=1) # concatenate the two input -> form a transition 71 | p_h1 = tf.contrib.layers.fully_connected(_input, self.hidden_size, activation_fn=tf.nn.tanh) 72 | p_h2 = tf.contrib.layers.fully_connected(p_h1, self.hidden_size, activation_fn=tf.nn.tanh) 73 | logits = tf.contrib.layers.fully_connected(p_h2, 1, activation_fn=tf.identity) 74 | return logits 75 | 76 | def get_trainable_variables(self): 77 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 78 | 79 | def get_reward(self, obs, acs): 80 | sess = tf.get_default_session() 81 | if len(obs.shape) == 1: 82 | obs = np.expand_dims(obs, 0) 83 | if len(acs.shape) == 1: 84 | acs = np.expand_dims(acs, 0) 85 | feed_dict = {self.generator_obs_ph: obs, self.generator_acs_ph: acs} 86 | reward = sess.run(self.reward_op, feed_dict) 87 | return reward 88 | -------------------------------------------------------------------------------- /baselines/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from baselines.common.tile_images import tile_images 3 | 4 | class AlreadySteppingError(Exception): 5 | """ 6 | Raised when an asynchronous step is running while 7 | step_async() is called again. 
8 | """ 9 | 10 | def __init__(self): 11 | msg = 'already running an async step' 12 | Exception.__init__(self, msg) 13 | 14 | 15 | class NotSteppingError(Exception): 16 | """ 17 | Raised when an asynchronous step is not running but 18 | step_wait() is called. 19 | """ 20 | 21 | def __init__(self): 22 | msg = 'not running an async step' 23 | Exception.__init__(self, msg) 24 | 25 | 26 | class VecEnv(ABC): 27 | """ 28 | An abstract asynchronous, vectorized environment. 29 | """ 30 | 31 | def __init__(self, num_envs, observation_space, action_space): 32 | self.num_envs = num_envs 33 | self.observation_space = observation_space 34 | self.action_space = action_space 35 | self.closed = False 36 | self.viewer = None # For rendering 37 | 38 | @abstractmethod 39 | def reset(self): 40 | """ 41 | Reset all the environments and return an array of 42 | observations, or a dict of observation arrays. 43 | 44 | If step_async is still doing work, that work will 45 | be cancelled and step_wait() should not be called 46 | until step_async() is invoked again. 47 | """ 48 | pass 49 | 50 | @abstractmethod 51 | def step_async(self, actions): 52 | """ 53 | Tell all the environments to start taking a step 54 | with the given actions. 55 | Call step_wait() to get the results of the step. 56 | 57 | You should not call this if a step_async run is 58 | already pending. 59 | """ 60 | pass 61 | 62 | @abstractmethod 63 | def step_wait(self): 64 | """ 65 | Wait for the step taken with step_async(). 66 | 67 | Returns (obs, rews, dones, infos): 68 | - obs: an array of observations, or a dict of 69 | arrays of observations. 70 | - rews: an array of rewards 71 | - dones: an array of "episode done" booleans 72 | - infos: a sequence of info objects 73 | """ 74 | pass 75 | 76 | def close_extras(self): 77 | """ 78 | Clean up the extra resources, beyond what's in this base class. 79 | Only runs when not self.closed. 80 | """ 81 | pass 82 | 83 | def close(self): 84 | if self.closed: 85 | return 86 | if self.viewer is not None: 87 | self.viewer.close() 88 | self.close_extras() 89 | self.closed = True 90 | 91 | def step(self, actions): 92 | """ 93 | Step the environments synchronously. 94 | 95 | This is available for backwards compatibility. 96 | """ 97 | self.step_async(actions) 98 | return self.step_wait() 99 | 100 | def render(self, mode='human'): 101 | imgs = self.get_images() 102 | bigimg = tile_images(imgs) 103 | if mode == 'human': 104 | self.get_viewer().imshow(bigimg) 105 | elif mode == 'rgb_array': 106 | return bigimg 107 | else: 108 | raise NotImplementedError 109 | 110 | def get_images(self): 111 | """ 112 | Return RGB images from each environment 113 | """ 114 | raise NotImplementedError 115 | 116 | @property 117 | def unwrapped(self): 118 | if isinstance(self, VecEnvWrapper): 119 | return self.venv.unwrapped 120 | else: 121 | return self 122 | 123 | def get_viewer(self): 124 | if self.viewer is None: 125 | from gym.envs.classic_control import rendering 126 | self.viewer = rendering.SimpleImageViewer() 127 | return self.viewer 128 | 129 | 130 | class VecEnvWrapper(VecEnv): 131 | """ 132 | An environment wrapper that applies to an entire batch 133 | of environments at once. 
134 | """ 135 | 136 | def __init__(self, venv, observation_space=None, action_space=None): 137 | self.venv = venv 138 | VecEnv.__init__(self, 139 | num_envs=venv.num_envs, 140 | observation_space=observation_space or venv.observation_space, 141 | action_space=action_space or venv.action_space) 142 | 143 | def step_async(self, actions): 144 | self.venv.step_async(actions) 145 | 146 | @abstractmethod 147 | def reset(self): 148 | pass 149 | 150 | @abstractmethod 151 | def step_wait(self): 152 | pass 153 | 154 | def close(self): 155 | return self.venv.close() 156 | 157 | def render(self, mode='human'): 158 | return self.venv.render(mode=mode) 159 | 160 | def get_images(self): 161 | return self.venv.get_images() 162 | 163 | class CloudpickleWrapper(object): 164 | """ 165 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 166 | """ 167 | 168 | def __init__(self, x): 169 | self.x = x 170 | 171 | def __getstate__(self): 172 | import cloudpickle 173 | return cloudpickle.dumps(self.x) 174 | 175 | def __setstate__(self, ob): 176 | import pickle 177 | self.x = pickle.loads(ob) 178 | -------------------------------------------------------------------------------- /baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as regular array, but with two 11 | important differences: 12 | 13 | a) setting item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 15 | b) user has access to an efficient ( O(log segment size) ) 16 | `reduce` operation which reduces `operation` over 17 | a contiguous subsequence of items in the array. 18 | 19 | Paramters 20 | --------- 21 | capacity: int 22 | Total size of the array - must be a power of two. 23 | operation: lambda obj, obj -> obj 24 | and operation for combining elements (eg. sum, max) 25 | must form a mathematical group together with the set of 26 | possible values for array elements (i.e. be associative) 27 | neutral_element: obj 28 | neutral element for the operation above. eg. float('-inf') 29 | for max and 0 for sum. 30 | """ 31 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 32 | self._capacity = capacity 33 | self._value = [neutral_element for _ in range(2 * capacity)] 34 | self._operation = operation 35 | 36 | def _reduce_helper(self, start, end, node, node_start, node_end): 37 | if start == node_start and end == node_end: 38 | return self._value[node] 39 | mid = (node_start + node_end) // 2 40 | if end <= mid: 41 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 42 | else: 43 | if mid + 1 <= start: 44 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 45 | else: 46 | return self._operation( 47 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 48 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 49 | ) 50 | 51 | def reduce(self, start=0, end=None): 52 | """Returns result of applying `self.operation` 53 | to a contiguous subsequence of the array. 54 | 55 | self.operation(arr[start], operation(arr[start+1], operation(... 
arr[end]))) 56 | 57 | Parameters 58 | ---------- 59 | start: int 60 | beginning of the subsequence 61 | end: int 62 | end of the subsequences 63 | 64 | Returns 65 | ------- 66 | reduced: obj 67 | result of reducing self.operation over the specified range of array elements. 68 | """ 69 | if end is None: 70 | end = self._capacity 71 | if end < 0: 72 | end += self._capacity 73 | end -= 1 74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 75 | 76 | def __setitem__(self, idx, val): 77 | # index of the leaf 78 | idx += self._capacity 79 | self._value[idx] = val 80 | idx //= 2 81 | while idx >= 1: 82 | self._value[idx] = self._operation( 83 | self._value[2 * idx], 84 | self._value[2 * idx + 1] 85 | ) 86 | idx //= 2 87 | 88 | def __getitem__(self, idx): 89 | assert 0 <= idx < self._capacity 90 | return self._value[self._capacity + idx] 91 | 92 | 93 | class SumSegmentTree(SegmentTree): 94 | def __init__(self, capacity): 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, 97 | operation=operator.add, 98 | neutral_element=0.0 99 | ) 100 | 101 | def sum(self, start=0, end=None): 102 | """Returns arr[start] + ... + arr[end]""" 103 | return super(SumSegmentTree, self).reduce(start, end) 104 | 105 | def find_prefixsum_idx(self, prefixsum): 106 | """Find the highest index `i` in the array such that 107 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 108 | 109 | if array values are probabilities, this function 110 | allows to sample indexes according to the discrete 111 | probability efficiently. 112 | 113 | Parameters 114 | ---------- 115 | perfixsum: float 116 | upperbound on the sum of array prefix 117 | 118 | Returns 119 | ------- 120 | idx: int 121 | highest index satisfying the prefixsum constraint 122 | """ 123 | assert 0 <= prefixsum <= self.sum() + 1e-5 124 | idx = 1 125 | while idx < self._capacity: # while non-leaf 126 | if self._value[2 * idx] > prefixsum: 127 | idx = 2 * idx 128 | else: 129 | prefixsum -= self._value[2 * idx] 130 | idx = 2 * idx + 1 131 | return idx - self._capacity 132 | 133 | 134 | class MinSegmentTree(SegmentTree): 135 | def __init__(self, capacity): 136 | super(MinSegmentTree, self).__init__( 137 | capacity=capacity, 138 | operation=min, 139 | neutral_element=float('inf') 140 | ) 141 | 142 | def min(self, start=0, end=None): 143 | """Returns min(arr[start], ..., arr[end])""" 144 | 145 | return super(MinSegmentTree, self).reduce(start, end) 146 | --------------------------------------------------------------------------------
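A minimal usage sketch for the segment trees above, in the way a prioritized replay buffer typically combines them (the capacity and priority values are made up for illustration; the sum tree supports sampling proportional to priority, the min tree tracks the lowest stored priority):

import numpy as np
from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree

capacity = 8                               # must be a power of two
sum_tree = SumSegmentTree(capacity)
min_tree = MinSegmentTree(capacity)

# store hypothetical priorities for four transitions
for i, priority in enumerate([0.5, 1.0, 0.1, 2.0]):
    sum_tree[i] = priority                 # O(log capacity) per assignment
    min_tree[i] = priority

total = sum_tree.sum()                     # 3.6 == 0.5 + 1.0 + 0.1 + 2.0
lowest = min_tree.min(0, 4)                # 0.1, taken over the four stored slots

# sample an index with probability proportional to its priority: draw a
# prefix mass uniformly in [0, total) and descend the tree to the matching leaf
mass = np.random.uniform(0.0, total)
idx = sum_tree.find_prefixsum_idx(mass)
print(idx, sum_tree[idx], total, lowest)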