├── README.md
├── VAE_CVAE_MNIST_mod
├── README.md
├── __pycache__
│ ├── models.cpython-37.pyc
│ └── utils.cpython-37.pyc
├── figs
│ ├── 1519649452.702026
│ │ ├── E9-Dist.png
│ │ └── E9I937.png
│ └── 1519649461.195146
│ │ ├── E9-Dist.png
│ │ └── E9I937.png
├── models.py
├── requirements.txt
├── train.py
└── utils.py
├── __pycache__
├── datastructures.cpython-37.pyc
├── executor.cpython-37.pyc
├── find_duplicate_programs.cpython-37.pyc
├── gridworld_environments.cpython-37.pyc
├── internal_rewards.cpython-37.pyc
├── learn_program_distance_experiments.cpython-37.pyc
├── operations.cpython-37.pyc
├── operations_list.cpython-37.pyc
├── predict_performance.cpython-37.pyc
├── predict_performance_experiments.cpython-37.pyc
├── program.cpython-37.pyc
├── program_types.cpython-37.pyc
├── run_agent.cpython-37.pyc
├── search_program_experiments.cpython-37.pyc
├── search_programs.cpython-37.pyc
├── simulate_search.cpython-37.pyc
└── test_synthesized_programs_experiments.cpython-37.pyc
├── datastructures.py
├── diversity
├── __pycache__
│ └── density_peaks.cpython-37.pyc
└── density_peaks.py
├── executor.py
├── find_duplicate_programs.py
├── gridworld_environments.py
├── helpers
├── __init__.py
├── __pycache__
│ ├── __init__.cpython-37.pyc
│ ├── config.cpython-37.pyc
│ ├── debug.cpython-37.pyc
│ ├── experiment_params.cpython-37.pyc
│ ├── nn.cpython-37.pyc
│ ├── plotting.cpython-37.pyc
│ ├── probability.cpython-37.pyc
│ ├── torch_knn.cpython-37.pyc
│ └── util.cpython-37.pyc
├── config.py
├── datastructures.py
├── debug.py
├── experiment_params.py
├── lists.py
├── nn.py
├── plotting.py
├── probability.py
├── statistics
│ ├── __pycache__
│ │ └── welfords_std.cpython-37.pyc
│ └── welfords_std.py
├── task_queue.py
├── torch_knn.py
└── util.py
├── internal_rewards.py
├── learn_program_distance_experiments.py
├── operations.py
├── operations_list.py
├── predict_performance.py
├── predict_performance_experiments.py
├── program.py
├── program_synthesis.py
├── program_types.py
├── pytorch-a2c-ppo-acktr-gail_modified
├── .gitignore
├── LICENSE
├── README.md
├── a2c_ppo_acktr
│ ├── __init__.py
│ ├── algo
│ │ ├── __init__.py
│ │ ├── a2c_acktr.py
│ │ ├── gail.py
│ │ ├── kfac.py
│ │ └── ppo.py
│ ├── arguments.py
│ ├── distributions.py
│ ├── envs.py
│ ├── model.py
│ ├── storage.py
│ └── utils.py
├── baselines_modified
│ ├── .benchmark_pattern
│ ├── .gitignore
│ ├── .travis.yml
│ ├── Dockerfile
│ ├── LICENSE
│ ├── README.md
│ ├── baselines
│ │ ├── __init__.py
│ │ ├── a2c
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── a2c.py
│ │ │ ├── runner.py
│ │ │ └── utils.py
│ │ ├── acer
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── acer.py
│ │ │ ├── buffer.py
│ │ │ ├── defaults.py
│ │ │ ├── policies.py
│ │ │ └── runner.py
│ │ ├── acktr
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── acktr.py
│ │ │ ├── defaults.py
│ │ │ ├── kfac.py
│ │ │ ├── kfac_utils.py
│ │ │ └── utils.py
│ │ ├── bench
│ │ │ ├── __init__.py
│ │ │ ├── benchmarks.py
│ │ │ ├── monitor.py
│ │ │ └── test_monitor.py
│ │ ├── common
│ │ │ ├── __init__.py
│ │ │ ├── atari_wrappers.py
│ │ │ ├── cg.py
│ │ │ ├── cmd_util.py
│ │ │ ├── console_util.py
│ │ │ ├── dataset.py
│ │ │ ├── distributions.py
│ │ │ ├── input.py
│ │ │ ├── math_util.py
│ │ │ ├── misc_util.py
│ │ │ ├── models.py
│ │ │ ├── mpi_adam.py
│ │ │ ├── mpi_adam_optimizer.py
│ │ │ ├── mpi_fork.py
│ │ │ ├── mpi_moments.py
│ │ │ ├── mpi_running_mean_std.py
│ │ │ ├── mpi_util.py
│ │ │ ├── plot_util.py
│ │ │ ├── policies.py
│ │ │ ├── retro_wrappers.py
│ │ │ ├── runners.py
│ │ │ ├── running_mean_std.py
│ │ │ ├── schedules.py
│ │ │ ├── segment_tree.py
│ │ │ ├── test_mpi_util.py
│ │ │ ├── tests
│ │ │ │ ├── __init__.py
│ │ │ │ ├── envs
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── fixed_sequence_env.py
│ │ │ │ │ ├── identity_env.py
│ │ │ │ │ ├── identity_env_test.py
│ │ │ │ │ └── mnist_env.py
│ │ │ │ ├── test_cartpole.py
│ │ │ │ ├── test_doc_examples.py
│ │ │ │ ├── test_env_after_learn.py
│ │ │ │ ├── test_fetchreach.py
│ │ │ │ ├── test_fixed_sequence.py
│ │ │ │ ├── test_identity.py
│ │ │ │ ├── test_mnist.py
│ │ │ │ ├── test_plot_util.py
│ │ │ │ ├── test_schedules.py
│ │ │ │ ├── test_segment_tree.py
│ │ │ │ ├── test_serialization.py
│ │ │ │ ├── test_tf_util.py
│ │ │ │ ├── test_with_mpi.py
│ │ │ │ └── util.py
│ │ │ ├── tf_util.py
│ │ │ ├── tile_images.py
│ │ │ ├── vec_env
│ │ │ │ ├── __init__.py
│ │ │ │ ├── dummy_vec_env.py
│ │ │ │ ├── shmem_vec_env.py
│ │ │ │ ├── subproc_vec_env.py
│ │ │ │ ├── test_vec_env.py
│ │ │ │ ├── test_video_recorder.py
│ │ │ │ ├── util.py
│ │ │ │ ├── vec_env.py
│ │ │ │ ├── vec_frame_stack.py
│ │ │ │ ├── vec_monitor.py
│ │ │ │ ├── vec_normalize.py
│ │ │ │ ├── vec_remove_dict_obs.py
│ │ │ │ └── vec_video_recorder.py
│ │ │ └── wrappers.py
│ │ ├── ddpg
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── ddpg.py
│ │ │ ├── ddpg_learner.py
│ │ │ ├── memory.py
│ │ │ ├── models.py
│ │ │ ├── noise.py
│ │ │ └── test_smoke.py
│ │ ├── deepq
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── build_graph.py
│ │ │ ├── deepq.py
│ │ │ ├── defaults.py
│ │ │ ├── experiments
│ │ │ │ ├── __init__.py
│ │ │ │ ├── custom_cartpole.py
│ │ │ │ ├── enjoy_cartpole.py
│ │ │ │ ├── enjoy_mountaincar.py
│ │ │ │ ├── enjoy_pong.py
│ │ │ │ ├── train_cartpole.py
│ │ │ │ ├── train_mountaincar.py
│ │ │ │ └── train_pong.py
│ │ │ ├── models.py
│ │ │ ├── replay_buffer.py
│ │ │ └── utils.py
│ │ ├── gail
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── adversary.py
│ │ │ ├── behavior_clone.py
│ │ │ ├── dataset
│ │ │ │ ├── __init__.py
│ │ │ │ └── mujoco_dset.py
│ │ │ ├── gail-eval.py
│ │ │ ├── mlp_policy.py
│ │ │ ├── result
│ │ │ │ ├── HalfCheetah-normalized-deterministic-scores.png
│ │ │ │ ├── HalfCheetah-normalized-stochastic-scores.png
│ │ │ │ ├── HalfCheetah-unnormalized-deterministic-scores.png
│ │ │ │ ├── HalfCheetah-unnormalized-stochastic-scores.png
│ │ │ │ ├── Hopper-normalized-deterministic-scores.png
│ │ │ │ ├── Hopper-normalized-stochastic-scores.png
│ │ │ │ ├── Hopper-unnormalized-deterministic-scores.png
│ │ │ │ ├── Hopper-unnormalized-stochastic-scores.png
│ │ │ │ ├── Humanoid-normalized-deterministic-scores.png
│ │ │ │ ├── Humanoid-normalized-stochastic-scores.png
│ │ │ │ ├── Humanoid-unnormalized-deterministic-scores.png
│ │ │ │ ├── Humanoid-unnormalized-stochastic-scores.png
│ │ │ │ ├── HumanoidStandup-normalized-deterministic-scores.png
│ │ │ │ ├── HumanoidStandup-normalized-stochastic-scores.png
│ │ │ │ ├── HumanoidStandup-unnormalized-deterministic-scores.png
│ │ │ │ ├── HumanoidStandup-unnormalized-stochastic-scores.png
│ │ │ │ ├── Walker2d-normalized-deterministic-scores.png
│ │ │ │ ├── Walker2d-normalized-stochastic-scores.png
│ │ │ │ ├── Walker2d-unnormalized-deterministic-scores.png
│ │ │ │ ├── Walker2d-unnormalized-stochastic-scores.png
│ │ │ │ ├── gail-result.md
│ │ │ │ ├── halfcheetah-training.png
│ │ │ │ ├── hopper-training.png
│ │ │ │ ├── humanoid-training.png
│ │ │ │ ├── humanoidstandup-training.png
│ │ │ │ └── walker2d-training.png
│ │ │ ├── run_mujoco.py
│ │ │ ├── statistics.py
│ │ │ └── trpo_mpi.py
│ │ ├── her
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── actor_critic.py
│ │ │ ├── ddpg.py
│ │ │ ├── experiment
│ │ │ │ ├── __init__.py
│ │ │ │ ├── config.py
│ │ │ │ ├── data_generation
│ │ │ │ │ └── fetch_data_generation.py
│ │ │ │ ├── play.py
│ │ │ │ └── plot.py
│ │ │ ├── her.py
│ │ │ ├── her_sampler.py
│ │ │ ├── normalizer.py
│ │ │ ├── replay_buffer.py
│ │ │ ├── rollout.py
│ │ │ └── util.py
│ │ ├── logger.py
│ │ ├── ppo1
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── cnn_policy.py
│ │ │ ├── mlp_policy.py
│ │ │ ├── pposgd_simple.py
│ │ │ ├── run_atari.py
│ │ │ ├── run_humanoid.py
│ │ │ ├── run_mujoco.py
│ │ │ └── run_robotics.py
│ │ ├── ppo2
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── defaults.py
│ │ │ ├── microbatched_model.py
│ │ │ ├── model.py
│ │ │ ├── ppo2.py
│ │ │ ├── runner.py
│ │ │ └── test_microbatches.py
│ │ ├── results_plotter.py
│ │ ├── run.py
│ │ └── trpo_mpi
│ │ │ ├── README.md
│ │ │ ├── __init__.py
│ │ │ ├── defaults.py
│ │ │ └── trpo_mpi.py
│ ├── benchmarks_atari10M.htm
│ ├── benchmarks_mujoco1M.htm
│ ├── setup.cfg
│ └── setup.py
├── enjoy.py
├── evaluation.py
├── gail_experts
│ ├── README.md
│ └── convert_to_pytorch.py
├── generate_tmux_yaml.py
├── main.py
├── requirements.txt
├── run_all.yaml
├── setup.py
└── visualize.ipynb
├── requirements.txt
├── run_agent.py
├── scripts
├── __pycache__
│ └── analyze_synthesized_programs.cpython-37.pyc
├── analyze_synthesized_programs.py
├── compare_experiments.py
├── learn_program_distance.py
└── manually_evaluate_program.py
├── search_program_experiments.py
├── search_programs.py
├── simulate_search.py
├── test_synthesized_programs.py
└── test_synthesized_programs_experiments.py
/README.md:
--------------------------------------------------------------------------------
1 | # Meta-Learning Curiosity Algorithms
2 | This is the code for "Meta-Learning Curiosity Algorithms" by [Ferran Alet](http://alet-etal.com/)\*, [Martin Schneider](https://github.com/mfranzs)\*, [Tomas Lozano-Perez](https://people.csail.mit.edu/tlp/), and [Leslie Kaelbling](https://people.csail.mit.edu/lpk/). Published at ICLR 2020 (and previously in the Meta-Learning and Reinforcement Learning Workshops at NeurIPS 2019).
3 |
4 | See the paper [here](https://openreview.net/pdf?id=BygdyxHFDS).
5 |
6 | ## Overview of Running an Experiment
7 | 1. Specify your operations in operations.py.
8 | 2. Specify a list of operations to use in operations_list.py.
9 | 3. Run program_synthesis.py to synthesize programs with your list of operations.
10 | 4. Specify an experiment in test_synthesized_programs_experiments.py (a configuration sketch follows this list).
11 | 5. Run test_synthesized_programs.py to search over your program space.
12 | 6. Use scripts/analyze_synthesized_programs.py to analyze your results.
13 |
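As a rough illustration of step 4: experiment configurations in this repository are dataclasses registered in an `ExperimentParameterList` (see helpers/experiment_params.py and learn_program_distance_experiments.py further down in this dump for the real pattern). The class name and fields below are placeholders, since the actual parameters in test_synthesized_programs_experiments.py are not reproduced here.

```python
from dataclasses import dataclass
from mlca.helpers.experiment_params import ExperimentParameters, ExperimentParameterList

@dataclass
class TspParams(ExperimentParameters):  # hypothetical name and fields
    ENVIRONMENT: str
    NUM_ROLLOUTS: int = 10

TspExperimentList = ExperimentParameterList()
TspExperimentList["my-search-experiment"] = TspParams(ENVIRONMENT="gridworld")
```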
14 | ## Code Overview
15 | **datastructures.py**: The datastructures manipulated by program operations.\
16 | **executor.py**: Executes a Program object.\
17 | **find_duplicate_programs.py**: Takes a list of programs and finds / prunes duplicates by testing each program on a fake environment and looking at the output signature.\
18 | **gridworld_environments.py**: Our gridworld environments.\
19 | **internal_rewards.py**: The module that runs intrinsic curiosity programs and reward combiner programs.\
20 | **operations_list.py**: A configuration file that specifies the operations that can appear in different program classes.\
21 | **operations.py**: The operations that are composed to create a program.\
22 | **predict_performance.py**: The regressor that predicts a program's performance.\
23 | **predict_performance_experiments.py**: A configuration file for experimenting with performance regressors.\
24 | **program.py**: The core abstraction of a program, represented by a DAG of operations.\
25 | **program_synthesis.py**: The search module that synthesizes programs.\
26 | **program_types.py**: The types that operations in our language can output.\
27 | **run_agent.py**: The module that runs an agent in an environment.\
28 | **search_programs.py**: The module that searches over a program space, given a list of programs, an environment, and a program selection metric.\
29 | **search_program_experiments.py**: A configuration file for simulating program searches.\
30 | **simulate_search.py**: A module that simulates searching through programs.\
31 | **test_synthesized_programs.py**: The module that takes a set of synthesized programs and initiates a search over them.\
32 | **test_synthesized_programs_experiments.py**: The configuration file for testing / searching over programs.\
33 |
--------------------------------------------------------------------------------
/VAE_CVAE_MNIST_mod/README.md:
--------------------------------------------------------------------------------
1 | # Variational Autoencoder & Conditional Variational Autoencoder on MNIST
2 |
3 | VAE paper: [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114)
4 |
5 | CVAE paper: [Learning Structured Output Representation using Deep Conditional Generative Models](https://papers.nips.cc/paper/5775-learning-structured-output-representation-using-deep-conditional-generative-models)
6 |
7 | ---
8 | In order to run the _conditional_ variational autoencoder, add `--conditional` to the command. Check out the other command-line options in the code for hyperparameter settings (like learning rate, batch size, encoder/decoder layer depth and size).
9 |
10 | ---
11 |
12 | ## Results
13 |
14 | All plots were obtained after 10 epochs of training. Hyperparameters follow the default settings in the code; not tuned.
15 |
16 | ### z ~ q(z|x) and q(z|x,c)
17 | The modeled latent distribution after 10 epochs and 100 samples per digit.
18 |
19 | VAE | CVAE
20 | --- | ---
21 |
22 |
23 | ### p(x|z) and p(x|z,c)
24 | Randomly sampled z, and their outputs. For CVAE, each c has been given as input once.
25 |
26 | VAE | CVAE
27 | --- | ---
28 |
29 |
--------------------------------------------------------------------------------
/VAE_CVAE_MNIST_mod/__pycache__/models.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/__pycache__/models.cpython-37.pyc
--------------------------------------------------------------------------------
/VAE_CVAE_MNIST_mod/__pycache__/utils.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/__pycache__/utils.cpython-37.pyc
--------------------------------------------------------------------------------
/VAE_CVAE_MNIST_mod/figs/1519649452.702026/E9-Dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/figs/1519649452.702026/E9-Dist.png
--------------------------------------------------------------------------------
/VAE_CVAE_MNIST_mod/figs/1519649452.702026/E9I937.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/figs/1519649452.702026/E9I937.png
--------------------------------------------------------------------------------
/VAE_CVAE_MNIST_mod/figs/1519649461.195146/E9-Dist.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/figs/1519649461.195146/E9-Dist.png
--------------------------------------------------------------------------------
/VAE_CVAE_MNIST_mod/figs/1519649461.195146/E9I937.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/figs/1519649461.195146/E9I937.png
--------------------------------------------------------------------------------
/VAE_CVAE_MNIST_mod/requirements.txt:
--------------------------------------------------------------------------------
1 | cycler==0.10.0
2 | kiwisolver==1.0.1
3 | matplotlib==3.0.2
4 | numpy==1.16.1
5 | pandas==0.24.1
6 | Pillow==5.4.1
7 | pyparsing==2.3.1
8 | python-dateutil==2.8.0
9 | pytz==2018.9
10 | scipy==1.2.1
11 | seaborn==0.9.0
12 | six==1.12.0
13 | torch==1.0.1.post2
14 | torchvision==0.2.1
15 |
--------------------------------------------------------------------------------
/VAE_CVAE_MNIST_mod/utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | def idx2onehot(idx, n):
5 |
6 | assert torch.max(idx).item() < n
7 | if idx.dim() == 1:
8 | idx = idx.unsqueeze(1)
9 |
10 | onehot = torch.zeros(idx.size(0), n)
11 | onehot.scatter_(1, idx, 1)
12 |
13 | return onehot
14 |
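# Usage sketch (not from the original file; assumes it is run from within
# VAE_CVAE_MNIST_mod so that `utils` resolves to the module above):
import torch
from utils import idx2onehot

labels = torch.tensor([0, 3, 9])     # three MNIST class labels
onehot = idx2onehot(labels, n=10)    # shape (3, 10), a single 1.0 per row
print(onehot[1])                     # 1.0 at index 3, zeros elsewhere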
--------------------------------------------------------------------------------
/__pycache__/datastructures.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/datastructures.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/executor.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/executor.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/find_duplicate_programs.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/find_duplicate_programs.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/gridworld_environments.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/gridworld_environments.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/internal_rewards.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/internal_rewards.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/learn_program_distance_experiments.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/learn_program_distance_experiments.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/operations.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/operations.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/operations_list.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/operations_list.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/predict_performance.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/predict_performance.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/predict_performance_experiments.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/predict_performance_experiments.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/program.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/program.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/program_types.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/program_types.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/run_agent.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/run_agent.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/search_program_experiments.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/search_program_experiments.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/search_programs.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/search_programs.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/simulate_search.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/simulate_search.cpython-37.pyc
--------------------------------------------------------------------------------
/__pycache__/test_synthesized_programs_experiments.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/test_synthesized_programs_experiments.cpython-37.pyc
--------------------------------------------------------------------------------
/datastructures.py:
--------------------------------------------------------------------------------
1 | """
2 | The datastructures manipulated by the operations in operations.py
3 | """
4 |
5 | from torch import nn
6 |
7 | import collections
8 | import numpy as np
9 |
10 | from mlca.helpers.nn import SimpleConvNet, MLP
11 |
12 | CHW = collections.namedtuple('CHW', ('channels', 'height', 'width'))
13 |
14 | class CNNModule(nn.Module):
15 | def __init__(self, environment):
16 | super().__init__()
17 |
18 | observation_space = environment.observation_space.shape
19 | image_size = CHW(
20 | observation_space[2],
21 | observation_space[0],
22 | observation_space[1])
23 |
24 | self.conv = SimpleConvNet(image_size.channels, 1, [], [3], {"USE_BATCH_NORM": True})
25 | self.mlp = MLP(
26 | self.conv.output_size(
27 | (image_size.width, image_size.height)), 32, [32, 32]
28 | )
29 |
30 | def forward(self, x):
31 | # x = x.permute(0, 3, 1, 2)
32 | x = self.conv(x)
33 | x = x.flatten(start_dim=1)
34 | x = self.mlp(x)
35 | return x
36 |
37 | class ObservationMLPModule(nn.Module):
38 | def __init__(self, environment):
39 | super().__init__()
40 |
41 | self.mlp = MLP(
42 | np.prod(environment.observation_space.shape),
43 | 32, [16, 32, 64])
44 |
45 | def forward(self, x):
46 | x = x.flatten(start_dim=1)
47 | x = self.mlp(x)
48 | return x
49 |
50 | class Ensemble(nn.Module):
51 | def __init__(self, modules, environment):
52 | super().__init__()
53 |
54 | self.module_list = nn.ModuleList(modules)
55 |
56 | def forward(self, x):
57 | return [module(x) for module in self.module_list]
58 |
--------------------------------------------------------------------------------
/diversity/__pycache__/density_peaks.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/diversity/__pycache__/density_peaks.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__init__.py
--------------------------------------------------------------------------------
/helpers/__pycache__/__init__.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/__init__.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/__pycache__/config.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/config.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/__pycache__/debug.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/debug.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/__pycache__/experiment_params.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/experiment_params.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/__pycache__/nn.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/nn.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/__pycache__/plotting.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/plotting.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/__pycache__/probability.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/probability.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/__pycache__/torch_knn.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/torch_knn.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/__pycache__/util.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/util.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/config.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import argparse
3 | import pprint
4 | from typing import List
5 |
6 | def argparser():
7 | parser = argparse.ArgumentParser()
8 | parser.add_argument("--experiment_id", required=True,
9 | help="id of the experiment we want to run; auto-selects the parameters")
10 | parser.add_argument("--render", action="store_true",
11 | help="render the environment", default=False)
12 | parser.add_argument("--dont_train", action="store_true",
13 | help="don't train the model", default=False)
14 | parser.add_argument("--dont_save", action="store_true",
15 | help="don't save the model", default=False)
16 | parser.add_argument("--dont_load", action="store_true",
17 | help="don't load the model", default=False)
18 | parser.add_argument("--cpu", action="store_true", default=False)
19 | parser.add_argument("--profiler", action="store_true",
20 | help="print profiler data", default=False)
21 | return parser
22 |
23 |
24 | def clean_experiment_id(experiment_id):
25 | # Only get the experiment_id before --version
26 | return experiment_id.split("--version")[0]
27 |
28 |
29 | def get_params(experiments, experiment_id, print_params=True, recursing=False):
30 | print("WARNING: get_params is deprecated")
31 | experiment_id = clean_experiment_id(experiment_id)
32 | if experiment_id not in experiments:
33 | raise RuntimeWarning("The experiment ID "+experiment_id +
34 | " does not exist! Valid ids: " + str(experiments.keys()))
35 |
36 | e = experiments[experiment_id]
37 | e["__EXPERIMENT_ID__"] = experiment_id
38 |
39 | if "__PARENT__" in e:
40 | parent = get_params(experiments, e["__PARENT__"], print_params, True)
41 | e = {**parent, **e}
42 |
43 | if not recursing and print_params:
44 | print("Parameters: ", experiment_id)
45 | pprint.pprint(e)
46 |
47 | assert e.get("BUGGED", None) is None, e.get("BUGGED")
48 |
49 | return e
50 |
51 | def get_device_and_set_default(args = None):
52 | use_cuda = torch.cuda.is_available() and not (args is not None and args.cpu)
53 |
54 | device = "cuda" if use_cuda else "cpu"
55 | print("Device", device, " - Cuda available?", torch.cuda.is_available())
56 | if device == "cuda":
57 | torch.set_default_tensor_type(torch.cuda.FloatTensor)
58 | else:
59 | torch.set_default_tensor_type(torch.FloatTensor)
60 |
61 | return device
62 |
63 | class DefaultDevice:
64 | active_default_parameters: List[str] = []
65 |
66 | def __init__(self, default_device):
67 | self.default_device = default_device
68 |
69 | def __enter__(self):
70 | self.active_default_parameters.append(self.default_device)
71 | return self
72 |
73 | def __exit__(self, type, value, traceback):
74 | self.active_default_parameters.pop()
75 |
76 | @classmethod
77 | def current(cls):
78 | if len(cls.active_default_parameters) == 0:
79 | raise RuntimeError(f"No current global default device set. Use a with statement to set the default device.")
80 | return cls.active_default_parameters[-1]
81 |
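# Usage sketch (not part of the original file). The mlca.helpers.config module
# path matches the import used in helpers/torch_knn.py.
from mlca.helpers.config import argparser, get_device_and_set_default, DefaultDevice

args = argparser().parse_args(["--experiment_id", "my-experiment", "--cpu"])
device = get_device_and_set_default(args)   # "cuda" or "cpu"; also sets torch's default tensor type

with DefaultDevice(device):
    # Code inside this block (e.g. TorchKNN in helpers/torch_knn.py) can call
    # DefaultDevice.current() to look up the globally selected device.
    print(DefaultDevice.current())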
--------------------------------------------------------------------------------
/helpers/datastructures.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from mlca.helpers.nn import SimpleConvNet, MLP
4 |
5 | class Ensemble(nn.Module):
6 | def __init__(self, environment):
7 | pass
8 |
9 | def forward(self, x):
10 | x = self.conv(x)
11 | x = self.mlp(x)
12 | return x
13 |
--------------------------------------------------------------------------------
/helpers/experiment_params.py:
--------------------------------------------------------------------------------
1 | """
2 | A small library that represents experiment parameters using
3 | Python Dataclasses. Supports creating lists of experiment parameters,
4 | chaining experiment parameters, and registering a current globally active
5 | parameter instance for a given parameter class (using a Python context).
6 | """
7 |
8 | import dataclasses
9 | from collections import OrderedDict
10 | import pprint
11 | from typing import List, Optional
12 |
13 | ExperimentId = str
14 |
15 | class ExperimentParameters():
16 | active_parameters: Optional[List] = None
17 |
18 | @classmethod
19 | def current(cls):
20 | if cls.active_parameters is None or len(cls.active_parameters) == 0:
21 | raise RuntimeError(f"No current global parameters set for {cls.__name__}. Use a with statement to set the current context.")
22 | return cls.active_parameters[-1]
23 |
24 | @classmethod
25 | def _set_active_parameters(cls, active_parameters):
26 | # Make the active parameter list when needed so we have a different one
27 | # for every parameters class
28 | if cls.active_parameters is None:
29 | cls.active_parameters = []
30 |
31 | cls.active_parameters.append(active_parameters)
32 |
33 | @classmethod
34 | def _clear_active_parameters(cls):
35 | cls.active_parameters.pop()
36 |
37 | def register_experiment_id(self, _experiment_id: ExperimentId):
38 | assert not hasattr(self, '_experiment_id'), "Warning: You cannot use the reserved field _experiment_id in your experiment params."
39 | self._experiment_id = _experiment_id
40 |
41 | def replace(self, **kwargs):
42 | return dataclasses.replace(self, **kwargs)
43 |
44 | def __enter__(self):
45 | self._set_active_parameters(self)
46 | return self
47 |
48 | def __exit__(self, type, value, traceback):
49 | self._clear_active_parameters()
50 |
51 | class ExperimentParameterList(dict):
52 | def __setitem__(self, key: ExperimentId, item: ExperimentParameters):
53 | assert key not in self.__dict__, f"The experiment {key} has already been registered."
54 | self.__dict__[key] = item
55 | item.register_experiment_id(key)
56 |
57 | def get(self, key: ExperimentId, print_params=True):
58 | if print_params:
59 | print("Get experiments", key)
60 | print(pprint.pformat(self.__dict__[key]))
61 | return self[key]
62 |
63 | def __getitem__(self, key: ExperimentId):
64 | return self.__dict__[key]
65 |
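# Usage sketch (not part of the original file); DemoParams is a made-up example
# following the pattern in learn_program_distance_experiments.py below.
from dataclasses import dataclass
from mlca.helpers.experiment_params import ExperimentParameters

@dataclass
class DemoParams(ExperimentParameters):
    LEARNING_RATE: float

with DemoParams(LEARNING_RATE=1e-3) as params:
    # Code running inside the block can recover the active parameters through
    # the class itself instead of threading them through every call.
    assert DemoParams.current() is params
    print(DemoParams.current().LEARNING_RATE)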
--------------------------------------------------------------------------------
/helpers/lists.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import operator
3 |
4 | def flatten(a):
5 | return functools.reduce(operator.concat, a)
6 |
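# Usage sketch (not part of the original file): a one-level flatten, assuming
# the mlca.helpers.lists module path used by the rest of the repo.
from mlca.helpers.lists import flatten

print(flatten([[1, 2], [3], [4, 5]]))   # [1, 2, 3, 4, 5]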
--------------------------------------------------------------------------------
/helpers/probability.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | # Modified from
4 | # https://introcs.cs.princeton.edu/python/22module/gaussian.py.html
5 |
6 | BOUNDED_INFINITY = 10.
7 | def pdf(orig_x, mu=0.0, sigma=1.0):
8 | sigma = np.asarray(sigma, dtype=float)
9 | orig_x = np.asarray(orig_x, dtype=float)
10 | x = (orig_x - mu) / sigma
11 | a = np.exp(-x*x/2.0) / np.sqrt(2.0*np.pi) / sigma
12 | return np.where(np.logical_or(np.logical_or(np.isinf(a), np.isnan(a)), sigma == 0),
13 | np.where(orig_x == mu, np.ones_like(mu) * BOUNDED_INFINITY, np.zeros_like(mu)),
14 | a
15 | )
16 | # return np.divide(a, sigma, out=np.isclose(orig_x, mu).astype(np.float) * BOUNDED_INFINITY, where=sigma!=0)
17 |
18 | def cdf(z, mu=0.0, sigma=1.0):
19 | z = (z - mu) / sigma
20 | if z < -8.0: return 0.0
21 | if z > +8.0: return 1.0
22 | total = 0.0
23 | term = z
24 | i = 3
25 | while total != total + term:
26 | total += term
27 | term *= z * z / i
28 | i += 2
29 | return 0.5 + total * pdf(z)
30 |
--------------------------------------------------------------------------------
/helpers/statistics/__pycache__/welfords_std.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/statistics/__pycache__/welfords_std.cpython-37.pyc
--------------------------------------------------------------------------------
/helpers/statistics/welfords_std.py:
--------------------------------------------------------------------------------
1 | """
2 | SOURCE: https://gist.github.com/alexalemi/2151722
3 | """
4 |
5 | import math
6 | import numpy as np
7 |
8 | class Welford(object):
9 | """ Implements Welford's algorithm for computing a running mean
10 | and standard deviation as described at:
11 | http://www.johndcook.com/standard_deviation.html
12 |
13 | can take single values or iterables
14 |
15 | Properties:
16 | mean - returns the mean
17 | std - returns the std
18 | meanfull - returns the mean and std of the mean
19 |
20 | Usage:
21 | >>> foo = Welford()
22 | >>> foo(range(100))
23 | >>> foo
24 |
25 | >>> foo([1]*1000)
26 | >>> foo
27 |
28 | >>> foo.mean
29 | 5.409090909090906
30 | >>> foo.std
31 | 16.44374171455467
32 | >>> foo.meanfull
33 | (5.409090909090906, 0.4957974674244838)
34 | """
35 |
36 | def __init__(self, lst=None):
37 | self.k = 0
38 | self.M = 0
39 | self.S = 0
40 |
41 | self.__call__(lst)
42 |
43 | def update(self, x):
44 | if x is None:
45 | return
46 | self.k += 1
47 | newM = self.M + (x - self.M)*1./self.k
48 | newS = self.S + (x - self.M)*(x - newM)
49 | self.M, self.S = newM, newS
50 |
51 | def consume(self, lst):
52 | lst = iter(lst)
53 | for x in lst:
54 | self.update(x)
55 |
56 | def __call__(self, x):
57 | if hasattr(x, "__iter__"):
58 | self.consume(x)
59 | else:
60 | self.update(x)
61 |
62 | @property
63 | def mean(self):
64 | return self.M
65 |
66 | @property
67 | def meanfull(self):
68 | return self.mean, self.std/math.sqrt(self.k)
69 |
70 | @property
71 | def std(self):
72 | if self.k == 1:
73 | return 0
74 | return math.sqrt(self.S/(self.k-1))
75 |
76 | def __repr__(self):
77 | return "<Welford: {} +- {}>".format(self.mean, self.std)
78 |
79 | if __name__ == "__main__":
80 | w = Welford()
81 | arr = np.random.random(100)
82 | w.consume(arr)
83 | print(w.mean, w.std)
84 | print(arr.mean(), arr.std())
--------------------------------------------------------------------------------
/helpers/task_queue.py:
--------------------------------------------------------------------------------
1 | """
2 | Based On:
3 | https://medium.com/@shashwat_ds/a-tiny-multi-threaded-job-queue-in-30-lines-of-python-a344c3f3f7f0
4 | WARNING: Use this, not the source. Source had bugs.
5 | """
6 |
7 | from threading import Thread
8 | from queue import Queue
9 | import time
10 |
11 | class TaskQueue(Queue):
12 |
13 | def __init__(self, num_workers=1):
14 | super().__init__()
15 | self.num_workers = num_workers
16 | self.start_workers()
17 |
18 | def add_task(self, task, *args, **kwargs):
19 | args = args or ()
20 | kwargs = kwargs or {}
21 | self.put((task, args, kwargs))
22 |
23 | def start_workers(self):
24 | for i in range(self.num_workers):
25 | t = Thread(target=self.worker, args=[i])
26 | t.daemon = True
27 | t.start()
28 |
29 | def worker(self, worker_id):
30 | while True:
31 | tupl = self.get()
32 | # print("Worker", tupl)
33 | item, args, kwargs = tupl
34 | kwargs["worker_id"] = worker_id
35 | item(*args, **kwargs)
36 | self.task_done()
37 |
38 |
39 | def tests():
40 | def t(*args, **kwargs):
41 | time.sleep(1)
42 | print(args)
43 |
44 | q = TaskQueue(num_workers=3)
45 |
46 | for item in range(10):
47 | q.add_task(t, item)
48 |
49 | q.join() # block until all tasks are done
50 |
51 | if __name__ == "__main__":
52 | tests()
53 |
--------------------------------------------------------------------------------
/helpers/torch_knn.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import time
3 | import torch
4 | from mlca.helpers.config import DefaultDevice
5 |
6 | # Tested in mlca/curiosity/program_synthesis/scripts/misc/test_knn_speed.py
7 |
8 | class TorchKNN:
9 | def __init__(self, buffer_size, feature_size, num_neighbors):
10 | self.buffer = torch.zeros(buffer_size, feature_size, device=DefaultDevice.current())
11 | self.nearest_neighbors = torch.zeros(num_neighbors, feature_size, device=DefaultDevice.current())
12 |
13 | self.buffer_size = buffer_size
14 | self.feature_size = feature_size
15 | self.num_neighbors = num_neighbors
16 |
17 | self.num_points = 0
18 | self.buffer_pos = 0
19 |
20 | def add(self, x):
21 | assert x.shape[0] == self.feature_size
22 |
23 | self.buffer[self.buffer_pos] = x
24 |
25 | self.num_points += 1
26 | self.buffer_pos += 1
27 | if self.buffer_pos >= self.buffer_size:
28 | self.buffer_pos = 0
29 |
30 | def predict(self, x):
31 | if self.num_points == 0:
32 | return torch.rand(self.feature_size)
33 | else:
34 | distances = torch.norm(
35 | self.buffer[:min(self.num_points, self.buffer_size)] - x, dim=1)
36 |
37 | _, indices = torch.topk(
38 | distances, min(self.num_neighbors, self.num_points),
39 | largest=False, sorted=False)
40 | nearest = self.buffer[indices]
41 | prediction = torch.mean(nearest, dim=0)
42 |
43 | assert prediction.shape == (self.feature_size, )
44 | return prediction
45 |
46 |
47 | class TorchKNNRegressor:
48 | def __init__(self, buffer_size, feature_size, num_neighbors):
49 | self.query_buffer = torch.zeros(buffer_size, feature_size)
50 | self.target_buffer = torch.zeros(buffer_size, feature_size)
51 | self.nearest_neighbors = torch.zeros(num_neighbors, feature_size)
52 |
53 | self.buffer_size = buffer_size
54 | self.feature_size = feature_size
55 | self.num_neighbors = num_neighbors
56 |
57 | self.num_points = 0
58 | self.buffer_pos = 0
59 |
60 | def add(self, query, target):
61 | assert query.shape[0] == self.feature_size
62 | assert target.shape[0] == self.feature_size
63 |
64 | self.query_buffer[self.buffer_pos] = query
65 | self.target_buffer[self.buffer_pos] = target
66 |
67 | self.num_points += 1
68 | self.buffer_pos += 1
69 | if self.buffer_pos >= self.buffer_size:
70 | self.buffer_pos = 0
71 |
72 | def predict(self, x):
73 | if self.num_points == 0:
74 | return torch.rand(self.feature_size)
75 | else:
76 | distances = torch.norm(
77 | self.query_buffer[:min(self.num_points, self.buffer_size)] - x, dim=1)
78 |
79 | _, indices = torch.topk(
80 | distances, min(self.num_neighbors, self.num_points),
81 | largest=False, sorted=False)
82 | nearest = self.target_buffer[indices]
83 | prediction = torch.mean(nearest, dim=0)
84 |
85 | assert prediction.shape == (self.feature_size, )
86 | return prediction
87 |
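# Usage sketch (not part of the original file; mlca.helpers.torch_knn is the
# assumed module path). TorchKNN reads its device from DefaultDevice, so a
# device context must be active when it is constructed.
import torch
from mlca.helpers.config import DefaultDevice
from mlca.helpers.torch_knn import TorchKNN

with DefaultDevice("cpu"):
    knn = TorchKNN(buffer_size=100, feature_size=4, num_neighbors=3)
    for _ in range(10):
        knn.add(torch.rand(4))
    # The prediction is the mean of the 3 stored vectors nearest to the query.
    print(knn.predict(torch.rand(4)))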
--------------------------------------------------------------------------------
/helpers/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import random
4 |
5 | def set_random_seed(random_seed):
6 | np.random.seed(random_seed)
7 | random.seed(random_seed)
8 | torch.manual_seed(random_seed)
9 | torch.cuda.manual_seed(random_seed)
10 | torch.cuda.manual_seed_all(random_seed)
11 |
--------------------------------------------------------------------------------
/learn_program_distance_experiments.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | from mlca.helpers.experiment_params import ExperimentParameters, ExperimentParameterList
3 | from enum import Enum
4 | from typing import Optional, List, Any
5 | from dataclasses import dataclass
6 |
7 | @dataclass
8 | class ProgramDistanceParams(ExperimentParameters):
9 | FEATURE_PAIRS: int
10 | TASK: str
11 | MODEL: str
12 | FEATURE_INPUT_OUTPUT: bool
13 | TEST_SYNTHESIZED_PROGRAMS_EXP_NAME: Optional[str]
14 | NEIGHBORS: Optional[int] = None
15 |
16 | ProgramDistanceExperimentList = ExperimentParameterList()
17 |
18 | ProgramDistanceExperimentList["mlp"] = ProgramDistanceParams(
19 | FEATURE_PAIRS = 1,
20 | NEIGHBORS = 10,
21 | TASK = "TEST_SINGLE",
22 | MODEL = "MLP",
23 | FEATURE_INPUT_OUTPUT = False,
24 | TEST_SYNTHESIZED_PROGRAMS_EXP_NAME="2-96_program-correlation-5",
25 | )
26 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | trained_models/
104 | .fuse_hidden*
105 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Ilya Kostrikov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/algo/__init__.py:
--------------------------------------------------------------------------------
1 | from .a2c_acktr import A2C_ACKTR
2 | from .ppo import PPO
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/algo/a2c_acktr.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 |
5 | from a2c_ppo_acktr.algo.kfac import KFACOptimizer
6 |
7 |
8 | class A2C_ACKTR():
9 | def __init__(self,
10 | actor_critic,
11 | value_loss_coef,
12 | entropy_coef,
13 | lr=None,
14 | eps=None,
15 | alpha=None,
16 | max_grad_norm=None,
17 | acktr=False):
18 |
19 | self.actor_critic = actor_critic
20 | self.acktr = acktr
21 |
22 | self.value_loss_coef = value_loss_coef
23 | self.entropy_coef = entropy_coef
24 |
25 | self.max_grad_norm = max_grad_norm
26 |
27 | if acktr:
28 | self.optimizer = KFACOptimizer(actor_critic)
29 | else:
30 | self.optimizer = optim.RMSprop(
31 | actor_critic.parameters(), lr, eps=eps, alpha=alpha)
32 |
33 | def update(self, rollouts):
34 | obs_shape = rollouts.obs.size()[2:]
35 | action_shape = rollouts.actions.size()[-1]
36 | num_steps, num_processes, _ = rollouts.rewards.size()
37 |
38 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions(
39 | rollouts.obs[:-1].view(-1, *obs_shape),
40 | rollouts.recurrent_hidden_states[0].view(
41 | -1, self.actor_critic.recurrent_hidden_state_size),
42 | rollouts.masks[:-1].view(-1, 1),
43 | rollouts.actions.view(-1, action_shape))
44 |
45 | values = values.view(num_steps, num_processes, 1)
46 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1)
47 |
48 | advantages = rollouts.returns[:-1] - values
49 | value_loss = advantages.pow(2).mean()
50 |
51 | action_loss = -(advantages.detach() * action_log_probs).mean()
52 |
53 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0:
54 | # Sampled fisher, see Martens 2014
55 | self.actor_critic.zero_grad()
56 | pg_fisher_loss = -action_log_probs.mean()
57 |
58 | value_noise = torch.randn(values.size())
59 | if values.is_cuda:
60 | value_noise = value_noise.cuda()
61 |
62 | sample_values = values + value_noise
63 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean()
64 |
65 | fisher_loss = pg_fisher_loss + vf_fisher_loss
66 | self.optimizer.acc_stats = True
67 | fisher_loss.backward(retain_graph=True)
68 | self.optimizer.acc_stats = False
69 |
70 | self.optimizer.zero_grad()
71 | (value_loss * self.value_loss_coef + action_loss -
72 | dist_entropy * self.entropy_coef).backward()
73 |
74 | if self.acktr == False:
75 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
76 | self.max_grad_norm)
77 |
78 | self.optimizer.step()
79 |
80 | return value_loss.item(), action_loss.item(), dist_entropy.item()
81 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/utils.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 |
4 | import torch
5 | import torch.nn as nn
6 |
7 | from a2c_ppo_acktr.envs import VecNormalize
8 |
9 |
10 | # Get a render function
11 | def get_render_func(venv):
12 | if hasattr(venv, 'envs'):
13 | return venv.envs[0].render
14 | elif hasattr(venv, 'venv'):
15 | return get_render_func(venv.venv)
16 | elif hasattr(venv, 'env'):
17 | return get_render_func(venv.env)
18 |
19 | return None
20 |
21 |
22 | def get_vec_normalize(venv):
23 | if isinstance(venv, VecNormalize):
24 | return venv
25 | elif hasattr(venv, 'venv'):
26 | return get_vec_normalize(venv.venv)
27 |
28 | return None
29 |
30 |
31 | # Necessary for my KFAC implementation.
32 | class AddBias(nn.Module):
33 | def __init__(self, bias):
34 | super(AddBias, self).__init__()
35 | self._bias = nn.Parameter(bias.unsqueeze(1))
36 |
37 | def forward(self, x):
38 | if x.dim() == 2:
39 | bias = self._bias.t().view(1, -1)
40 | else:
41 | bias = self._bias.t().view(1, -1, 1, 1)
42 |
43 | return x + bias
44 |
45 |
46 | def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
47 | """Decreases the learning rate linearly"""
48 | lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
49 | for param_group in optimizer.param_groups:
50 | param_group['lr'] = lr
51 |
52 |
53 | def init(module, weight_init, bias_init, gain=1):
54 | weight_init(module.weight.data, gain=gain)
55 | bias_init(module.bias.data)
56 | return module
57 |
58 |
59 | def cleanup_log_dir(log_dir):
60 | try:
61 | os.makedirs(log_dir)
62 | except OSError:
63 | files = glob.glob(os.path.join(log_dir, '*.monitor.csv'))
64 | for f in files:
65 | os.remove(f)
66 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/.benchmark_pattern:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/.gitignore:
--------------------------------------------------------------------------------
1 | *.swp
2 | *.pyc
3 | *.pkl
4 | *.py~
5 | .pytest_cache
6 | .DS_Store
7 | .idea
8 |
9 | # Setuptools distribution and build folders.
10 | /dist/
11 | /build
12 | keys/
13 |
14 | # Virtualenv
15 | /env
16 |
17 |
18 | *.sublime-project
19 | *.sublime-workspace
20 |
21 | .idea
22 |
23 | logs/
24 |
25 | .ipynb_checkpoints
26 | ghostdriver.log
27 |
28 | htmlcov
29 |
30 | junk
31 | src
32 |
33 | *.egg-info
34 | .cache
35 |
36 | MUJOCO_LOG.TXT
37 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 |
5 | services:
6 | - docker
7 |
8 | install:
9 | - pip install flake8
10 | - docker build . -t baselines-test
11 |
12 | script:
13 | - flake8 . --show-source --statistics
14 | - docker run -e RUNSLOW=1 baselines-test pytest -v .
15 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.6
2 |
3 | RUN apt-get -y update && apt-get -y install ffmpeg
4 | # RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv
5 |
6 | ENV CODE_DIR /root/code
7 |
8 | COPY . $CODE_DIR/baselines
9 | WORKDIR $CODE_DIR/baselines
10 |
11 | # Clean up pycache and pyc files
12 | RUN rm -rf __pycache__ && \
13 | find . -name "*.pyc" -delete && \
14 | pip install tensorflow && \
15 | pip install -e .[test]
16 |
17 |
18 | CMD /bin/bash
19 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2017 OpenAI (http://openai.com)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/a2c/README.md:
--------------------------------------------------------------------------------
1 | # A2C
2 |
3 | - Original paper: https://arxiv.org/abs/1602.01783
4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
5 | - `python -m baselines.run --alg=a2c --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on the Atari game Pong. See help (`-h`) for more options.
6 | - also refer to the repo-wide [README.md](../../README.md#training-models)
7 |
8 | ## Files
9 | - `run_atari`: file used to run the algorithm.
10 | - `policies.py`: contains the different versions of the A2C architecture (MlpPolicy, CNNPolicy, LstmPolicy...).
11 | - `a2c.py`: - Model: class used to initialize the step_model (sampling) and train_model (training)
12 |             - learn: main entrypoint for the A2C algorithm. Trains a policy with a given network architecture on a given environment using the A2C algorithm.
13 | - `runner.py`: class used to generate a batch of experiences
14 |
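15 | ## Example: calling `learn` from Python
16 | 
17 | A minimal sketch (an editorial illustration, not part of the upstream README) of invoking the A2C `learn` entrypoint directly instead of going through `baselines.run`; the exact `make_vec_env` and `learn` keyword arguments may differ slightly between baselines versions:
18 | 
19 | ```python
20 | from baselines.common.cmd_util import make_vec_env
21 | from baselines.a2c import a2c
22 | 
23 | # Build a small vectorized Atari environment and train briefly.
24 | env = make_vec_env('PongNoFrameskip-v4', 'atari', num_env=4, seed=0)
25 | model = a2c.learn(network='cnn', env=env, total_timesteps=10000)
26 | 
27 | # The returned Model exposes step() for sampling actions and value estimates.
28 | obs = env.reset()
29 | actions, values, states, _ = model.step(obs)
30 | ```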
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/a2c/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/a2c/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/a2c/runner.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from baselines.a2c.utils import discount_with_dones
3 | from baselines.common.runners import AbstractEnvRunner
4 |
5 | class Runner(AbstractEnvRunner):
6 | """
7 | We use this class to generate batches of experiences
8 |
9 | __init__:
10 | - Initialize the runner
11 |
12 | run():
13 | - Make a mini batch of experiences
14 | """
15 | def __init__(self, env, model, nsteps=5, gamma=0.99):
16 | super().__init__(env=env, model=model, nsteps=nsteps)
17 | self.gamma = gamma
18 | self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()]
19 | self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype
20 |
21 | def run(self):
22 | # We initialize the lists that will contain the mb of experiences
23 | mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
24 | mb_states = self.states
25 | epinfos = []
26 | for n in range(self.nsteps):
27 | # Given observations, take action and value (V(s))
28 |             # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
29 | actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones)
30 |
31 | # Append the experiences
32 | mb_obs.append(np.copy(self.obs))
33 | mb_actions.append(actions)
34 | mb_values.append(values)
35 | mb_dones.append(self.dones)
36 |
37 |             # Take actions in env and look at the results
38 | obs, rewards, dones, infos = self.env.step(actions)
39 | for info in infos:
40 | maybeepinfo = info.get('episode')
41 | if maybeepinfo: epinfos.append(maybeepinfo)
42 | self.states = states
43 | self.dones = dones
44 | self.obs = obs
45 | mb_rewards.append(rewards)
46 | mb_dones.append(self.dones)
47 |
48 | # Batch of steps to batch of rollouts
49 | mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape)
50 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
51 | mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0)
52 | mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
53 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
54 | mb_masks = mb_dones[:, :-1]
55 | mb_dones = mb_dones[:, 1:]
56 |
57 |
58 | if self.gamma > 0.0:
59 | # Discount/bootstrap off value fn
60 | last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist()
61 | for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
62 | rewards = rewards.tolist()
63 | dones = dones.tolist()
64 | if dones[-1] == 0:
65 | rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1]
66 | else:
67 | rewards = discount_with_dones(rewards, dones, self.gamma)
68 |
69 | mb_rewards[n] = rewards
70 |
71 | mb_actions = mb_actions.reshape(self.batch_action_shape)
72 |
73 | mb_rewards = mb_rewards.flatten()
74 | mb_values = mb_values.flatten()
75 | mb_masks = mb_masks.flatten()
76 | return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos
77 |
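78 | # Editorial usage sketch (an assumption, not part of the upstream file): given a
79 | # Model built by baselines.a2c.a2c.Model and a vectorized env, one call collects a
80 | # batch for a single A2C update:
81 | #
82 | #   runner = Runner(env, model, nsteps=5, gamma=0.99)
83 | #   obs, states, rewards, masks, actions, values, epinfos = runner.run()
84 | #
85 | # The returned arrays are flattened to nenv * nsteps rows.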
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/README.md:
--------------------------------------------------------------------------------
1 | # ACER
2 |
3 | - Original paper: https://arxiv.org/abs/1611.01224
4 | - `python -m baselines.run --alg=acer --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on the Atari game Pong. See help (`-h`) for more options.
5 | - also refer to the repo-wide [README.md](../../README.md#training-models)
6 |
7 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/defaults.py:
--------------------------------------------------------------------------------
1 | def atari():
2 | return dict(
3 | lrschedule='constant'
4 | )
5 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/policies.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from baselines.common.policies import nature_cnn
4 | from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample
5 |
6 |
7 | class AcerCnnPolicy(object):
8 |
9 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
10 | nbatch = nenv * nsteps
11 | nh, nw, nc = ob_space.shape
12 | ob_shape = (nbatch, nh, nw, nc * nstack)
13 | nact = ac_space.n
14 | X = tf.placeholder(tf.uint8, ob_shape) # obs
15 | with tf.variable_scope("model", reuse=reuse):
16 | h = nature_cnn(X)
17 | pi_logits = fc(h, 'pi', nact, init_scale=0.01)
18 | pi = tf.nn.softmax(pi_logits)
19 | q = fc(h, 'q', nact)
20 |
21 | a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead
22 | self.initial_state = [] # not stateful
23 | self.X = X
24 | self.pi = pi # actual policy params now
25 | self.pi_logits = pi_logits
26 | self.q = q
27 | self.vf = q
28 |
29 | def step(ob, *args, **kwargs):
30 | # returns actions, mus, states
31 | a0, pi0 = sess.run([a, pi], {X: ob})
32 | return a0, pi0, [] # dummy state
33 |
34 | def out(ob, *args, **kwargs):
35 | pi0, q0 = sess.run([pi, q], {X: ob})
36 | return pi0, q0
37 |
38 | def act(ob, *args, **kwargs):
39 | return sess.run(a, {X: ob})
40 |
41 | self.step = step
42 | self.out = out
43 | self.act = act
44 |
45 | class AcerLstmPolicy(object):
46 |
47 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256):
48 | nbatch = nenv * nsteps
49 | nh, nw, nc = ob_space.shape
50 | ob_shape = (nbatch, nh, nw, nc * nstack)
51 | nact = ac_space.n
52 | X = tf.placeholder(tf.uint8, ob_shape) # obs
53 | M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1)
54 | S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states
55 | with tf.variable_scope("model", reuse=reuse):
56 | h = nature_cnn(X)
57 |
58 | # lstm
59 | xs = batch_to_seq(h, nenv, nsteps)
60 | ms = batch_to_seq(M, nenv, nsteps)
61 | h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm)
62 | h5 = seq_to_batch(h5)
63 |
64 | pi_logits = fc(h5, 'pi', nact, init_scale=0.01)
65 | pi = tf.nn.softmax(pi_logits)
66 | q = fc(h5, 'q', nact)
67 |
68 | a = sample(pi_logits) # could change this to use self.pi instead
69 | self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32)
70 | self.X = X
71 | self.M = M
72 | self.S = S
73 | self.pi = pi # actual policy params now
74 | self.q = q
75 |
76 | def step(ob, state, mask, *args, **kwargs):
77 | # returns actions, mus, states
78 | a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask})
79 | return a0, pi0, s
80 |
81 | self.step = step
82 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/runner.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from baselines.common.runners import AbstractEnvRunner
3 | from baselines.common.vec_env.vec_frame_stack import VecFrameStack
4 | from gym import spaces
5 |
6 |
7 | class Runner(AbstractEnvRunner):
8 |
9 | def __init__(self, env, model, nsteps):
10 | super().__init__(env=env, model=model, nsteps=nsteps)
11 | assert isinstance(env.action_space, spaces.Discrete), 'This ACER implementation works only with discrete action spaces!'
12 | assert isinstance(env, VecFrameStack)
13 |
14 | self.nact = env.action_space.n
15 | nenv = self.nenv
16 | self.nbatch = nenv * nsteps
17 | self.batch_ob_shape = (nenv*(nsteps+1),) + env.observation_space.shape
18 |
19 | self.obs = env.reset()
20 | self.obs_dtype = env.observation_space.dtype
21 | self.ac_dtype = env.action_space.dtype
22 | self.nstack = self.env.nstack
23 | self.nc = self.batch_ob_shape[-1] // self.nstack
24 |
25 |
26 | def run(self):
27 | # enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps
28 | enc_obs = np.split(self.env.stackedobs, self.env.nstack, axis=-1)
29 | mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], []
30 | for _ in range(self.nsteps):
31 | actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones)
32 | mb_obs.append(np.copy(self.obs))
33 | mb_actions.append(actions)
34 | mb_mus.append(mus)
35 | mb_dones.append(self.dones)
36 | obs, rewards, dones, _ = self.env.step(actions)
37 |             # state information for stateful models like LSTM
38 | self.states = states
39 | self.dones = dones
40 | self.obs = obs
41 | mb_rewards.append(rewards)
42 | enc_obs.append(obs[..., -self.nc:])
43 | mb_obs.append(np.copy(self.obs))
44 | mb_dones.append(self.dones)
45 |
46 | enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0)
47 | mb_obs = np.asarray(mb_obs, dtype=self.obs_dtype).swapaxes(1, 0)
48 | mb_actions = np.asarray(mb_actions, dtype=self.ac_dtype).swapaxes(1, 0)
49 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
50 | mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0)
51 |
52 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
53 |
54 |         mb_masks = mb_dones # Used for stateful models like LSTMs to mask state when done
55 | mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards
56 |
57 | # shapes are now [nenv, nsteps, []]
58 | # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy.
59 |
60 | return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks
61 |
62 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/README.md:
--------------------------------------------------------------------------------
1 | # ACKTR
2 |
3 | - Original paper: https://arxiv.org/abs/1708.05144
4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/
5 | - `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on the Atari game Pong. See help (`-h`) for more options.
6 | - also refer to the repo-wide [README.md](../../README.md#training-models)
7 |
8 | ## ACKTR with continuous action spaces
9 | The ACKTR code has been refactored to handle both discrete and continuous action spaces uniformly. In the original version, discrete and continuous action spaces were handled by different code (acktr_disc.py and acktr_cont.py) with little overlap. If you are interested in the original version of ACKTR for continuous action spaces, use the `old_acktr_cont` branch. Note that the original code performs better on the MuJoCo tasks than the refactored version; we are still investigating why.
10 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/defaults.py:
--------------------------------------------------------------------------------
1 | def mujoco():
2 | return dict(
3 | nsteps=2500,
4 | value_network='copy'
5 | )
6 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None):
4 | with tf.variable_scope(name, reuse=reuse):
5 | assert (len(tf.get_variable_scope().name.split('/')) == 2)
6 |
7 | w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init)
8 | b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init))
9 | weight_decay_fc = 3e-4
10 |
11 | if weight_loss_dict is not None:
12 | weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss')
13 |             # the outer check already guarantees weight_loss_dict is not None
14 |             weight_loss_dict[w] = weight_decay_fc
15 |             weight_loss_dict[b] = 0.0
16 |
17 | tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay)
18 |
19 | return tf.nn.bias_add(tf.matmul(x, w), b)
20 |
21 | def kl_div(action_dist1, action_dist2, action_size):
22 | mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:]
23 | mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:]
24 |
25 | numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2)
26 | denominator = 2 * tf.square(std2) + 1e-8
27 | return tf.reduce_sum(
28 | numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1)
29 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/bench/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa F403
2 | from baselines.bench.benchmarks import *
3 | from baselines.bench.monitor import *
4 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/bench/test_monitor.py:
--------------------------------------------------------------------------------
1 | from .monitor import Monitor
2 | import gym
3 | import json
4 |
5 | def test_monitor():
6 | import pandas
7 | import os
8 | import uuid
9 |
10 | env = gym.make("CartPole-v1")
11 | env.seed(0)
12 | mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4()
13 | menv = Monitor(env, mon_file)
14 | menv.reset()
15 | for _ in range(1000):
16 | _, _, done, _ = menv.step(0)
17 | if done:
18 | menv.reset()
19 |
20 | f = open(mon_file, 'rt')
21 |
22 | firstline = f.readline()
23 | assert firstline.startswith('#')
24 | metadata = json.loads(firstline[1:])
25 | assert metadata['env_id'] == "CartPole-v1"
26 | assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata"
27 |
28 | last_logline = pandas.read_csv(f, index_col=None)
29 | assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline"
30 | f.close()
31 | os.remove(mon_file)
32 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa F403
2 | from baselines.common.console_util import *
3 | from baselines.common.dataset import Dataset
4 | from baselines.common.math_util import *
5 | from baselines.common.misc_util import *
6 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/cg.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10):
3 | """
4 | Demmel p 312
5 | """
6 | p = b.copy()
7 | r = b.copy()
8 | x = np.zeros_like(b)
9 | rdotr = r.dot(r)
10 |
11 | fmtstr = "%10i %10.3g %10.3g"
12 | titlestr = "%10s %10s %10s"
13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm"))
14 |
15 | for i in range(cg_iters):
16 | if callback is not None:
17 | callback(x)
18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x)))
19 | z = f_Ax(p)
20 | v = rdotr / p.dot(z)
21 | x += v*p
22 | r -= v*z
23 | newrdotr = r.dot(r)
24 | mu = newrdotr/rdotr
25 | p = r + mu*p
26 |
27 | rdotr = newrdotr
28 | if rdotr < residual_tol:
29 | break
30 |
31 | if callback is not None:
32 | callback(x)
33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631
34 | return x
35 |
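36 | if __name__ == '__main__':
37 |     # Editorial smoke test (not part of the upstream file): solve A x = b for a small
38 |     # symmetric positive-definite A by passing a matrix-vector product to cg().
39 |     A = np.array([[4.0, 1.0], [1.0, 3.0]])
40 |     b = np.array([1.0, 2.0])
41 |     x = cg(lambda v: A.dot(v), b, cg_iters=10)
42 |     assert np.allclose(A.dot(x), b, atol=1e-6)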
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/console_util.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | from contextlib import contextmanager
3 | import numpy as np
4 | import time
5 | import shlex
6 | import subprocess
7 |
8 | # ================================================================
9 | # Misc
10 | # ================================================================
11 |
12 | def fmt_row(width, row, header=False):
13 | out = " | ".join(fmt_item(x, width) for x in row)
14 | if header: out = out + "\n" + "-"*len(out)
15 | return out
16 |
17 | def fmt_item(x, l):
18 | if isinstance(x, np.ndarray):
19 | assert x.ndim==0
20 | x = x.item()
21 | if isinstance(x, (float, np.float32, np.float64)):
22 | v = abs(x)
23 | if (v < 1e-4 or v > 1e+4) and v > 0:
24 | rep = "%7.2e" % x
25 | else:
26 | rep = "%7.5f" % x
27 | else: rep = str(x)
28 | return " "*(l - len(rep)) + rep
29 |
30 | color2num = dict(
31 | gray=30,
32 | red=31,
33 | green=32,
34 | yellow=33,
35 | blue=34,
36 | magenta=35,
37 | cyan=36,
38 | white=37,
39 | crimson=38
40 | )
41 |
42 | def colorize(string, color='green', bold=False, highlight=False):
43 | attr = []
44 | num = color2num[color]
45 | if highlight: num += 10
46 | attr.append(str(num))
47 | if bold: attr.append('1')
48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string)
49 |
50 | def print_cmd(cmd, dry=False):
51 | if isinstance(cmd, str): # for shell=True
52 | pass
53 | else:
54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd)
55 | print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd))
56 |
57 |
58 | def get_git_commit(cwd=None):
59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd).decode('utf8')
60 |
61 | def get_git_commit_message(cwd=None):
62 | return subprocess.check_output(['git', 'show', '-s', '--format=%B', 'HEAD'], cwd=cwd).decode('utf8')
63 |
64 | def ccap(cmd, dry=False, env=None, **kwargs):
65 | print_cmd(cmd, dry)
66 | if not dry:
67 | subprocess.check_call(cmd, env=env, **kwargs)
68 |
69 |
70 | MESSAGE_DEPTH = 0
71 |
72 | @contextmanager
73 | def timed(msg):
74 | global MESSAGE_DEPTH #pylint: disable=W0603
75 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta'))
76 | tstart = time.time()
77 | MESSAGE_DEPTH += 1
78 | yield
79 | MESSAGE_DEPTH -= 1
80 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta'))
81 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/dataset.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | class Dataset(object):
4 | def __init__(self, data_map, deterministic=False, shuffle=True):
5 | self.data_map = data_map
6 | self.deterministic = deterministic
7 | self.enable_shuffle = shuffle
8 | self.n = next(iter(data_map.values())).shape[0]
9 | self._next_id = 0
10 | self.shuffle()
11 |
12 | def shuffle(self):
13 | if self.deterministic:
14 | return
15 | perm = np.arange(self.n)
16 | np.random.shuffle(perm)
17 |
18 | for key in self.data_map:
19 | self.data_map[key] = self.data_map[key][perm]
20 |
21 | self._next_id = 0
22 |
23 | def next_batch(self, batch_size):
24 | if self._next_id >= self.n and self.enable_shuffle:
25 | self.shuffle()
26 |
27 | cur_id = self._next_id
28 | cur_batch_size = min(batch_size, self.n - self._next_id)
29 | self._next_id += cur_batch_size
30 |
31 | data_map = dict()
32 | for key in self.data_map:
33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size]
34 | return data_map
35 |
36 | def iterate_once(self, batch_size):
37 | if self.enable_shuffle: self.shuffle()
38 |
39 | while self._next_id <= self.n - batch_size:
40 | yield self.next_batch(batch_size)
41 | self._next_id = 0
42 |
43 | def subset(self, num_elements, deterministic=True):
44 | data_map = dict()
45 | for key in self.data_map:
46 | data_map[key] = self.data_map[key][:num_elements]
47 | return Dataset(data_map, deterministic)
48 |
49 |
50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True):
51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both'
52 | arrays = tuple(map(np.asarray, arrays))
53 | n = arrays[0].shape[0]
54 | assert all(a.shape[0] == n for a in arrays[1:])
55 | inds = np.arange(n)
56 | if shuffle: np.random.shuffle(inds)
57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches
58 | for batch_inds in np.array_split(inds, sections):
59 | if include_final_partial_batch or len(batch_inds) == batch_size:
60 | yield tuple(a[batch_inds] for a in arrays)
61 |
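62 | # Editorial usage sketch (not part of the upstream file): iterating over two aligned
63 | # arrays in batches of 2, keeping the final partial batch:
64 | #
65 | #   xs = np.arange(5)
66 | #   ys = xs * 10
67 | #   for bx, by in iterbatches((xs, ys), batch_size=2, shuffle=False):
68 | #       print(bx, by)  # batches of sizes 2, 2, and 1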
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/input.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import tensorflow as tf
3 | from gym.spaces import Discrete, Box, MultiDiscrete
4 |
5 | def observation_placeholder(ob_space, batch_size=None, name='Ob'):
6 | '''
7 |     Create a placeholder, of the size appropriate to the observation space, to feed observations into
8 |
9 | Parameters:
10 | ----------
11 |
12 | ob_space: gym.Space observation space
13 |
14 | batch_size: int size of the batch to be fed into input. Can be left None in most cases.
15 |
16 | name: str name of the placeholder
17 |
18 | Returns:
19 | -------
20 |
21 | tensorflow placeholder tensor
22 | '''
23 |
24 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \
25 |         'Can only deal with Discrete, Box, and MultiDiscrete observation spaces for now'
26 |
27 | dtype = ob_space.dtype
28 | if dtype == np.int8:
29 | dtype = np.uint8
30 |
31 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name)
32 |
33 |
34 | def observation_input(ob_space, batch_size=None, name='Ob'):
35 | '''
36 |     Create a placeholder of the size appropriate to the observation space to feed observations into, and add an input
37 | encoder of the appropriate type.
38 | '''
39 |
40 | placeholder = observation_placeholder(ob_space, batch_size, name)
41 | return placeholder, encode_observation(ob_space, placeholder)
42 |
43 | def encode_observation(ob_space, placeholder):
44 | '''
45 | Encode input in the way that is appropriate to the observation space
46 |
47 | Parameters:
48 | ----------
49 |
50 | ob_space: gym.Space observation space
51 |
52 | placeholder: tf.placeholder observation input placeholder
53 | '''
54 | if isinstance(ob_space, Discrete):
55 | return tf.to_float(tf.one_hot(placeholder, ob_space.n))
56 | elif isinstance(ob_space, Box):
57 | return tf.to_float(placeholder)
58 | elif isinstance(ob_space, MultiDiscrete):
59 | placeholder = tf.cast(placeholder, tf.int32)
60 | one_hots = [tf.to_float(tf.one_hot(placeholder[..., i], ob_space.nvec[i])) for i in range(placeholder.shape[-1])]
61 | return tf.concat(one_hots, axis=-1)
62 | else:
63 | raise NotImplementedError
64 |
65 |
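66 | # Editorial usage sketch (not part of the upstream file): for a Discrete(4)
67 | # observation space, observation_input yields an integer placeholder of shape
68 | # (batch,) and a float32 one-hot encoding of shape (batch, 4):
69 | #
70 | #   space = Discrete(4)
71 | #   ph, encoded = observation_input(space)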
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/math_util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.signal
3 |
4 |
5 | def discount(x, gamma):
6 | """
7 | computes discounted sums along 0th dimension of x.
8 |
9 | inputs
10 | ------
11 | x: ndarray
12 | gamma: float
13 |
14 | outputs
15 | -------
16 | y: ndarray with same shape as x, satisfying
17 |
18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k],
19 | where k = len(x) - t - 1
20 |
21 | """
22 | assert x.ndim >= 1
23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1]
24 |
25 | def explained_variance(ypred,y):
26 | """
27 | Computes fraction of variance that ypred explains about y.
28 | Returns 1 - Var[y-ypred] / Var[y]
29 |
30 | interpretation:
31 | ev=0 => might as well have predicted zero
32 | ev=1 => perfect prediction
33 | ev<0 => worse than just predicting zero
34 |
35 | """
36 | assert y.ndim == 1 and ypred.ndim == 1
37 | vary = np.var(y)
38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary
39 |
40 | def explained_variance_2d(ypred, y):
41 | assert y.ndim == 2 and ypred.ndim == 2
42 | vary = np.var(y, axis=0)
43 |     out = 1 - np.var(y-ypred, axis=0)/vary  # per-column residual variance
44 | out[vary < 1e-10] = 0
45 | return out
46 |
47 | def ncc(ypred, y):
48 | return np.corrcoef(ypred, y)[1,0]
49 |
50 | def flatten_arrays(arrs):
51 | return np.concatenate([arr.flat for arr in arrs])
52 |
53 | def unflatten_vector(vec, shapes):
54 | i=0
55 | arrs = []
56 | for shape in shapes:
57 | size = np.prod(shape)
58 | arr = vec[i:i+size].reshape(shape)
59 | arrs.append(arr)
60 | i += size
61 | return arrs
62 |
63 | def discount_with_boundaries(X, New, gamma):
64 | """
65 | X: 2d array of floats, time x features
66 | New: 2d array of bools, indicating when a new episode has started
67 | """
68 | Y = np.zeros_like(X)
69 | T = X.shape[0]
70 | Y[T-1] = X[T-1]
71 | for t in range(T-2, -1, -1):
72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1])
73 | return Y
74 |
75 | def test_discount_with_boundaries():
76 | gamma=0.9
77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32')
78 | starts = [1.0, 0.0, 0.0, 1.0]
79 | y = discount_with_boundaries(x, starts, gamma)
80 | assert np.allclose(y, [
81 | 1 + gamma * 2 + gamma**2 * 3,
82 | 2 + gamma * 3,
83 | 3,
84 | 4
85 | ])
86 |
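87 | if __name__ == '__main__':
88 |     # Editorial check (not part of the upstream file): worked example of discount().
89 |     # With gamma = 0.5: y[2] = 1, y[1] = 1 + 0.5*1 = 1.5, y[0] = 1 + 0.5*1.5 = 1.75.
90 |     assert np.allclose(discount(np.array([1.0, 1.0, 1.0]), 0.5), [1.75, 1.5, 1.0])
91 |     test_discount_with_boundaries()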
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/mpi_fork.py:
--------------------------------------------------------------------------------
1 | import os, subprocess, sys
2 |
3 | def mpi_fork(n, bind_to_core=False):
4 | """Re-launches the current script with workers
5 | Returns "parent" for original parent, "child" for MPI children
6 | """
7 | if n<=1:
8 | return "child"
9 | if os.getenv("IN_MPI") is None:
10 | env = os.environ.copy()
11 | env.update(
12 | MKL_NUM_THREADS="1",
13 | OMP_NUM_THREADS="1",
14 | IN_MPI="1"
15 | )
16 | args = ["mpirun", "-np", str(n)]
17 | if bind_to_core:
18 | args += ["-bind-to", "core"]
19 | args += [sys.executable] + sys.argv
20 | subprocess.check_call(args, env=env)
21 | return "parent"
22 | else:
23 | return "child"
24 |
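25 | # Editorial usage sketch (not part of the upstream file): typical pattern at the top
26 | # of a script that should be re-launched under 4 MPI workers.
27 | #
28 | #   if mpi_fork(4) == "parent":
29 | #       sys.exit(0)
30 | #   # from here on, each of the 4 MPI children runs the actual work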
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/mpi_moments.py:
--------------------------------------------------------------------------------
1 | from mpi4py import MPI
2 | import numpy as np
3 | from baselines.common import zipsame
4 |
5 |
6 | def mpi_mean(x, axis=0, comm=None, keepdims=False):
7 | x = np.asarray(x)
8 | assert x.ndim > 0
9 | if comm is None: comm = MPI.COMM_WORLD
10 | xsum = x.sum(axis=axis, keepdims=keepdims)
11 | n = xsum.size
12 | localsum = np.zeros(n+1, x.dtype)
13 | localsum[:n] = xsum.ravel()
14 | localsum[n] = x.shape[axis]
15 | globalsum = np.zeros_like(localsum)
16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM)
17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n]
18 |
19 | def mpi_moments(x, axis=0, comm=None, keepdims=False):
20 | x = np.asarray(x)
21 | assert x.ndim > 0
22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True)
23 | sqdiffs = np.square(x - mean)
24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True)
25 | assert count1 == count
26 | std = np.sqrt(meansqdiff)
27 | if not keepdims:
28 | newshape = mean.shape[:axis] + mean.shape[axis+1:]
29 | mean = mean.reshape(newshape)
30 | std = std.reshape(newshape)
31 | return mean, std, count
32 |
33 |
34 | def test_runningmeanstd():
35 | import subprocess
36 | subprocess.check_call(['mpirun', '-np', '3',
37 | 'python','-c',
38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()'])
39 |
40 | def _helper_runningmeanstd():
41 | comm = MPI.COMM_WORLD
42 | np.random.seed(0)
43 | for (triple,axis) in [
44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0),
45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0),
46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1),
47 | ]:
48 |
49 |
50 | x = np.concatenate(triple, axis=axis)
51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]]
52 |
53 |
54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis)
55 |
56 | for (a1,a2) in zipsame(ms1, ms2):
57 | print(a1, a2)
58 | assert np.allclose(a1, a2)
59 | print("ok!")
60 |
61 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/runners.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from abc import ABC, abstractmethod
3 |
4 | class AbstractEnvRunner(ABC):
5 | def __init__(self, *, env, model, nsteps):
6 | self.env = env
7 | self.model = model
8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1
9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape
10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name)
11 | self.obs[:] = env.reset()
12 | self.nsteps = nsteps
13 | self.states = model.initial_state
14 | self.dones = [False for _ in range(nenv)]
15 |
16 | @abstractmethod
17 | def run(self):
18 | raise NotImplementedError
19 |
20 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/test_mpi_util.py:
--------------------------------------------------------------------------------
1 | from baselines.common import mpi_util
2 | from baselines import logger
3 | from baselines.common.tests.test_with_mpi import with_mpi
4 | try:
5 | from mpi4py import MPI
6 | except ImportError:
7 | MPI = None
8 |
9 | @with_mpi()
10 | def test_mpi_weighted_mean():
11 | comm = MPI.COMM_WORLD
12 | with logger.scoped_configure(comm=comm):
13 | if comm.rank == 0:
14 | name2valcount = {'a' : (10, 2), 'b' : (20,3)}
15 | elif comm.rank == 1:
16 | name2valcount = {'a' : (19, 1), 'c' : (42,3)}
17 | else:
18 | raise NotImplementedError
19 | d = mpi_util.mpi_weighted_mean(comm, name2valcount)
20 | correctval = {'a' : (10 * 2 + 19) / 3.0, 'b' : 20, 'c' : 42}
21 | if comm.rank == 0:
22 | assert d == correctval, '{} != {}'.format(d, correctval)
23 |
24 | for name, (val, count) in name2valcount.items():
25 | for _ in range(count):
26 | logger.logkv_mean(name, val)
27 | d2 = logger.dumpkvs()
28 | if comm.rank == 0:
29 | assert d2 == correctval
30 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/__init__.py:
--------------------------------------------------------------------------------
1 | import os, pytest
2 | mark_slow = pytest.mark.skipif(not os.getenv('RUNSLOW'), reason='slow')
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/fixed_sequence_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from gym import Env
3 | from gym.spaces import Discrete
4 |
5 |
6 | class FixedSequenceEnv(Env):
7 | def __init__(
8 | self,
9 | n_actions=10,
10 | episode_len=100
11 | ):
12 | self.action_space = Discrete(n_actions)
13 | self.observation_space = Discrete(1)
14 | self.np_random = np.random.RandomState(0)
15 | self.episode_len = episode_len
16 | self.sequence = [self.np_random.randint(0, self.action_space.n)
17 | for _ in range(self.episode_len)]
18 | self.time = 0
19 |
20 |
21 | def reset(self):
22 | self.time = 0
23 | return 0
24 |
25 | def step(self, actions):
26 | rew = self._get_reward(actions)
27 | self._choose_next_state()
28 | done = False
29 | if self.episode_len and self.time >= self.episode_len:
30 | done = True
31 |
32 | return 0, rew, done, {}
33 |
34 | def seed(self, seed=None):
35 | self.np_random.seed(seed)
36 |
37 | def _choose_next_state(self):
38 | self.time += 1
39 |
40 | def _get_reward(self, actions):
41 | return 1 if actions == self.sequence[self.time] else 0
42 |
43 |
44 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/identity_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from abc import abstractmethod
3 | from gym import Env
4 | from gym.spaces import MultiDiscrete, Discrete, Box
5 | from collections import deque
6 |
7 | class IdentityEnv(Env):
8 | def __init__(
9 | self,
10 | episode_len=None,
11 | delay=0,
12 | zero_first_rewards=True
13 | ):
14 |
15 | self.observation_space = self.action_space
16 | self.episode_len = episode_len
17 | self.time = 0
18 | self.delay = delay
19 | self.zero_first_rewards = zero_first_rewards
20 | self.q = deque(maxlen=delay+1)
21 |
22 | def reset(self):
23 | self.q.clear()
24 | for _ in range(self.delay + 1):
25 | self.q.append(self.action_space.sample())
26 | self.time = 0
27 |
28 | return self.q[-1]
29 |
30 | def step(self, actions):
31 | rew = self._get_reward(self.q.popleft(), actions)
32 | if self.zero_first_rewards and self.time < self.delay:
33 | rew = 0
34 | self.q.append(self.action_space.sample())
35 | self.time += 1
36 | done = self.episode_len is not None and self.time >= self.episode_len
37 | return self.q[-1], rew, done, {}
38 |
39 | def seed(self, seed=None):
40 | self.action_space.seed(seed)
41 |
42 | @abstractmethod
43 | def _get_reward(self, state, actions):
44 | raise NotImplementedError
45 |
46 |
47 | class DiscreteIdentityEnv(IdentityEnv):
48 | def __init__(
49 | self,
50 | dim,
51 | episode_len=None,
52 | delay=0,
53 | zero_first_rewards=True
54 | ):
55 |
56 | self.action_space = Discrete(dim)
57 | super().__init__(episode_len=episode_len, delay=delay, zero_first_rewards=zero_first_rewards)
58 |
59 | def _get_reward(self, state, actions):
60 | return 1 if state == actions else 0
61 |
62 | class MultiDiscreteIdentityEnv(IdentityEnv):
63 | def __init__(
64 | self,
65 | dims,
66 | episode_len=None,
67 | delay=0,
68 | ):
69 |
70 | self.action_space = MultiDiscrete(dims)
71 | super().__init__(episode_len=episode_len, delay=delay)
72 |
73 | def _get_reward(self, state, actions):
74 | return 1 if all(state == actions) else 0
75 |
76 |
77 | class BoxIdentityEnv(IdentityEnv):
78 | def __init__(
79 | self,
80 | shape,
81 | episode_len=None,
82 | ):
83 |
84 | self.action_space = Box(low=-1.0, high=1.0, shape=shape, dtype=np.float32)
85 | super().__init__(episode_len=episode_len)
86 |
87 | def _get_reward(self, state, actions):
88 | diff = actions - state
89 | diff = diff[:]
90 | return -0.5 * np.dot(diff, diff)
91 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/identity_env_test.py:
--------------------------------------------------------------------------------
1 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv
2 |
3 |
4 | def test_discrete_nodelay():
5 | nsteps = 100
6 | eplen = 50
7 | env = DiscreteIdentityEnv(10, episode_len=eplen)
8 | ob = env.reset()
9 | for t in range(nsteps):
10 | action = env.action_space.sample()
11 | next_ob, rew, done, info = env.step(action)
12 | assert rew == (1 if action == ob else 0)
13 | if (t + 1) % eplen == 0:
14 | assert done
15 | next_ob = env.reset()
16 | else:
17 | assert not done
18 | ob = next_ob
19 |
20 | def test_discrete_delay1():
21 | eplen = 50
22 | env = DiscreteIdentityEnv(10, episode_len=eplen, delay=1)
23 | ob = env.reset()
24 | prev_ob = None
25 | for t in range(eplen):
26 | action = env.action_space.sample()
27 | next_ob, rew, done, info = env.step(action)
28 | if t > 0:
29 | assert rew == (1 if action == prev_ob else 0)
30 | else:
31 | assert rew == 0
32 | prev_ob = ob
33 | ob = next_ob
34 | if t < eplen - 1:
35 | assert not done
36 | assert done
37 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/mnist_env.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import numpy as np
3 | import tempfile
4 | from gym import Env
5 | from gym.spaces import Discrete, Box
6 |
7 |
8 |
9 | class MnistEnv(Env):
10 | def __init__(
11 | self,
12 | episode_len=None,
13 | no_images=None
14 | ):
15 | import filelock
16 | from tensorflow.examples.tutorials.mnist import input_data
17 |         # we could use a temporary directory for this with a context manager and
18 |         # TemporaryDirectory, but then each test that uses MNIST would re-download the data;
19 |         # this way the data is not cleaned up, but we only download it once per machine
20 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data')
21 | with filelock.FileLock(mnist_path + '.lock'):
22 | self.mnist = input_data.read_data_sets(mnist_path)
23 |
24 | self.np_random = np.random.RandomState()
25 |
26 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1))
27 | self.action_space = Discrete(10)
28 | self.episode_len = episode_len
29 | self.time = 0
30 | self.no_images = no_images
31 |
32 | self.train_mode()
33 | self.reset()
34 |
35 | def reset(self):
36 | self._choose_next_state()
37 | self.time = 0
38 |
39 | return self.state[0]
40 |
41 | def step(self, actions):
42 | rew = self._get_reward(actions)
43 | self._choose_next_state()
44 | done = False
45 | if self.episode_len and self.time >= self.episode_len:
46 | rew = 0
47 | done = True
48 |
49 | return self.state[0], rew, done, {}
50 |
51 | def seed(self, seed=None):
52 | self.np_random.seed(seed)
53 |
54 | def train_mode(self):
55 | self.dataset = self.mnist.train
56 |
57 | def test_mode(self):
58 | self.dataset = self.mnist.test
59 |
60 | def _choose_next_state(self):
61 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1
62 | index = self.np_random.randint(0, max_index)
63 | image = self.dataset.images[index].reshape(28,28,1)*255
64 | label = self.dataset.labels[index]
65 | self.state = (image, label)
66 | self.time += 1
67 |
68 | def _get_reward(self, actions):
69 | return 1 if self.state[1] == actions else 0
70 |
71 |
72 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_cartpole.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 |
4 | from baselines.run import get_learn_function
5 | from baselines.common.tests.util import reward_per_episode_test
6 | from baselines.common.tests import mark_slow
7 |
8 | common_kwargs = dict(
9 | total_timesteps=30000,
10 | network='mlp',
11 | gamma=1.0,
12 | seed=0,
13 | )
14 |
15 | learn_kwargs = {
16 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05),
17 | 'acer': dict(value_network='copy'),
18 | 'acktr': dict(nsteps=32, value_network='copy', is_async=False),
19 | 'deepq': dict(total_timesteps=20000),
20 | 'ppo2': dict(value_network='copy'),
21 | 'trpo_mpi': {}
22 | }
23 |
24 | @mark_slow
25 | @pytest.mark.parametrize("alg", learn_kwargs.keys())
26 | def test_cartpole(alg):
27 | '''
28 | Test if the algorithm (with an mlp policy)
29 | can learn to balance the cartpole
30 | '''
31 |
32 | kwargs = common_kwargs.copy()
33 | kwargs.update(learn_kwargs[alg])
34 |
35 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
36 | def env_fn():
37 |
38 | env = gym.make('CartPole-v0')
39 | env.seed(0)
40 | return env
41 |
42 | reward_per_episode_test(env_fn, learn_fn, 100)
43 |
44 | if __name__ == '__main__':
45 | test_cartpole('acer')
46 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_doc_examples.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | try:
3 | import mujoco_py
4 | _mujoco_present = True
5 | except BaseException:
6 | mujoco_py = None
7 | _mujoco_present = False
8 |
9 |
10 | @pytest.mark.skipif(
11 | not _mujoco_present,
12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library'
13 | )
14 | def test_lstm_example():
15 | import tensorflow as tf
16 | from baselines.common import policies, models, cmd_util
17 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
18 |
19 | # create vectorized environment
20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])
21 |
22 | with tf.Session() as sess:
23 | # build policy based on lstm network with 128 units
24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1)
25 |
26 | # initialize tensorflow variables
27 | sess.run(tf.global_variables_initializer())
28 |
29 | # prepare environment variables
30 | ob = venv.reset()
31 | state = policy.initial_state
32 | done = [False]
33 | step_counter = 0
34 |
35 | # run a single episode until the end (i.e. until done)
36 | while True:
37 | action, _, state, _ = policy.step(ob, S=state, M=done)
38 | ob, reward, done, _ = venv.step(action)
39 | step_counter += 1
40 | if done:
41 | break
42 |
43 |
44 | assert step_counter > 5
45 |
46 |
47 |
48 |
49 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_env_after_learn.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 | import tensorflow as tf
4 |
5 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
6 | from baselines.run import get_learn_function
7 | from baselines.common.tf_util import make_session
8 |
9 | algos = ['a2c', 'acer', 'acktr', 'deepq', 'ppo2', 'trpo_mpi']
10 |
11 | @pytest.mark.parametrize('algo', algos)
12 | def test_env_after_learn(algo):
13 | def make_env():
14 | # acktr requires too much RAM, fails on travis
15 | env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4')
16 | return env
17 |
18 | make_session(make_default=True, graph=tf.Graph())
19 | env = SubprocVecEnv([make_env])
20 |
21 | learn = get_learn_function(algo)
22 |
23 |     # Commenting out the following line resolves the issue, though the crash happens at env.reset().
24 | learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None)
25 |
26 | env.reset()
27 | env.close()
28 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_fetchreach.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import gym
3 |
4 | from baselines.run import get_learn_function
5 | from baselines.common.tests.util import reward_per_episode_test
6 | from baselines.common.tests import mark_slow
7 |
8 | pytest.importorskip('mujoco_py')
9 |
10 | common_kwargs = dict(
11 | network='mlp',
12 | seed=0,
13 | )
14 |
15 | learn_kwargs = {
16 | 'her': dict(total_timesteps=2000)
17 | }
18 |
19 | @mark_slow
20 | @pytest.mark.parametrize("alg", learn_kwargs.keys())
21 | def test_fetchreach(alg):
22 | '''
23 | Test if the algorithm (with an mlp policy)
24 | can learn the FetchReach task
25 | '''
26 |
27 | kwargs = common_kwargs.copy()
28 | kwargs.update(learn_kwargs[alg])
29 |
30 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
31 | def env_fn():
32 |
33 | env = gym.make('FetchReach-v1')
34 | env.seed(0)
35 | return env
36 |
37 | reward_per_episode_test(env_fn, learn_fn, -15)
38 |
39 | if __name__ == '__main__':
40 | test_fetchreach('her')
41 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_fixed_sequence.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv
3 |
4 | from baselines.common.tests.util import simple_test
5 | from baselines.run import get_learn_function
6 | from baselines.common.tests import mark_slow
7 |
8 |
9 | common_kwargs = dict(
10 | seed=0,
11 | total_timesteps=50000,
12 | )
13 |
14 | learn_kwargs = {
15 | 'a2c': {},
16 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1),
17 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps)
18 | # github issue: https://github.com/openai/baselines/issues/188
19 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001)
20 | }
21 |
22 |
23 | alg_list = learn_kwargs.keys()
24 | rnn_list = ['lstm']
25 |
26 | @mark_slow
27 | @pytest.mark.parametrize("alg", alg_list)
28 | @pytest.mark.parametrize("rnn", rnn_list)
29 | def test_fixed_sequence(alg, rnn):
30 | '''
31 | Test if the algorithm (with a given policy)
32 | can learn an identity transformation (i.e. return observation as an action)
33 | '''
34 |
35 | kwargs = learn_kwargs[alg]
36 | kwargs.update(common_kwargs)
37 |
38 | env_fn = lambda: FixedSequenceEnv(n_actions=10, episode_len=5)
39 | learn = lambda e: get_learn_function(alg)(
40 | env=e,
41 | network=rnn,
42 | **kwargs
43 | )
44 |
45 | simple_test(env_fn, learn, 0.7)
46 |
47 |
48 | if __name__ == '__main__':
49 | test_fixed_sequence('ppo2', 'lstm')
50 |
51 |
52 |
53 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_identity.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv
3 | from baselines.run import get_learn_function
4 | from baselines.common.tests.util import simple_test
5 | from baselines.common.tests import mark_slow
6 |
7 | common_kwargs = dict(
8 | total_timesteps=30000,
9 | network='mlp',
10 | gamma=0.9,
11 | seed=0,
12 | )
13 |
14 | learn_kwargs = {
15 | 'a2c' : {},
16 | 'acktr': {},
17 | 'deepq': {},
18 | 'ddpg': dict(layer_norm=True),
19 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0),
20 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01)
21 | }
22 |
23 |
24 | algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi']
25 | algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi']
26 | algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi']
27 |
28 | @mark_slow
29 | @pytest.mark.parametrize("alg", algos_disc)
30 | def test_discrete_identity(alg):
31 | '''
32 | Test if the algorithm (with an mlp policy)
33 | can learn an identity transformation (i.e. return observation as an action)
34 | '''
35 |
36 | kwargs = learn_kwargs[alg]
37 | kwargs.update(common_kwargs)
38 |
39 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
40 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100)
41 | simple_test(env_fn, learn_fn, 0.9)
42 |
43 | @mark_slow
44 | @pytest.mark.parametrize("alg", algos_multidisc)
45 | def test_multidiscrete_identity(alg):
46 | '''
47 | Test if the algorithm (with an mlp policy)
48 | can learn an identity transformation (i.e. return observation as an action)
49 | '''
50 |
51 | kwargs = learn_kwargs[alg]
52 | kwargs.update(common_kwargs)
53 |
54 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
55 | env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100)
56 | simple_test(env_fn, learn_fn, 0.9)
57 |
58 | @mark_slow
59 | @pytest.mark.parametrize("alg", algos_cont)
60 | def test_continuous_identity(alg):
61 | '''
62 | Test if the algorithm (with an mlp policy)
63 | can learn an identity transformation (i.e. return observation as an action)
64 | to a required precision
65 | '''
66 |
67 | kwargs = learn_kwargs[alg]
68 | kwargs.update(common_kwargs)
69 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs)
70 |
71 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100)
72 | simple_test(env_fn, learn_fn, -0.1)
73 |
74 | if __name__ == '__main__':
75 | test_multidiscrete_identity('acktr')
76 |
77 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_mnist.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | # from baselines.acer import acer_simple as acer
4 | from baselines.common.tests.envs.mnist_env import MnistEnv
5 | from baselines.common.tests.util import simple_test
6 | from baselines.run import get_learn_function
7 | from baselines.common.tests import mark_slow
8 |
9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem?
10 | # GitHub issue https://github.com/openai/baselines/issues/189
11 | common_kwargs = {
12 | 'seed': 0,
13 | 'network':'cnn',
14 | 'gamma':0.9,
15 | 'pad':'SAME'
16 | }
17 |
18 | learn_args = {
19 | 'a2c': dict(total_timesteps=50000),
20 | 'acer': dict(total_timesteps=20000),
21 | 'deepq': dict(total_timesteps=5000),
22 | 'acktr': dict(total_timesteps=30000),
23 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0),
24 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001)
25 | }
26 |
27 |
28 | # Tests pass, but are too slow on Travis. The same algorithms are covered
29 | # by other tests with less compute-hungry NNs and by benchmarks.
30 | @pytest.mark.skip
31 | @mark_slow
32 | @pytest.mark.parametrize("alg", learn_args.keys())
33 | def test_mnist(alg):
34 | '''
35 | Test if the algorithm can learn to classify MNIST digits.
36 | Uses CNN policy.
37 | '''
38 |
39 | learn_kwargs = learn_args[alg]
40 | learn_kwargs.update(common_kwargs)
41 |
42 | learn = get_learn_function(alg)
43 | learn_fn = lambda e: learn(env=e, **learn_kwargs)
44 | env_fn = lambda: MnistEnv(episode_len=100)
45 |
46 | simple_test(env_fn, learn_fn, 0.6)
47 |
48 | if __name__ == '__main__':
49 | test_mnist('acer')
50 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_plot_util.py:
--------------------------------------------------------------------------------
1 | # smoke tests of plot_util
2 | from baselines.common import plot_util as pu
3 | from baselines.common.tests.util import smoketest
4 |
5 |
6 | def test_plot_util():
7 | nruns = 4
8 | logdirs = [smoketest('--alg=ppo2 --env=CartPole-v0 --num_timesteps=10000') for _ in range(nruns)]
9 | data = pu.load_results(logdirs)
10 | assert len(data) == 4
11 |
12 | _, axes = pu.plot_results(data[:1]); assert len(axes) == 1
13 | _, axes = pu.plot_results(data, tiling='vertical'); assert axes.shape==(4,1)
14 | _, axes = pu.plot_results(data, tiling='horizontal'); assert axes.shape==(1,4)
15 | _, axes = pu.plot_results(data, tiling='symmetric'); assert axes.shape==(2,2)
16 | _, axes = pu.plot_results(data, split_fn=lambda _: ''); assert len(axes) == 1
17 |
18 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_schedules.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule
4 |
5 |
6 | def test_piecewise_schedule():
7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500)
8 |
9 | assert np.isclose(ps.value(-10), 500)
10 | assert np.isclose(ps.value(0), 150)
11 | assert np.isclose(ps.value(5), 200)
12 | assert np.isclose(ps.value(9), 80)
13 | assert np.isclose(ps.value(50), 50)
14 | assert np.isclose(ps.value(80), 50)
15 | assert np.isclose(ps.value(150), 0)
16 | assert np.isclose(ps.value(175), -25)
17 | assert np.isclose(ps.value(201), 500)
18 | assert np.isclose(ps.value(500), 500)
19 |
20 | assert np.isclose(ps.value(200 - 1e-10), -50)
21 |
22 |
23 | def test_constant_schedule():
24 | cs = ConstantSchedule(5)
25 | for i in range(-100, 100):
26 | assert np.isclose(cs.value(i), 5)
27 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_segment_tree.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree
4 |
5 |
6 | def test_tree_set():
7 | tree = SumSegmentTree(4)
8 |
9 | tree[2] = 1.0
10 | tree[3] = 3.0
11 |
12 | assert np.isclose(tree.sum(), 4.0)
13 | assert np.isclose(tree.sum(0, 2), 0.0)
14 | assert np.isclose(tree.sum(0, 3), 1.0)
15 | assert np.isclose(tree.sum(2, 3), 1.0)
16 | assert np.isclose(tree.sum(2, -1), 1.0)
17 | assert np.isclose(tree.sum(2, 4), 4.0)
18 |
19 |
20 | def test_tree_set_overlap():
21 | tree = SumSegmentTree(4)
22 |
23 | tree[2] = 1.0
24 | tree[2] = 3.0
25 |
26 | assert np.isclose(tree.sum(), 3.0)
27 | assert np.isclose(tree.sum(2, 3), 3.0)
28 | assert np.isclose(tree.sum(2, -1), 3.0)
29 | assert np.isclose(tree.sum(2, 4), 3.0)
30 | assert np.isclose(tree.sum(1, 2), 0.0)
31 |
32 |
33 | def test_prefixsum_idx():
34 | tree = SumSegmentTree(4)
35 |
36 | tree[2] = 1.0
37 | tree[3] = 3.0
38 |
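    # With leaves [0, 0, 1, 3] the cumulative sums are 0, 0, 1, 4, so
    # find_prefixsum_idx returns the leaf at which the running sum first
    # exceeds the queried prefix sum (clamped to the last leaf).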
39 | assert tree.find_prefixsum_idx(0.0) == 2
40 | assert tree.find_prefixsum_idx(0.5) == 2
41 | assert tree.find_prefixsum_idx(0.99) == 2
42 | assert tree.find_prefixsum_idx(1.01) == 3
43 | assert tree.find_prefixsum_idx(3.00) == 3
44 | assert tree.find_prefixsum_idx(4.00) == 3
45 |
46 |
47 | def test_prefixsum_idx2():
48 | tree = SumSegmentTree(4)
49 |
50 | tree[0] = 0.5
51 | tree[1] = 1.0
52 | tree[2] = 1.0
53 | tree[3] = 3.0
54 |
55 | assert tree.find_prefixsum_idx(0.00) == 0
56 | assert tree.find_prefixsum_idx(0.55) == 1
57 | assert tree.find_prefixsum_idx(0.99) == 1
58 | assert tree.find_prefixsum_idx(1.51) == 2
59 | assert tree.find_prefixsum_idx(3.00) == 3
60 | assert tree.find_prefixsum_idx(5.50) == 3
61 |
62 |
63 | def test_max_interval_tree():
64 | tree = MinSegmentTree(4)
65 |
66 | tree[0] = 1.0
67 | tree[2] = 0.5
68 | tree[3] = 3.0
69 |
70 | assert np.isclose(tree.min(), 0.5)
71 | assert np.isclose(tree.min(0, 2), 1.0)
72 | assert np.isclose(tree.min(0, 3), 0.5)
73 | assert np.isclose(tree.min(0, -1), 0.5)
74 | assert np.isclose(tree.min(2, 4), 0.5)
75 | assert np.isclose(tree.min(3, 4), 3.0)
76 |
77 | tree[2] = 0.7
78 |
79 | assert np.isclose(tree.min(), 0.7)
80 | assert np.isclose(tree.min(0, 2), 1.0)
81 | assert np.isclose(tree.min(0, 3), 0.7)
82 | assert np.isclose(tree.min(0, -1), 0.7)
83 | assert np.isclose(tree.min(2, 4), 0.7)
84 | assert np.isclose(tree.min(3, 4), 3.0)
85 |
86 | tree[2] = 4.0
87 |
88 | assert np.isclose(tree.min(), 1.0)
89 | assert np.isclose(tree.min(0, 2), 1.0)
90 | assert np.isclose(tree.min(0, 3), 1.0)
91 | assert np.isclose(tree.min(0, -1), 1.0)
92 | assert np.isclose(tree.min(2, 4), 3.0)
93 | assert np.isclose(tree.min(2, 3), 4.0)
94 | assert np.isclose(tree.min(2, -1), 4.0)
95 | assert np.isclose(tree.min(3, 4), 3.0)
96 |
97 |
98 | if __name__ == '__main__':
99 | test_tree_set()
100 | test_tree_set_overlap()
101 | test_prefixsum_idx()
102 | test_prefixsum_idx2()
103 | test_max_interval_tree()
104 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_tf_util.py:
--------------------------------------------------------------------------------
1 | # tests for tf_util
2 | import tensorflow as tf
3 | from baselines.common.tf_util import (
4 | function,
5 | initialize,
6 | single_threaded_session
7 | )
8 |
9 |
10 | def test_function():
11 | with tf.Graph().as_default():
12 | x = tf.placeholder(tf.int32, (), name="x")
13 | y = tf.placeholder(tf.int32, (), name="y")
14 | z = 3 * x + 2 * y
15 | lin = function([x, y], z, givens={y: 0})
16 |
17 | with single_threaded_session():
18 | initialize()
19 |
20 | assert lin(2) == 6
21 | assert lin(x=3) == 9
22 | assert lin(2, 2) == 10
23 | assert lin(x=2, y=3) == 12
24 |
25 |
26 | def test_multikwargs():
27 | with tf.Graph().as_default():
28 | x = tf.placeholder(tf.int32, (), name="x")
29 | with tf.variable_scope("other"):
30 | x2 = tf.placeholder(tf.int32, (), name="x")
31 | z = 3 * x + 2 * x2
32 |
33 | lin = function([x, x2], z, givens={x2: 0})
34 | with single_threaded_session():
35 | initialize()
36 | assert lin(2) == 6
37 | assert lin(2, 2) == 10
38 |
39 |
40 | if __name__ == '__main__':
41 | test_function()
42 | test_multikwargs()
43 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_with_mpi.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import subprocess
4 | import cloudpickle
5 | import base64
6 | import pytest
7 | from functools import wraps
8 |
9 | try:
10 | from mpi4py import MPI
11 | except ImportError:
12 | MPI = None
13 |
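# with_mpi serializes the wrapped test with cloudpickle and re-runs it under
# `mpiexec -n nproc` through this module's __main__ entry point below; with
# skip_if_no_mpi set, the test is skipped when mpi4py is unavailable.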
14 | def with_mpi(nproc=2, timeout=30, skip_if_no_mpi=True):
15 | def outer_thunk(fn):
16 | @wraps(fn)
17 | def thunk(*args, **kwargs):
18 | serialized_fn = base64.b64encode(cloudpickle.dumps(lambda: fn(*args, **kwargs)))
19 | subprocess.check_call([
20 | 'mpiexec','-n', str(nproc),
21 | sys.executable,
22 | '-m', 'baselines.common.tests.test_with_mpi',
23 | serialized_fn
24 | ], env=os.environ, timeout=timeout)
25 |
26 | if skip_if_no_mpi:
27 | return pytest.mark.skipif(MPI is None, reason="MPI not present")(thunk)
28 | else:
29 | return thunk
30 |
31 | return outer_thunk
32 |
33 |
34 | if __name__ == '__main__':
35 | if len(sys.argv) > 1:
36 | fn = cloudpickle.loads(base64.b64decode(sys.argv[1]))
37 | assert callable(fn)
38 | fn()
39 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tile_images.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def tile_images(img_nhwc):
4 | """
5 |     Tile N images into one big PxQ image.
6 | (P,Q) are chosen to be as close as possible, and if N
7 | is square, then P=Q.
8 |
9 | input: img_nhwc, list or array of images, ndim=4 once turned into array
10 | n = batch index, h = height, w = width, c = channel
11 | returns:
12 | bigim_HWc, ndarray with ndim=3
13 | """
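    # e.g. N=10 images yield H=4, W=3: the batch is padded with 2 blank images
    # and tiled into an output of shape (4*h, 3*w, c)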
14 | img_nhwc = np.asarray(img_nhwc)
15 | N, h, w, c = img_nhwc.shape
16 | H = int(np.ceil(np.sqrt(N)))
17 | W = int(np.ceil(float(N)/H))
18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)])
19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c)
20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4)
21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c)
22 | return img_Hh_Ww_c
23 |
24 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/__init__.py:
--------------------------------------------------------------------------------
1 | from .vec_env import AlreadySteppingError, NotSteppingError, VecEnv, VecEnvWrapper, VecEnvObservationWrapper, CloudpickleWrapper
2 | from .dummy_vec_env import DummyVecEnv
3 | from .shmem_vec_env import ShmemVecEnv
4 | from .subproc_vec_env import SubprocVecEnv
5 | from .vec_frame_stack import VecFrameStack
6 | from .vec_monitor import VecMonitor
7 | from .vec_normalize import VecNormalize
8 | from .vec_remove_dict_obs import VecExtractDictObs
9 |
10 | __all__ = ['AlreadySteppingError', 'NotSteppingError', 'VecEnv', 'VecEnvWrapper', 'VecEnvObservationWrapper', 'CloudpickleWrapper', 'DummyVecEnv', 'ShmemVecEnv', 'SubprocVecEnv', 'VecFrameStack', 'VecMonitor', 'VecNormalize', 'VecExtractDictObs']
11 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/dummy_vec_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from .vec_env import VecEnv
3 | from .util import copy_obs_dict, dict_to_obs, obs_space_info
4 |
5 | class DummyVecEnv(VecEnv):
6 | """
7 |     VecEnv that runs multiple environments sequentially, that is,
8 |     the step and reset commands are sent to one environment at a time.
9 | Useful when debugging and when num_env == 1 (in the latter case,
10 | avoids communication overhead)
11 | """
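    # Typical usage (sketch): DummyVecEnv([lambda: gym.make('CartPole-v0')])
    # steps a single environment in-process, with no worker communication.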
12 | def __init__(self, env_fns):
13 | """
14 | Arguments:
15 |
16 |         env_fns: iterable of callables that build environments
17 | """
18 | self.envs = [fn() for fn in env_fns]
19 | env = self.envs[0]
20 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space)
21 | obs_space = env.observation_space
22 | self.keys, shapes, dtypes = obs_space_info(obs_space)
23 |
24 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys }
25 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool)
26 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32)
27 | self.buf_infos = [{} for _ in range(self.num_envs)]
28 | self.actions = None
29 | self.spec = self.envs[0].spec
30 |
31 | def step_async(self, actions):
32 | listify = True
33 | try:
34 | if len(actions) == self.num_envs:
35 | listify = False
36 | except TypeError:
37 | pass
38 |
39 | if not listify:
40 | self.actions = actions
41 | else:
42 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs)
43 | self.actions = [actions]
44 |
45 | def step_wait(self):
46 | for e in range(self.num_envs):
47 | action = self.actions[e]
48 | # if isinstance(self.envs[e].action_space, spaces.Discrete):
49 | # action = int(action)
50 |
51 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action)
52 | if self.buf_dones[e]:
53 | obs = self.envs[e].reset()
54 | self._save_obs(e, obs)
55 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones),
56 | self.buf_infos.copy())
57 |
58 | def reset(self):
59 | for e in range(self.num_envs):
60 | obs = self.envs[e].reset()
61 | self._save_obs(e, obs)
62 | return self._obs_from_buf()
63 |
64 | def _save_obs(self, e, obs):
65 | for k in self.keys:
66 | if k is None:
67 | self.buf_obs[k][e] = obs
68 | else:
69 | self.buf_obs[k][e] = obs[k]
70 |
71 | def _obs_from_buf(self):
72 | return dict_to_obs(copy_obs_dict(self.buf_obs))
73 |
74 | def get_images(self):
75 | return [env.render(mode='rgb_array') for env in self.envs]
76 |
77 | def render(self, mode='human'):
78 | if self.num_envs == 1:
79 | return self.envs[0].render(mode=mode)
80 | else:
81 | return super().render(mode=mode)
82 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/test_video_recorder.py:
--------------------------------------------------------------------------------
1 | """
2 | Tests for asynchronous vectorized environments.
3 | """
4 |
5 | import gym
6 | import pytest
7 | import os
8 | import glob
9 | import tempfile
10 |
11 | from .dummy_vec_env import DummyVecEnv
12 | from .shmem_vec_env import ShmemVecEnv
13 | from .subproc_vec_env import SubprocVecEnv
14 | from .vec_video_recorder import VecVideoRecorder
15 |
16 | @pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv))
17 | @pytest.mark.parametrize('num_envs', (1, 4))
18 | @pytest.mark.parametrize('video_length', (10, 100))
19 | @pytest.mark.parametrize('video_interval', (1, 50))
20 | def test_video_recorder(klass, num_envs, video_length, video_interval):
21 | """
22 |     Wrap an existing VecEnv with VecVideoRecorder,
23 |     make (video_interval + video_length + 1) steps,
24 |     then check that the video files are present.
25 | """
26 |
27 | def make_fn():
28 | env = gym.make('PongNoFrameskip-v4')
29 | return env
30 | fns = [make_fn for _ in range(num_envs)]
31 | env = klass(fns)
32 |
33 | with tempfile.TemporaryDirectory() as video_path:
34 | env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length)
35 |
36 | env.reset()
37 | for _ in range(video_interval + video_length + 1):
38 | env.step([0] * num_envs)
39 | env.close()
40 |
41 |
42 | recorded_video = glob.glob(os.path.join(video_path, "*.mp4"))
43 |
44 |         # two videos should have been recorded by now
45 | assert len(recorded_video) == 2
46 | # Files are not empty
47 | assert all(os.stat(p).st_size != 0 for p in recorded_video)
48 |
49 |
50 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/util.py:
--------------------------------------------------------------------------------
1 | """
2 | Helpers for dealing with vectorized environments.
3 | """
4 |
5 | from collections import OrderedDict
6 |
7 | import gym
8 | import numpy as np
9 |
10 |
11 | def copy_obs_dict(obs):
12 | """
13 | Deep-copy an observation dict.
14 | """
15 | return {k: np.copy(v) for k, v in obs.items()}
16 |
17 |
18 | def dict_to_obs(obs_dict):
19 | """
20 | Convert an observation dict into a raw array if the
21 | original observation space was not a Dict space.
22 | """
23 | if set(obs_dict.keys()) == {None}:
24 | return obs_dict[None]
25 | return obs_dict
26 |
27 |
28 | def obs_space_info(obs_space):
29 | """
30 | Get dict-structured information about a gym.Space.
31 |
32 | Returns:
33 | A tuple (keys, shapes, dtypes):
34 | keys: a list of dict keys.
35 | shapes: a dict mapping keys to shapes.
36 | dtypes: a dict mapping keys to dtypes.
37 | """
38 | if isinstance(obs_space, gym.spaces.Dict):
39 | assert isinstance(obs_space.spaces, OrderedDict)
40 | subspaces = obs_space.spaces
41 | elif isinstance(obs_space, gym.spaces.Tuple):
42 | assert isinstance(obs_space.spaces, tuple)
43 | subspaces = {i: obs_space.spaces[i] for i in range(len(obs_space.spaces))}
44 | else:
45 | subspaces = {None: obs_space}
46 | keys = []
47 | shapes = {}
48 | dtypes = {}
49 | for key, box in subspaces.items():
50 | keys.append(key)
51 | shapes[key] = box.shape
52 | dtypes[key] = box.dtype
53 | return keys, shapes, dtypes
54 |
55 |
56 | def obs_to_dict(obs):
57 | """
58 | Convert an observation into a dict.
59 | """
60 | if isinstance(obs, dict):
61 | return obs
62 | return {None: obs}
63 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_frame_stack.py:
--------------------------------------------------------------------------------
1 | from .vec_env import VecEnvWrapper
2 | import numpy as np
3 | from gym import spaces
4 |
5 |
6 | class VecFrameStack(VecEnvWrapper):
7 | def __init__(self, venv, nstack):
8 | self.venv = venv
9 | self.nstack = nstack
10 | wos = venv.observation_space # wrapped ob space
11 | low = np.repeat(wos.low, self.nstack, axis=-1)
12 | high = np.repeat(wos.high, self.nstack, axis=-1)
13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype)
14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype)
15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space)
16 |
17 | def step_wait(self):
18 | obs, rews, news, infos = self.venv.step_wait()
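        # shift every env's frame stack left by one slot, zero the stacks of
        # envs that just finished an episode, and write the newest frames into
        # the trailing channel slots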
19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1)
20 | for (i, new) in enumerate(news):
21 | if new:
22 | self.stackedobs[i] = 0
23 | self.stackedobs[..., -obs.shape[-1]:] = obs
24 | return self.stackedobs, rews, news, infos
25 |
26 | def reset(self):
27 | obs = self.venv.reset()
28 | self.stackedobs[...] = 0
29 | self.stackedobs[..., -obs.shape[-1]:] = obs
30 | return self.stackedobs
31 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_monitor.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | from baselines.bench.monitor import ResultsWriter
3 | import numpy as np
4 | import time
5 | from collections import deque
6 |
7 | class VecMonitor(VecEnvWrapper):
8 | def __init__(self, venv, filename=None, keep_buf=0, info_keywords=()):
9 | VecEnvWrapper.__init__(self, venv)
10 | self.eprets = None
11 | self.eplens = None
12 | self.epcount = 0
13 | self.tstart = time.time()
14 | if filename:
15 | self.results_writer = ResultsWriter(filename, header={'t_start': self.tstart},
16 | extra_keys=info_keywords)
17 | else:
18 | self.results_writer = None
19 | self.info_keywords = info_keywords
20 | self.keep_buf = keep_buf
21 | if self.keep_buf:
22 | self.epret_buf = deque([], maxlen=keep_buf)
23 | self.eplen_buf = deque([], maxlen=keep_buf)
24 |
25 | def reset(self):
26 | obs = self.venv.reset()
27 | self.eprets = np.zeros(self.num_envs, 'f')
28 | self.eplens = np.zeros(self.num_envs, 'i')
29 | return obs
30 |
31 | def step_wait(self):
32 | obs, rews, dones, infos = self.venv.step_wait()
33 | self.eprets += rews
34 | self.eplens += 1
35 |
36 | newinfos = list(infos[:])
37 | for i in range(len(dones)):
38 | if dones[i]:
39 | info = infos[i].copy()
40 | ret = self.eprets[i]
41 | eplen = self.eplens[i]
42 | epinfo = {'r': ret, 'l': eplen, 't': round(time.time() - self.tstart, 6)}
43 | for k in self.info_keywords:
44 | epinfo[k] = info[k]
45 | info['episode'] = epinfo
46 | if self.keep_buf:
47 | self.epret_buf.append(ret)
48 | self.eplen_buf.append(eplen)
49 | self.epcount += 1
50 | self.eprets[i] = 0
51 | self.eplens[i] = 0
52 | if self.results_writer:
53 | self.results_writer.write_row(epinfo)
54 | newinfos[i] = info
55 | return obs, rews, dones, newinfos
56 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_normalize.py:
--------------------------------------------------------------------------------
1 | from . import VecEnvWrapper
2 | import numpy as np
3 |
4 | class VecNormalize(VecEnvWrapper):
5 | """
6 | A vectorized wrapper that normalizes the observations
7 | and returns from an environment.
8 | """
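    # Observations are normalized with a running mean/std (see _obfilt); rewards
    # are divided by the running std of the discounted return and clipped.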
9 |
10 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8, use_tf=False):
11 | VecEnvWrapper.__init__(self, venv)
12 | if use_tf:
13 | from baselines.common.running_mean_std import TfRunningMeanStd
14 | self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms') if ob else None
15 | self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None
16 | else:
17 | from baselines.common.running_mean_std import RunningMeanStd
18 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None
19 | self.ret_rms = RunningMeanStd(shape=()) if ret else None
20 | self.clipob = clipob
21 | self.cliprew = cliprew
22 | self.ret = np.zeros(self.num_envs)
23 | self.gamma = gamma
24 | self.epsilon = epsilon
25 |
26 | def step_wait(self):
27 | obs, rews, news, infos = self.venv.step_wait()
28 |
29 | # ADDED:
30 | for rollout in range(len(rews)):
31 | infos[rollout]["raw_reward"] = rews[rollout]
32 |
33 | self.ret = self.ret * self.gamma + rews
34 | obs = self._obfilt(obs)
35 | if self.ret_rms:
36 | self.ret_rms.update(self.ret)
37 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew)
38 | self.ret[news] = 0.
39 | return obs, rews, news, infos
40 |
41 | def _obfilt(self, obs):
42 | if self.ob_rms:
43 | self.ob_rms.update(obs)
44 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob)
45 | return obs
46 | else:
47 | return obs
48 |
49 | def reset(self):
50 | self.ret = np.zeros(self.num_envs)
51 | obs = self.venv.reset()
52 | return self._obfilt(obs)
53 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_remove_dict_obs.py:
--------------------------------------------------------------------------------
1 | from .vec_env import VecEnvObservationWrapper
2 |
3 | class VecExtractDictObs(VecEnvObservationWrapper):
4 | def __init__(self, venv, key):
5 | self.key = key
6 | super().__init__(venv=venv,
7 | observation_space=venv.observation_space.spaces[self.key])
8 |
9 | def process(self, obs):
10 | return obs[self.key]
11 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_video_recorder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from baselines import logger
3 | from baselines.common.vec_env import VecEnvWrapper
4 | from gym.wrappers.monitoring import video_recorder
5 |
6 |
7 | class VecVideoRecorder(VecEnvWrapper):
8 | """
9 | Wrap VecEnv to record rendered image as mp4 video.
10 | """
11 |
12 | def __init__(self, venv, directory, record_video_trigger, video_length=200):
13 | """
14 | # Arguments
15 | venv: VecEnv to wrap
16 | directory: Where to save videos
17 | record_video_trigger:
18 | Function that defines when to start recording.
19 |             The function takes the current step number
20 |             and returns whether we should start recording or not.
21 | video_length: Length of recorded video
22 | """
23 |
24 | VecEnvWrapper.__init__(self, venv)
25 | self.record_video_trigger = record_video_trigger
26 | self.video_recorder = None
27 |
28 | self.directory = os.path.abspath(directory)
29 | if not os.path.exists(self.directory): os.mkdir(self.directory)
30 |
31 | self.file_prefix = "vecenv"
32 | self.file_infix = '{}'.format(os.getpid())
33 | self.step_id = 0
34 | self.video_length = video_length
35 |
36 | self.recording = False
37 | self.recorded_frames = 0
38 |
39 | def reset(self):
40 | obs = self.venv.reset()
41 |
42 | self.start_video_recorder()
43 |
44 | return obs
45 |
46 | def start_video_recorder(self):
47 | self.close_video_recorder()
48 |
49 | base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id))
50 | self.video_recorder = video_recorder.VideoRecorder(
51 | env=self.venv,
52 | base_path=base_path,
53 | metadata={'step_id': self.step_id}
54 | )
55 |
56 | self.video_recorder.capture_frame()
57 | self.recorded_frames = 1
58 | self.recording = True
59 |
60 | def _video_enabled(self):
61 | return self.record_video_trigger(self.step_id)
62 |
63 | def step_wait(self):
64 | obs, rews, dones, infos = self.venv.step_wait()
65 |
66 | self.step_id += 1
67 | if self.recording:
68 | self.video_recorder.capture_frame()
69 | self.recorded_frames += 1
70 | if self.recorded_frames > self.video_length:
71 | logger.info("Saving video to ", self.video_recorder.path)
72 | self.close_video_recorder()
73 | elif self._video_enabled():
74 | self.start_video_recorder()
75 |
76 | return obs, rews, dones, infos
77 |
78 | def close_video_recorder(self):
79 | if self.recording:
80 | self.video_recorder.close()
81 | self.recording = False
82 | self.recorded_frames = 0
83 |
84 | def close(self):
85 | VecEnvWrapper.close(self)
86 | self.close_video_recorder()
87 |
88 | def __del__(self):
89 | self.close()
90 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/wrappers.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | class TimeLimit(gym.Wrapper):
4 | def __init__(self, env, max_episode_steps=None):
5 | super(TimeLimit, self).__init__(env)
6 | self._max_episode_steps = max_episode_steps
7 | self._elapsed_steps = 0
8 |
9 | def step(self, ac):
10 | observation, reward, done, info = self.env.step(ac)
11 | self._elapsed_steps += 1
12 | if self._elapsed_steps >= self._max_episode_steps:
13 | done = True
14 | info['TimeLimit.truncated'] = True
15 | return observation, reward, done, info
16 |
17 | def reset(self, **kwargs):
18 | self._elapsed_steps = 0
19 | return self.env.reset(**kwargs)
20 |
21 | class ClipActionsWrapper(gym.Wrapper):
22 | def step(self, action):
23 | import numpy as np
24 | action = np.nan_to_num(action)
25 | action = np.clip(action, self.action_space.low, self.action_space.high)
26 | return self.env.step(action)
27 |
28 | def reset(self, **kwargs):
29 | return self.env.reset(**kwargs)
30 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/README.md:
--------------------------------------------------------------------------------
1 | # DDPG
2 |
3 | - Original paper: https://arxiv.org/abs/1509.02971
4 | - Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/
5 | - `python -m baselines.run --alg=ddpg --env=HalfCheetah-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a MuJoCo environment. See help (`-h`) for more options.
6 |
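As with DQN (see the deepq README), the generic `baselines.run` flags can be used to save and replay a trained policy; a rough sketch (paths are illustrative):

```bash
# train, then save the model
python -m baselines.run --alg=ddpg --env=HalfCheetah-v2 --num_timesteps=1e6 --save_path=./ddpg_halfcheetah
# reload the saved model and watch it play
python -m baselines.run --alg=ddpg --env=HalfCheetah-v2 --num_timesteps=0 --load_path=./ddpg_halfcheetah --play
```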
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/memory.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class RingBuffer(object):
5 | def __init__(self, maxlen, shape, dtype='float32'):
6 | self.maxlen = maxlen
7 | self.start = 0
8 | self.length = 0
9 | self.data = np.zeros((maxlen,) + shape).astype(dtype)
10 |
11 | def __len__(self):
12 | return self.length
13 |
14 | def __getitem__(self, idx):
15 | if idx < 0 or idx >= self.length:
16 | raise KeyError()
17 | return self.data[(self.start + idx) % self.maxlen]
18 |
19 | def get_batch(self, idxs):
20 | return self.data[(self.start + idxs) % self.maxlen]
21 |
22 | def append(self, v):
23 | if self.length < self.maxlen:
24 | # We have space, simply increase the length.
25 | self.length += 1
26 | elif self.length == self.maxlen:
27 | # No space, "remove" the first item.
28 | self.start = (self.start + 1) % self.maxlen
29 | else:
30 | # This should never happen.
31 | raise RuntimeError()
32 | self.data[(self.start + self.length - 1) % self.maxlen] = v
33 |
34 |
35 | def array_min2d(x):
36 | x = np.array(x)
37 | if x.ndim >= 2:
38 | return x
39 | return x.reshape(-1, 1)
40 |
41 |
42 | class Memory(object):
43 | def __init__(self, limit, action_shape, observation_shape):
44 | self.limit = limit
45 |
46 | self.observations0 = RingBuffer(limit, shape=observation_shape)
47 | self.actions = RingBuffer(limit, shape=action_shape)
48 | self.rewards = RingBuffer(limit, shape=(1,))
49 | self.terminals1 = RingBuffer(limit, shape=(1,))
50 | self.observations1 = RingBuffer(limit, shape=observation_shape)
51 |
52 | def sample(self, batch_size):
53 |         # Draw indices such that each sampled transition has a subsequent element.
54 | batch_idxs = np.random.randint(self.nb_entries - 2, size=batch_size)
55 |
56 | obs0_batch = self.observations0.get_batch(batch_idxs)
57 | obs1_batch = self.observations1.get_batch(batch_idxs)
58 | action_batch = self.actions.get_batch(batch_idxs)
59 | reward_batch = self.rewards.get_batch(batch_idxs)
60 | terminal1_batch = self.terminals1.get_batch(batch_idxs)
61 |
62 | result = {
63 | 'obs0': array_min2d(obs0_batch),
64 | 'obs1': array_min2d(obs1_batch),
65 | 'rewards': array_min2d(reward_batch),
66 | 'actions': array_min2d(action_batch),
67 | 'terminals1': array_min2d(terminal1_batch),
68 | }
69 | return result
70 |
71 | def append(self, obs0, action, reward, obs1, terminal1, training=True):
72 | if not training:
73 | return
74 |
75 | self.observations0.append(obs0)
76 | self.actions.append(action)
77 | self.rewards.append(reward)
78 | self.observations1.append(obs1)
79 | self.terminals1.append(terminal1)
80 |
81 | @property
82 | def nb_entries(self):
83 | return len(self.observations0)
84 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/models.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from baselines.common.models import get_network_builder
3 |
4 |
5 | class Model(object):
6 | def __init__(self, name, network='mlp', **network_kwargs):
7 | self.name = name
8 | self.network_builder = get_network_builder(network)(**network_kwargs)
9 |
10 | @property
11 | def vars(self):
12 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name)
13 |
14 | @property
15 | def trainable_vars(self):
16 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name)
17 |
18 | @property
19 | def perturbable_vars(self):
20 | return [var for var in self.trainable_vars if 'LayerNorm' not in var.name]
21 |
22 |
23 | class Actor(Model):
24 | def __init__(self, nb_actions, name='actor', network='mlp', **network_kwargs):
25 | super().__init__(name=name, network=network, **network_kwargs)
26 | self.nb_actions = nb_actions
27 |
28 | def __call__(self, obs, reuse=False):
29 | with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
30 | x = self.network_builder(obs)
31 | x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3))
32 | x = tf.nn.tanh(x)
33 | return x
34 |
35 |
36 | class Critic(Model):
37 | def __init__(self, name='critic', network='mlp', **network_kwargs):
38 | super().__init__(name=name, network=network, **network_kwargs)
39 | self.layer_norm = True
40 |
41 | def __call__(self, obs, action, reuse=False):
42 | with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE):
43 | x = tf.concat([obs, action], axis=-1) # this assumes observation and action can be concatenated
44 | x = self.network_builder(x)
45 | x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3), name='output')
46 | return x
47 |
48 | @property
49 | def output_vars(self):
50 | output_vars = [var for var in self.trainable_vars if 'output' in var.name]
51 | return output_vars
52 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/noise.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class AdaptiveParamNoiseSpec(object):
5 | def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01):
6 | self.initial_stddev = initial_stddev
7 | self.desired_action_stddev = desired_action_stddev
8 | self.adoption_coefficient = adoption_coefficient
9 |
10 | self.current_stddev = initial_stddev
11 |
12 | def adapt(self, distance):
13 | if distance > self.desired_action_stddev:
14 | # Decrease stddev.
15 | self.current_stddev /= self.adoption_coefficient
16 | else:
17 | # Increase stddev.
18 | self.current_stddev *= self.adoption_coefficient
19 |
20 | def get_stats(self):
21 | stats = {
22 | 'param_noise_stddev': self.current_stddev,
23 | }
24 | return stats
25 |
26 | def __repr__(self):
27 | fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})'
28 | return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient)
29 |
30 |
31 | class ActionNoise(object):
32 | def reset(self):
33 | pass
34 |
35 |
36 | class NormalActionNoise(ActionNoise):
37 | def __init__(self, mu, sigma):
38 | self.mu = mu
39 | self.sigma = sigma
40 |
41 | def __call__(self):
42 | return np.random.normal(self.mu, self.sigma)
43 |
44 | def __repr__(self):
45 | return 'NormalActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)
46 |
47 |
48 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab
49 | class OrnsteinUhlenbeckActionNoise(ActionNoise):
50 | def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None):
51 | self.theta = theta
52 | self.mu = mu
53 | self.sigma = sigma
54 | self.dt = dt
55 | self.x0 = x0
56 | self.reset()
57 |
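    # Each call takes one Euler-Maruyama step of the OU process:
    # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)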
58 | def __call__(self):
59 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
60 | self.x_prev = x
61 | return x
62 |
63 | def reset(self):
64 | self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)
65 |
66 | def __repr__(self):
67 | return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma)
68 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/test_smoke.py:
--------------------------------------------------------------------------------
1 | from baselines.common.tests.util import smoketest
2 | def _run(argstr):
3 | smoketest('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr)
4 |
5 | def test_popart():
6 | _run('--normalize_returns=True --popart=True')
7 |
8 | def test_noise_normal():
9 | _run('--noise_type=normal_0.1')
10 |
11 | def test_noise_ou():
12 | _run('--noise_type=ou_0.1')
13 |
14 | def test_noise_adaptive():
15 | _run('--noise_type=adaptive-param_0.2,normal_0.1')
16 |
17 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/README.md:
--------------------------------------------------------------------------------
1 | ## If you are curious.
2 |
3 | ##### Train a Cartpole agent and watch it play once it converges!
4 |
5 | Here's a list of commands to run to quickly get a working example:
6 |
7 |
8 |
9 |
10 | ```bash
11 | # Train model and save the results to cartpole_model.pkl
12 | python -m baselines.run --alg=deepq --env=CartPole-v0 --save_path=./cartpole_model.pkl --num_timesteps=1e5
13 | # Load the model saved in cartpole_model.pkl and visualize the learned policy
14 | python -m baselines.run --alg=deepq --env=CartPole-v0 --load_path=./cartpole_model.pkl --num_timesteps=0 --play
15 | ```
16 |
17 | ## If you wish to apply DQN to solve a problem.
18 |
19 | Check out our simple agent trained with the one-stop-shop `deepq.learn` function.
20 |
21 | - [baselines/deepq/experiments/train_cartpole.py](experiments/train_cartpole.py) - train a Cartpole agent.
22 |
23 | In particular, notice that once `deepq.learn` finishes training it returns an `act` function which can be used to select actions in the environment. Once trained, you can easily save it and load it at a later time. The complementary file `enjoy_cartpole.py` loads and visualizes the learned policy.
24 |
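A minimal sketch of that save/load workflow, condensed from `train_cartpole.py` and `enjoy_cartpole.py` below (timestep count and file name are illustrative):

```python
import gym
from baselines import deepq

env = gym.make("CartPole-v0")

# training returns an `act` function that maps observations to actions
act = deepq.learn(env, network='mlp', total_timesteps=100000)
act.save("cartpole_model.pkl")

# later: rebuild the policy without training and run it greedily
act = deepq.learn(env, network='mlp', total_timesteps=0, load_path="cartpole_model.pkl")
obs, done = env.reset(), False
while not done:
    obs, _, done, _ = env.step(act(obs[None])[0])
```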
25 | ## If you wish to experiment with the algorithm
26 |
27 | ##### Check out the examples
28 |
29 | - [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine-grained control over the internals of the DQN algorithm.
30 | - [baselines/deepq/defaults.py](defaults.py) - settings for training on Atari. Run
31 |
32 | ```bash
33 | python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4
34 | ```
35 | to train on Atari Pong (see more in repo-wide [README.md](../../README.md#training-models))
36 |
37 |
38 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/__init__.py:
--------------------------------------------------------------------------------
1 | from baselines.deepq import models # noqa
2 | from baselines.deepq.build_graph import build_act, build_train # noqa
3 | from baselines.deepq.deepq import learn, load_act # noqa
4 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa
5 |
6 | def wrap_atari_dqn(env):
7 | from baselines.common.atari_wrappers import wrap_deepmind
8 | return wrap_deepmind(env, frame_stack=True, scale=False)
9 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/defaults.py:
--------------------------------------------------------------------------------
1 | def atari():
2 | return dict(
3 | network='conv_only',
4 | lr=1e-4,
5 | buffer_size=10000,
6 | exploration_fraction=0.1,
7 | exploration_final_eps=0.01,
8 | train_freq=4,
9 | learning_starts=10000,
10 | target_network_update_freq=1000,
11 | gamma=0.99,
12 | prioritized_replay=True,
13 | prioritized_replay_alpha=0.6,
14 | checkpoint_freq=10000,
15 | checkpoint_path=None,
16 | dueling=True
17 | )
18 |
19 | def retro():
20 | return atari()
21 |
22 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/enjoy_cartpole.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def main():
7 | env = gym.make("CartPole-v0")
8 | act = deepq.learn(env, network='mlp', total_timesteps=0, load_path="cartpole_model.pkl")
9 |
10 | while True:
11 | obs, done = env.reset(), False
12 | episode_rew = 0
13 | while not done:
14 | env.render()
15 | obs, rew, done, _ = env.step(act(obs[None])[0])
16 | episode_rew += rew
17 | print("Episode reward", episode_rew)
18 |
19 |
20 | if __name__ == '__main__':
21 | main()
22 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/enjoy_mountaincar.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 | from baselines.common import models
5 |
6 |
7 | def main():
8 | env = gym.make("MountainCar-v0")
9 | act = deepq.learn(
10 | env,
11 | network=models.mlp(num_layers=1, num_hidden=64),
12 | total_timesteps=0,
13 | load_path='mountaincar_model.pkl'
14 | )
15 |
16 | while True:
17 | obs, done = env.reset(), False
18 | episode_rew = 0
19 | while not done:
20 | env.render()
21 | obs, rew, done, _ = env.step(act(obs[None])[0])
22 | episode_rew += rew
23 | print("Episode reward", episode_rew)
24 |
25 |
26 | if __name__ == '__main__':
27 | main()
28 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/enjoy_pong.py:
--------------------------------------------------------------------------------
1 | import gym
2 | from baselines import deepq
3 |
4 |
5 | def main():
6 | env = gym.make("PongNoFrameskip-v4")
7 | env = deepq.wrap_atari_dqn(env)
8 | model = deepq.learn(
9 | env,
10 | "conv_only",
11 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
12 | hiddens=[256],
13 | dueling=True,
14 | total_timesteps=0
15 | )
16 |
17 | while True:
18 | obs, done = env.reset(), False
19 | episode_rew = 0
20 | while not done:
21 | env.render()
22 | obs, rew, done, _ = env.step(model(obs[None])[0])
23 | episode_rew += rew
24 | print("Episode reward", episode_rew)
25 |
26 |
27 | if __name__ == '__main__':
28 | main()
29 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/train_cartpole.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 |
5 |
6 | def callback(lcl, _glb):
7 | # stop training if reward exceeds 199
8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199
9 | return is_solved
10 |
11 |
12 | def main():
13 | env = gym.make("CartPole-v0")
14 | act = deepq.learn(
15 | env,
16 | network='mlp',
17 | lr=1e-3,
18 | total_timesteps=100000,
19 | buffer_size=50000,
20 | exploration_fraction=0.1,
21 | exploration_final_eps=0.02,
22 | print_freq=10,
23 | callback=callback
24 | )
25 | print("Saving model to cartpole_model.pkl")
26 | act.save("cartpole_model.pkl")
27 |
28 |
29 | if __name__ == '__main__':
30 | main()
31 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/train_mountaincar.py:
--------------------------------------------------------------------------------
1 | import gym
2 |
3 | from baselines import deepq
4 | from baselines.common import models
5 |
6 |
7 | def main():
8 | env = gym.make("MountainCar-v0")
9 |     # Enabling layer_norm here is important for parameter space noise!
10 | act = deepq.learn(
11 | env,
12 | network=models.mlp(num_hidden=64, num_layers=1),
13 | lr=1e-3,
14 | total_timesteps=100000,
15 | buffer_size=50000,
16 | exploration_fraction=0.1,
17 | exploration_final_eps=0.1,
18 | print_freq=10,
19 | param_noise=True
20 | )
21 | print("Saving model to mountaincar_model.pkl")
22 | act.save("mountaincar_model.pkl")
23 |
24 |
25 | if __name__ == '__main__':
26 | main()
27 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/train_pong.py:
--------------------------------------------------------------------------------
1 | from baselines import deepq
2 | from baselines import bench
3 | from baselines import logger
4 | from baselines.common.atari_wrappers import make_atari
5 |
6 |
7 | def main():
8 | logger.configure()
9 | env = make_atari('PongNoFrameskip-v4')
10 | env = bench.Monitor(env, logger.get_dir())
11 | env = deepq.wrap_atari_dqn(env)
12 |
13 | model = deepq.learn(
14 | env,
15 | "conv_only",
16 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
17 | hiddens=[256],
18 | dueling=True,
19 | lr=1e-4,
20 | total_timesteps=int(1e7),
21 | buffer_size=10000,
22 | exploration_fraction=0.1,
23 | exploration_final_eps=0.01,
24 | train_freq=4,
25 | learning_starts=10000,
26 | target_network_update_freq=1000,
27 | gamma=0.99,
28 | )
29 |
30 | model.save('pong_model.pkl')
31 | env.close()
32 |
33 | if __name__ == '__main__':
34 | main()
35 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/models.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import tensorflow.contrib.layers as layers
3 |
4 |
5 | def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs):
6 | if isinstance(network, str):
7 | from baselines.common.models import get_network_builder
8 | network = get_network_builder(network)(**network_kwargs)
9 |
10 | def q_func_builder(input_placeholder, num_actions, scope, reuse=False):
11 | with tf.variable_scope(scope, reuse=reuse):
12 | latent = network(input_placeholder)
13 | if isinstance(latent, tuple):
14 | if latent[1] is not None:
15 | raise NotImplementedError("DQN is not compatible with recurrent policies yet")
16 | latent = latent[0]
17 |
18 | latent = layers.flatten(latent)
19 |
20 | with tf.variable_scope("action_value"):
21 | action_out = latent
22 | for hidden in hiddens:
23 | action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None)
24 | if layer_norm:
25 | action_out = layers.layer_norm(action_out, center=True, scale=True)
26 | action_out = tf.nn.relu(action_out)
27 | action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None)
28 |
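            # dueling head: Q(s, a) = V(s) + (A(s, a) - mean_a A(s, a))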
29 | if dueling:
30 | with tf.variable_scope("state_value"):
31 | state_out = latent
32 | for hidden in hiddens:
33 | state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None)
34 | if layer_norm:
35 | state_out = layers.layer_norm(state_out, center=True, scale=True)
36 | state_out = tf.nn.relu(state_out)
37 | state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None)
38 | action_scores_mean = tf.reduce_mean(action_scores, 1)
39 | action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1)
40 | q_out = state_score + action_scores_centered
41 | else:
42 | q_out = action_scores
43 | return q_out
44 |
45 | return q_func_builder
46 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/utils.py:
--------------------------------------------------------------------------------
1 | from baselines.common.input import observation_input
2 | from baselines.common.tf_util import adjust_shape
3 |
4 | # ================================================================
5 | # Placeholders
6 | # ================================================================
7 |
8 |
9 | class TfInput(object):
10 | def __init__(self, name="(unnamed)"):
11 | """Generalized Tensorflow placeholder. The main differences are:
12 | - possibly uses multiple placeholders internally and returns multiple values
13 |             - can apply light postprocessing to the value fed to the placeholder.
14 | """
15 | self.name = name
16 |
17 | def get(self):
18 | """Return the tf variable(s) representing the possibly postprocessed value
19 | of placeholder(s).
20 | """
21 | raise NotImplementedError
22 |
23 | def make_feed_dict(self, data):
24 |         """Given data, input it to the placeholder(s)."""
25 | raise NotImplementedError
26 |
27 |
28 | class PlaceholderTfInput(TfInput):
29 | def __init__(self, placeholder):
30 | """Wrapper for regular tensorflow placeholder."""
31 | super().__init__(placeholder.name)
32 | self._placeholder = placeholder
33 |
34 | def get(self):
35 | return self._placeholder
36 |
37 | def make_feed_dict(self, data):
38 | return {self._placeholder: adjust_shape(self._placeholder, data)}
39 |
40 |
41 | class ObservationInput(PlaceholderTfInput):
42 | def __init__(self, observation_space, name=None):
43 | """Creates an input placeholder tailored to a specific observation space
44 |
45 | Parameters
46 | ----------
47 |
48 | observation_space:
49 | observation space of the environment. Should be one of the gym.spaces types
50 | name: str
51 | tensorflow name of the underlying placeholder
52 | """
53 | inpt, self.processed_inpt = observation_input(observation_space, name=name)
54 | super().__init__(inpt)
55 |
56 | def get(self):
57 | return self.processed_inpt
58 |
59 |
60 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/README.md:
--------------------------------------------------------------------------------
1 | # Generative Adversarial Imitation Learning (GAIL)
2 |
3 | - Original paper: https://arxiv.org/abs/1606.03476
4 |
5 | For benchmarking results on MuJoCo, see [here](result/gail-result.md).
6 |
7 | ## If you want to train an imitation learning agent
8 |
9 | ### Step 1: Download expert data
10 |
11 | Download the expert data into `./data`, [download link](https://drive.google.com/drive/folders/1h3H4AY_ZBx08hz-Ct0Nxxus-V1melu1U?usp=sharing)
12 |
13 | ### Step 2: Run GAIL
14 |
15 | Run with single rank:
16 |
17 | ```bash
18 | python -m baselines.gail.run_mujoco
19 | ```
20 |
21 | Run with multiple ranks:
22 |
23 | ```bash
24 | mpirun -np 16 python -m baselines.gail.run_mujoco
25 | ```
26 |
27 | See help (`-h`) for more options.
28 |
29 | #### In case you want to run Behavior Cloning (BC)
30 |
31 | ```bash
32 | python -m baselines.gail.behavior_clone
33 | ```
34 |
35 | See help (`-h`) for more options.
36 |
37 |
38 | ## Contributing
39 |
40 | Bug reports and pull requests are welcome on GitHub at https://github.com/openai/baselines/pulls.
41 |
42 | ## Maintainers
43 |
44 | - Yuan-Hong Liao, andrewliao11_at_gmail_dot_com
45 | - Ryan Julian, ryanjulian_at_gmail_dot_com
46 |
47 | ## Others
48 |
49 | Thanks to the following open-source projects:
50 |
51 | - @openai/imitation
52 | - @carpedm20/deep-rl-tensorflow
53 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/dataset/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/dataset/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/mlp_policy.py:
--------------------------------------------------------------------------------
1 | '''
2 | from baselines/ppo1/mlp_policy.py and add simple modification
3 | (1) add reuse argument
4 | (2) cache the `stochastic` placeholder
5 | '''
6 | import tensorflow as tf
7 | import gym
8 |
9 | import baselines.common.tf_util as U
10 | from baselines.common.mpi_running_mean_std import RunningMeanStd
11 | from baselines.common.distributions import make_pdtype
12 | from baselines.acktr.utils import dense
13 |
14 |
15 | class MlpPolicy(object):
16 | recurrent = False
17 |
18 | def __init__(self, name, reuse=False, *args, **kwargs):
19 | with tf.variable_scope(name):
20 | if reuse:
21 | tf.get_variable_scope().reuse_variables()
22 | self._init(*args, **kwargs)
23 | self.scope = tf.get_variable_scope().name
24 |
25 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
26 | assert isinstance(ob_space, gym.spaces.Box)
27 |
28 | self.pdtype = pdtype = make_pdtype(ac_space)
29 | sequence_length = None
30 |
31 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
32 |
33 | with tf.variable_scope("obfilter"):
34 | self.ob_rms = RunningMeanStd(shape=ob_space.shape)
35 |
36 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
37 | last_out = obz
38 | for i in range(num_hid_layers):
39 | last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
40 | self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]
41 |
42 | last_out = obz
43 | for i in range(num_hid_layers):
44 | last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0)))
45 |
46 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
47 | mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01))
48 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
49 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
50 | else:
51 | pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))
52 |
53 | self.pd = pdtype.pdfromflat(pdparam)
54 |
55 | self.state_in = []
56 | self.state_out = []
57 |
58 | # change for BC
59 | stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=())
60 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
61 | self.ac = ac
62 | self._act = U.function([stochastic, ob], [ac, self.vpred])
63 |
64 | def act(self, stochastic, ob):
65 | ac1, vpred1 = self._act(stochastic, ob[None])
66 | return ac1[0], vpred1[0]
67 |
68 | def get_variables(self):
69 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
70 |
71 | def get_trainable_variables(self):
72 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
73 |
74 | def get_initial_state(self):
75 | return []
76 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-normalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-normalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-normalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-normalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/gail-result.md:
--------------------------------------------------------------------------------
1 | # Results of GAIL/BC on Mujoco
2 |
3 | Here are the experimental results of applying GAIL/BC to Mujoco environments, including
4 | Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, and HumanoidStandup-v1. Every imitator is evaluated with the seed set to 0.
5 |
6 | ## Results
7 |
8 | ### Training through iterations
9 |
10 | - Hopper-v1
11 |   ![hopper-training](hopper-training.png)
12 |
13 | - HalfCheetah-v1
14 |   ![halfcheetah-training](halfcheetah-training.png)
15 |
16 | - Walker2d-v1
17 |   ![walker2d-training](walker2d-training.png)
18 |
19 | - Humanoid-v1
20 |   ![humanoid-training](humanoid-training.png)
21 |
22 | - HumanoidStandup-v1
23 |   ![humanoidstandup-training](humanoidstandup-training.png)
24 |
25 | For details (e.g., adversarial loss, discriminator accuracy, etc.) about GAIL training, please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing)
26 |
27 | ### Deterministic Policy (set std=0)
28 | | | Un-normalized | Normalized |
29 | |---|---|---|
30 | | Hopper-v1 | ![](Hopper-unnormalized-deterministic-scores.png) | ![](Hopper-normalized-deterministic-scores.png) |
31 | | HalfCheetah-v1 | ![](HalfCheetah-unnormalized-deterministic-scores.png) | ![](HalfCheetah-normalized-deterministic-scores.png) |
32 | | Walker2d-v1 | ![](Walker2d-unnormalized-deterministic-scores.png) | ![](Walker2d-normalized-deterministic-scores.png) |
33 | | Humanoid-v1 | ![](Humanoid-unnormalized-deterministic-scores.png) | ![](Humanoid-normalized-deterministic-scores.png) |
34 | | HumanoidStandup-v1 | ![](HumanoidStandup-unnormalized-deterministic-scores.png) | ![](HumanoidStandup-normalized-deterministic-scores.png) |
35 |
36 | ### Stochastic Policy
37 | | | Un-normalized | Normalized |
38 | |---|---|---|
39 | | Hopper-v1 | ![](Hopper-unnormalized-stochastic-scores.png) | ![](Hopper-normalized-stochastic-scores.png) |
40 | | HalfCheetah-v1 | ![](HalfCheetah-unnormalized-stochastic-scores.png) | ![](HalfCheetah-normalized-stochastic-scores.png) |
41 | | Walker2d-v1 | ![](Walker2d-unnormalized-stochastic-scores.png) | ![](Walker2d-normalized-stochastic-scores.png) |
42 | | Humanoid-v1 | ![](Humanoid-unnormalized-stochastic-scores.png) | ![](Humanoid-normalized-stochastic-scores.png) |
43 | | HumanoidStandup-v1 | ![](HumanoidStandup-unnormalized-stochastic-scores.png) | ![](HumanoidStandup-normalized-stochastic-scores.png) |
44 |
45 | ### Details about the GAIL imitators
46 |
47 | For all environments, the
48 | imitators are trained with 1, 5, 10, and 50 expert trajectories (each trajectory contains at most
49 | 1024 transitions), using seeds 0, 1, 2, and 3, respectively.
50 |
51 | ### Details about the BC imitators
52 |
53 | All BC imitators are trained with seed 0.
54 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/halfcheetah-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/halfcheetah-training.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/hopper-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/hopper-training.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/humanoid-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/humanoid-training.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/humanoidstandup-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/humanoidstandup-training.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/walker2d-training.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/walker2d-training.png
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/statistics.py:
--------------------------------------------------------------------------------
1 | '''
2 | This code is largely based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py
3 | '''
4 |
5 | import tensorflow as tf
6 | import numpy as np
7 |
8 | import baselines.common.tf_util as U
9 |
10 |
11 | class stats():
12 |
13 | def __init__(self, scalar_keys=[], histogram_keys=[]):
14 | self.scalar_keys = scalar_keys
15 | self.histogram_keys = histogram_keys
16 | self.scalar_summaries = []
17 | self.scalar_summaries_ph = []
18 | self.histogram_summaries_ph = []
19 | self.histogram_summaries = []
20 | with tf.variable_scope('summary'):
21 | for k in scalar_keys:
22 | ph = tf.placeholder('float32', None, name=k+'.scalar.summary')
23 | sm = tf.summary.scalar(k+'.scalar.summary', ph)
24 | self.scalar_summaries_ph.append(ph)
25 | self.scalar_summaries.append(sm)
26 | for k in histogram_keys:
27 | ph = tf.placeholder('float32', None, name=k+'.histogram.summary')
28 | sm = tf.summary.scalar(k+'.histogram.summary', ph)
29 | self.histogram_summaries_ph.append(ph)
30 | self.histogram_summaries.append(sm)
31 |
32 | self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries)
33 |
34 | def add_all_summary(self, writer, values, iter):
35 | # Note that the order of the incoming ```values``` should be the same as that of the
36 | # ```scalar_keys``` given in ```__init__```
37 | if np.sum(np.isnan(values)+0) != 0:
38 | return
39 | sess = U.get_session()
40 | keys = self.scalar_summaries_ph + self.histogram_summaries_ph
41 | feed_dict = {}
42 | for k, v in zip(keys, values):
43 | feed_dict.update({k: v})
44 | summaries_str = sess.run(self.summaries, feed_dict)
45 | writer.add_summary(summaries_str, iter)
46 |
--------------------------------------------------------------------------------
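A minimal usage sketch for the `stats` helper above, assuming the TF1-style graph/session setup used throughout baselines_modified; the log directory and the scalar key names are illustrative only:

```python
import tensorflow as tf
import baselines.common.tf_util as U
from baselines.gail.statistics import stats

# Open a default session so stats.add_all_summary can find it via U.get_session().
U.make_session(num_cpu=1).__enter__()

logger_stats = stats(scalar_keys=['ep_reward', 'ep_length'])
writer = tf.summary.FileWriter('/tmp/gail_logs')

for it in range(10):
    # values must follow the same order as the scalar_keys passed to __init__
    logger_stats.add_all_summary(writer, [float(it), 100.0 + it], it)
writer.flush()
```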
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/actor_critic.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from baselines.her.util import store_args, nn
3 |
4 |
5 | class ActorCritic:
6 | @store_args
7 | def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers,
8 | **kwargs):
9 | """The actor-critic network and related training code.
10 |
11 | Args:
12 | inputs_tf (dict of tensors): all necessary inputs for the network: the
13 | observation (o), the goal (g), and the action (u)
14 | dimo (int): the dimension of the observations
15 | dimg (int): the dimension of the goals
16 | dimu (int): the dimension of the actions
17 | max_u (float): the maximum magnitude of actions; action outputs will be scaled
18 | accordingly
19 | o_stats (baselines.her.Normalizer): normalizer for observations
20 | g_stats (baselines.her.Normalizer): normalizer for goals
21 | hidden (int): number of hidden units that should be used in hidden layers
22 | layers (int): number of hidden layers
23 | """
24 | self.o_tf = inputs_tf['o']
25 | self.g_tf = inputs_tf['g']
26 | self.u_tf = inputs_tf['u']
27 |
28 | # Prepare inputs for actor and critic.
29 | o = self.o_stats.normalize(self.o_tf)
30 | g = self.g_stats.normalize(self.g_tf)
31 | input_pi = tf.concat(axis=1, values=[o, g]) # for actor
32 |
33 | # Networks.
34 | with tf.variable_scope('pi'):
35 | self.pi_tf = self.max_u * tf.tanh(nn(
36 | input_pi, [self.hidden] * self.layers + [self.dimu]))
37 | with tf.variable_scope('Q'):
38 | # for policy training
39 | input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u])
40 | self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1])
41 | # for critic training
42 | input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u])
43 | self._input_Q = input_Q # exposed for tests
44 | self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True)
45 |
--------------------------------------------------------------------------------
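Restating the graph built above in one place (here $o$ and $g$ are the observation and goal after normalization by `o_stats`/`g_stats`, $u$ is the action, and $\mathrm{MLP}$ denotes the `nn` helper with `layers` hidden layers of `hidden` units):

$$
\pi(o,g) = \text{max\_u}\cdot\tanh\!\big(\mathrm{MLP}_\pi([o,\,g])\big),
\qquad
Q(o,g,u) = \mathrm{MLP}_Q\!\big([o,\,g,\,u/\text{max\_u}]\big),
$$

where `Q_pi_tf` is the same critic evaluated at $u=\pi(o,g)$ (weights shared via `reuse=True`) and is used for policy training, while `Q_tf` is evaluated at the replayed action and used for critic training.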
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/experiment/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/experiment/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/experiment/play.py:
--------------------------------------------------------------------------------
1 | # DEPRECATED, use --play flag to baselines.run instead
2 | import click
3 | import numpy as np
4 | import pickle
5 |
6 | from baselines import logger
7 | from baselines.common import set_global_seeds
8 | import baselines.her.experiment.config as config
9 | from baselines.her.rollout import RolloutWorker
10 |
11 |
12 | @click.command()
13 | @click.argument('policy_file', type=str)
14 | @click.option('--seed', type=int, default=0)
15 | @click.option('--n_test_rollouts', type=int, default=10)
16 | @click.option('--render', type=int, default=1)
17 | def main(policy_file, seed, n_test_rollouts, render):
18 | set_global_seeds(seed)
19 |
20 | # Load policy.
21 | with open(policy_file, 'rb') as f:
22 | policy = pickle.load(f)
23 | env_name = policy.info['env_name']
24 |
25 | # Prepare params.
26 | params = config.DEFAULT_PARAMS
27 | if env_name in config.DEFAULT_ENV_PARAMS:
28 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in
29 | params['env_name'] = env_name
30 | params = config.prepare_params(params)
31 | config.log_params(params, logger=logger)
32 |
33 | dims = config.configure_dims(params)
34 |
35 | eval_params = {
36 | 'exploit': True,
37 | 'use_target_net': params['test_with_polyak'],
38 | 'compute_Q': True,
39 | 'rollout_batch_size': 1,
40 | 'render': bool(render),
41 | }
42 |
43 | for name in ['T', 'gamma', 'noise_eps', 'random_eps']:
44 | eval_params[name] = params[name]
45 |
46 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
47 | evaluator.seed(seed)
48 |
49 | # Run evaluation.
50 | evaluator.clear_history()
51 | for _ in range(n_test_rollouts):
52 | evaluator.generate_rollouts()
53 |
54 | # record logs
55 | for key, val in evaluator.logs('test'):
56 | logger.record_tabular(key, np.mean(val))
57 | logger.dump_tabular()
58 |
59 |
60 | if __name__ == '__main__':
61 | main()
62 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/her_sampler.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def make_sample_her_transitions(replay_strategy, replay_k, reward_fun):
5 | """Creates a sample function that can be used for HER experience replay.
6 |
7 | Args:
8 | replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none',
9 | regular DDPG experience replay is used
10 | replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times
11 | as many HER replays as regular replays are used)
12 | reward_fun (function): function to re-compute the reward with substituted goals
13 | """
14 | if replay_strategy == 'future':
15 | future_p = 1 - (1. / (1 + replay_k))
16 | else: # 'replay_strategy' == 'none'
17 | future_p = 0
18 |
19 | def _sample_her_transitions(episode_batch, batch_size_in_transitions):
20 | """episode_batch is {key: array(buffer_size x T x dim_key)}
21 | """
22 | T = episode_batch['u'].shape[1]
23 | rollout_batch_size = episode_batch['u'].shape[0]
24 | batch_size = batch_size_in_transitions
25 |
26 | # Select which episodes and time steps to use.
27 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size)
28 | t_samples = np.random.randint(T, size=batch_size)
29 | transitions = {key: episode_batch[key][episode_idxs, t_samples].copy()
30 | for key in episode_batch.keys()}
31 |
32 | # Select, with probability future_p, which transitions to relabel. These
33 | # will be used for HER replay by substituting in future achieved goals.
34 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p)
35 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples)
36 | future_offset = future_offset.astype(int)
37 | future_t = (t_samples + 1 + future_offset)[her_indexes]
38 |
39 | # Replace goal with achieved goal but only for the previously-selected
40 | # HER transitions (as defined by her_indexes). For the other transitions,
41 | # keep the original goal.
42 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t]
43 | transitions['g'][her_indexes] = future_ag
44 |
45 | # Reconstruct info dictionary for reward computation.
46 | info = {}
47 | for key, value in transitions.items():
48 | if key.startswith('info_'):
49 | info[key.replace('info_', '')] = value
50 |
51 | # Re-compute reward since we may have substituted the goal.
52 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']}
53 | reward_params['info'] = info
54 | transitions['r'] = reward_fun(**reward_params)
55 |
56 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:])
57 | for k in transitions.keys()}
58 |
59 | assert(transitions['u'].shape[0] == batch_size_in_transitions)
60 |
61 | return transitions
62 |
63 | return _sample_her_transitions
64 |
--------------------------------------------------------------------------------
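A small, hedged sketch of exercising the sampler factory above with synthetic data. The array shapes, dimensions, and the toy `reward_fun` are made up for illustration; in the real pipeline the replay buffer supplies `episode_batch` and the environment supplies the reward function.

```python
import numpy as np
from baselines.her.her_sampler import make_sample_her_transitions

# Toy sparse reward: 0 when the achieved goal matches the (possibly relabelled)
# goal, -1 otherwise. The 'info' dict is ignored here.
def reward_fun(ag_2, g, info):
    return -(np.linalg.norm(ag_2 - g, axis=-1) > 1e-3).astype(np.float32)

sample = make_sample_her_transitions(
    replay_strategy='future', replay_k=4, reward_fun=reward_fun)

B, T, dim_g, dim_u = 8, 10, 3, 2          # episodes in buffer, horizon, goal/action dims
episode_batch = {
    'u':    np.random.randn(B, T, dim_u),
    'g':    np.random.randn(B, T, dim_g),
    'ag':   np.random.randn(B, T + 1, dim_g),  # achieved goals have T+1 entries
    'ag_2': np.random.randn(B, T, dim_g),      # achieved goal at the next step
}

transitions = sample(episode_batch, batch_size_in_transitions=32)
print(transitions['g'].shape, transitions['r'].shape)  # (32, 3) (32,)
```

With `replay_k=4`, roughly 80% of the sampled transitions get their goal replaced by a later achieved goal from the same episode, and the reward is recomputed accordingly.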
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/README.md:
--------------------------------------------------------------------------------
1 | # PPOSGD
2 |
3 | - Original paper: https://arxiv.org/abs/1707.06347
4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options.
6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment.
7 |
8 | - Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model`
9 | - Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model`
10 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/cnn_policy.py:
--------------------------------------------------------------------------------
1 | import baselines.common.tf_util as U
2 | import tensorflow as tf
3 | import gym
4 | from baselines.common.distributions import make_pdtype
5 |
6 | class CnnPolicy(object):
7 | recurrent = False
8 | def __init__(self, name, ob_space, ac_space, kind='large'):
9 | with tf.variable_scope(name):
10 | self._init(ob_space, ac_space, kind)
11 | self.scope = tf.get_variable_scope().name
12 |
13 | def _init(self, ob_space, ac_space, kind):
14 | assert isinstance(ob_space, gym.spaces.Box)
15 |
16 | self.pdtype = pdtype = make_pdtype(ac_space)
17 | sequence_length = None
18 |
19 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
20 |
21 | x = ob / 255.0
22 | if kind == 'small': # from A3C paper
23 | x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID"))
24 | x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID"))
25 | x = U.flattenallbut0(x)
26 | x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0)))
27 | elif kind == 'large': # Nature DQN
28 | x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID"))
29 | x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID"))
30 | x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID"))
31 | x = U.flattenallbut0(x)
32 | x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0)))
33 | else:
34 | raise NotImplementedError
35 |
36 | logits = tf.layers.dense(x, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
37 | self.pd = pdtype.pdfromflat(logits)
38 | self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0]
39 |
40 | self.state_in = []
41 | self.state_out = []
42 |
43 | stochastic = tf.placeholder(dtype=tf.bool, shape=())
44 | ac = self.pd.sample() # XXX
45 | self._act = U.function([stochastic, ob], [ac, self.vpred])
46 |
47 | def act(self, stochastic, ob):
48 | ac1, vpred1 = self._act(stochastic, ob[None])
49 | return ac1[0], vpred1[0]
50 | def get_variables(self):
51 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
52 | def get_trainable_variables(self):
53 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
54 | def get_initial_state(self):
55 | return []
56 |
57 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/mlp_policy.py:
--------------------------------------------------------------------------------
1 | from baselines.common.mpi_running_mean_std import RunningMeanStd
2 | import baselines.common.tf_util as U
3 | import tensorflow as tf
4 | import gym
5 | from baselines.common.distributions import make_pdtype
6 |
7 | class MlpPolicy(object):
8 | recurrent = False
9 | def __init__(self, name, *args, **kwargs):
10 | with tf.variable_scope(name):
11 | self._init(*args, **kwargs)
12 | self.scope = tf.get_variable_scope().name
13 |
14 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
15 | assert isinstance(ob_space, gym.spaces.Box)
16 |
17 | self.pdtype = pdtype = make_pdtype(ac_space)
18 | sequence_length = None
19 |
20 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
21 |
22 | with tf.variable_scope("obfilter"):
23 | self.ob_rms = RunningMeanStd(shape=ob_space.shape)
24 |
25 | with tf.variable_scope('vf'):
26 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
27 | last_out = obz
28 | for i in range(num_hid_layers):
29 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
30 | self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]
31 |
32 | with tf.variable_scope('pol'):
33 | last_out = obz
34 | for i in range(num_hid_layers):
35 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
36 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
37 | mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01))
38 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer())
39 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
40 | else:
41 | pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))
42 |
43 | self.pd = pdtype.pdfromflat(pdparam)
44 |
45 | self.state_in = []
46 | self.state_out = []
47 |
48 | stochastic = tf.placeholder(dtype=tf.bool, shape=())
49 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
50 | self._act = U.function([stochastic, ob], [ac, self.vpred])
51 |
52 | def act(self, stochastic, ob):
53 | ac1, vpred1 = self._act(stochastic, ob[None])
54 | return ac1[0], vpred1[0]
55 | def get_variables(self):
56 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope)
57 | def get_trainable_variables(self):
58 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope)
59 | def get_initial_state(self):
60 | return []
61 |
62 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/run_atari.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from mpi4py import MPI
4 | from baselines.common import set_global_seeds
5 | from baselines import bench
6 | import os.path as osp
7 | from baselines import logger
8 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind
9 | from baselines.common.cmd_util import atari_arg_parser
10 |
11 | def train(env_id, num_timesteps, seed):
12 | from baselines.ppo1 import pposgd_simple, cnn_policy
13 | import baselines.common.tf_util as U
14 | rank = MPI.COMM_WORLD.Get_rank()
15 | sess = U.single_threaded_session()
16 | sess.__enter__()
17 | if rank == 0:
18 | logger.configure()
19 | else:
20 | logger.configure(format_strs=[])
21 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
22 | set_global_seeds(workerseed)
23 | env = make_atari(env_id)
24 | def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
25 | return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
26 | env = bench.Monitor(env, logger.get_dir() and
27 | osp.join(logger.get_dir(), str(rank)))
28 | env.seed(workerseed)
29 |
30 | env = wrap_deepmind(env)
31 | env.seed(workerseed)
32 |
33 | pposgd_simple.learn(env, policy_fn,
34 | max_timesteps=int(num_timesteps * 1.1),
35 | timesteps_per_actorbatch=256,
36 | clip_param=0.2, entcoeff=0.01,
37 | optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
38 | gamma=0.99, lam=0.95,
39 | schedule='linear'
40 | )
41 | env.close()
42 |
43 | def main():
44 | args = atari_arg_parser().parse_args()
45 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
46 |
47 | if __name__ == '__main__':
48 | main()
49 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/run_humanoid.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
4 | from baselines.common import tf_util as U
5 | from baselines import logger
6 |
7 | import gym
8 |
9 | def train(num_timesteps, seed, model_path=None):
10 | env_id = 'Humanoid-v2'
11 | from baselines.ppo1 import mlp_policy, pposgd_simple
12 | U.make_session(num_cpu=1).__enter__()
13 | def policy_fn(name, ob_space, ac_space):
14 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
15 | hid_size=64, num_hid_layers=2)
16 | env = make_mujoco_env(env_id, seed)
17 |
18 | # parameters below were the best found in a simple random search
19 | # these are good enough to make the humanoid walk, but whether they are
20 | # the absolute best is not certain
21 | env = RewScale(env, 0.1)
22 | logger.log("NOTE: reward will be scaled by a factor of 10 in logged stats. Check the monitor for unscaled reward.")
23 | pi = pposgd_simple.learn(env, policy_fn,
24 | max_timesteps=num_timesteps,
25 | timesteps_per_actorbatch=2048,
26 | clip_param=0.1, entcoeff=0.0,
27 | optim_epochs=10,
28 | optim_stepsize=1e-4,
29 | optim_batchsize=64,
30 | gamma=0.99,
31 | lam=0.95,
32 | schedule='constant',
33 | )
34 | env.close()
35 | if model_path:
36 | U.save_state(model_path)
37 |
38 | return pi
39 |
40 | class RewScale(gym.RewardWrapper):
41 | def __init__(self, env, scale):
42 | gym.RewardWrapper.__init__(self, env)
43 | self.scale = scale
44 | def reward(self, r):
45 | return r * self.scale
46 |
47 | def main():
48 | logger.configure()
49 | parser = mujoco_arg_parser()
50 | parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
51 | parser.set_defaults(num_timesteps=int(5e7))
52 |
53 | args = parser.parse_args()
54 |
55 | if not args.play:
56 | # train the model
57 | train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
58 | else:
59 | # construct the model object, load pre-trained model and render
60 | pi = train(num_timesteps=1, seed=args.seed)
61 | U.load_state(args.model_path)
62 | env = make_mujoco_env('Humanoid-v2', seed=0)
63 |
64 | ob = env.reset()
65 | while True:
66 | action = pi.act(stochastic=False, ob=ob)[0]
67 | ob, _, done, _ = env.step(action)
68 | env.render()
69 | if done:
70 | ob = env.reset()
71 |
72 | if __name__ == '__main__':
73 | main()
74 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/run_mujoco.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser
4 | from baselines.common import tf_util as U
5 | from baselines import logger
6 |
7 | def train(env_id, num_timesteps, seed):
8 | from baselines.ppo1 import mlp_policy, pposgd_simple
9 | U.make_session(num_cpu=1).__enter__()
10 | def policy_fn(name, ob_space, ac_space):
11 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
12 | hid_size=64, num_hid_layers=2)
13 | env = make_mujoco_env(env_id, seed)
14 | pposgd_simple.learn(env, policy_fn,
15 | max_timesteps=num_timesteps,
16 | timesteps_per_actorbatch=2048,
17 | clip_param=0.2, entcoeff=0.0,
18 | optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
19 | gamma=0.99, lam=0.95, schedule='linear',
20 | )
21 | env.close()
22 |
23 | def main():
24 | args = mujoco_arg_parser().parse_args()
25 | logger.configure()
26 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
27 |
28 | if __name__ == '__main__':
29 | main()
30 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/run_robotics.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | from mpi4py import MPI
4 | from baselines.common import set_global_seeds
5 | from baselines import logger
6 | from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser
7 | import mujoco_py
8 |
9 |
10 | def train(env_id, num_timesteps, seed):
11 | from baselines.ppo1 import mlp_policy, pposgd_simple
12 | import baselines.common.tf_util as U
13 | rank = MPI.COMM_WORLD.Get_rank()
14 | sess = U.single_threaded_session()
15 | sess.__enter__()
16 | mujoco_py.ignore_mujoco_warnings().__enter__()
17 | workerseed = seed + 10000 * rank
18 | set_global_seeds(workerseed)
19 | env = make_robotics_env(env_id, workerseed, rank=rank)
20 | def policy_fn(name, ob_space, ac_space):
21 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
22 | hid_size=256, num_hid_layers=3)
23 |
24 | pposgd_simple.learn(env, policy_fn,
25 | max_timesteps=num_timesteps,
26 | timesteps_per_actorbatch=2048,
27 | clip_param=0.2, entcoeff=0.0,
28 | optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256,
29 | gamma=0.99, lam=0.95, schedule='linear',
30 | )
31 | env.close()
32 |
33 |
34 | def main():
35 | args = robotics_arg_parser().parse_args()
36 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)
37 |
38 |
39 | if __name__ == '__main__':
40 | main()
41 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/README.md:
--------------------------------------------------------------------------------
1 | # PPO2
2 |
3 | - Original paper: https://arxiv.org/abs/1707.06347
4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
5 |
6 | - `python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
7 | - `python -m baselines.run --alg=ppo2 --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M frames on a Mujoco Ant environment.
8 | - also refer to the repo-wide [README.md](../../README.md#training-models)
9 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/defaults.py:
--------------------------------------------------------------------------------
1 | def mujoco():
2 | return dict(
3 | nsteps=2048,
4 | nminibatches=32,
5 | lam=0.95,
6 | gamma=0.99,
7 | noptepochs=10,
8 | log_interval=1,
9 | ent_coef=0.0,
10 | lr=lambda f: 3e-4 * f,
11 | cliprange=0.2,
12 | value_network='copy'
13 | )
14 |
15 | def atari():
16 | return dict(
17 | nsteps=128, nminibatches=4,
18 | lam=0.95, gamma=0.99, noptepochs=4, log_interval=1,
19 | ent_coef=.01,
20 | lr=lambda f : f * 2.5e-4,
21 | cliprange=0.1,
22 | )
23 |
24 | def retro():
25 | return atari()
26 |
--------------------------------------------------------------------------------
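These dictionaries hold the per-environment-family hyperparameters that `baselines.run` looks up by environment type. Below is a hedged sketch of wiring the `mujoco()` defaults into `ppo2.learn` by hand; the env choice and timestep budget are arbitrary illustration values, and a working MuJoCo install is assumed:

```python
import gym
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
from baselines.ppo2 import ppo2
from baselines.ppo2.defaults import mujoco

# Single-process vectorized env, as ppo2.learn expects a VecEnv.
env = DummyVecEnv([lambda: gym.make('Hopper-v2')])
model = ppo2.learn(network='mlp', env=env, total_timesteps=int(1e6), **mujoco())
```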
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/microbatched_model.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from baselines.ppo2.model import Model
4 |
5 | class MicrobatchedModel(Model):
6 | """
7 | Model that trains one microbatch at a time - useful when gradient computation
8 | on the entire minibatch would overflow memory
9 | """
10 | def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
11 | nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, comm, microbatch_size):
12 |
13 | self.nmicrobatches = nbatch_train // microbatch_size
14 | self.microbatch_size = microbatch_size
15 | assert nbatch_train % microbatch_size == 0, 'microbatch_size ({}) should divide nbatch_train ({}) evenly'.format(microbatch_size, nbatch_train)
16 |
17 | super().__init__(
18 | policy=policy,
19 | ob_space=ob_space,
20 | ac_space=ac_space,
21 | nbatch_act=nbatch_act,
22 | nbatch_train=microbatch_size,
23 | nsteps=nsteps,
24 | ent_coef=ent_coef,
25 | vf_coef=vf_coef,
26 | max_grad_norm=max_grad_norm,
27 | mpi_rank_weight=mpi_rank_weight,
28 | comm=comm)
29 |
30 | self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads]
31 | grads_ph_and_vars = list(zip(self.grads_ph, self.var))
32 | self._apply_gradients_op = self.trainer.apply_gradients(grads_ph_and_vars)
33 |
34 |
35 | def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
36 | assert states is None, "microbatches with recurrent models are not supported yet"
37 |
38 | # Here we calculate advantage A(s,a) = R + yV(s') - V(s)
39 | # Returns = R + yV(s')
40 | advs = returns - values
41 |
42 | # Normalize the advantages
43 | advs = (advs - advs.mean()) / (advs.std() + 1e-8)
44 |
45 | # Initialize empty list for per-microbatch stats like pg_loss, vf_loss, entropy, approxkl (whatever is in self.stats_list)
46 | stats_vs = []
47 |
48 | for microbatch_idx in range(self.nmicrobatches):
49 | _sli = range(microbatch_idx * self.microbatch_size, (microbatch_idx+1) * self.microbatch_size)
50 | td_map = {
51 | self.train_model.X: obs[_sli],
52 | self.A:actions[_sli],
53 | self.ADV:advs[_sli],
54 | self.R:returns[_sli],
55 | self.CLIPRANGE:cliprange,
56 | self.OLDNEGLOGPAC:neglogpacs[_sli],
57 | self.OLDVPRED:values[_sli]
58 | }
59 |
60 | # Compute gradient on a microbatch (note that variables do not change here) ...
61 | grad_v, stats_v = self.sess.run([self.grads, self.stats_list], td_map)
62 | if microbatch_idx == 0:
63 | sum_grad_v = grad_v
64 | else:
65 | # .. and add to the total of the gradients
66 | for i, g in enumerate(grad_v):
67 | sum_grad_v[i] += g
68 | stats_vs.append(stats_v)
69 |
70 | feed_dict = {ph: sum_g / self.nmicrobatches for ph, sum_g in zip(self.grads_ph, sum_grad_v)}
71 | feed_dict[self.LR] = lr
72 | # Update variables using average of the gradients
73 | self.sess.run(self._apply_gradients_op, feed_dict)
74 | # Return average of the stats
75 | return np.mean(np.array(stats_vs), axis=0).tolist()
76 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/runner.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from baselines.common.runners import AbstractEnvRunner
3 |
4 | class Runner(AbstractEnvRunner):
5 | """
6 | We use this object to make a mini batch of experiences
7 | __init__:
8 | - Initialize the runner
9 |
10 | run():
11 | - Make a mini batch
12 | """
13 | def __init__(self, *, env, model, nsteps, gamma, lam):
14 | super().__init__(env=env, model=model, nsteps=nsteps)
15 | # Lambda used in GAE (Generalized Advantage Estimation)
16 | self.lam = lam
17 | # Discount rate
18 | self.gamma = gamma
19 |
20 | def run(self):
21 | # Here, we init the lists that will contain the mb of experiences
22 | mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[]
23 | mb_states = self.states
24 | epinfos = []
25 | # For n in range number of steps
26 | for _ in range(self.nsteps):
27 | # Given observations, get action, value and neglogpacs
28 | # We already have self.obs because the Runner superclass runs self.obs[:] = env.reset() on init
29 | actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones)
30 | mb_obs.append(self.obs.copy())
31 | mb_actions.append(actions)
32 | mb_values.append(values)
33 | mb_neglogpacs.append(neglogpacs)
34 | mb_dones.append(self.dones)
35 |
36 | # Take actions in the env and observe the results
37 | # infos contains a ton of useful information
38 | self.obs[:], rewards, self.dones, infos = self.env.step(actions)
39 | for info in infos:
40 | maybeepinfo = info.get('episode')
41 | if maybeepinfo: epinfos.append(maybeepinfo)
42 | mb_rewards.append(rewards)
43 | #batch of steps to batch of rollouts
44 | mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype)
45 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32)
46 | mb_actions = np.asarray(mb_actions)
47 | mb_values = np.asarray(mb_values, dtype=np.float32)
48 | mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32)
49 | mb_dones = np.asarray(mb_dones, dtype=np.bool)
50 | last_values = self.model.value(self.obs, S=self.states, M=self.dones)
51 |
52 | # discount/bootstrap off value fn
53 | mb_returns = np.zeros_like(mb_rewards)
54 | mb_advs = np.zeros_like(mb_rewards)
55 | lastgaelam = 0
56 | for t in reversed(range(self.nsteps)):
57 | if t == self.nsteps - 1:
58 | nextnonterminal = 1.0 - self.dones
59 | nextvalues = last_values
60 | else:
61 | nextnonterminal = 1.0 - mb_dones[t+1]
62 | nextvalues = mb_values[t+1]
63 | delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t]
64 | mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam
65 | mb_returns = mb_advs + mb_values
66 | return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)),
67 | mb_states, epinfos)
68 | # obs, returns, masks, actions, values, neglogpacs, states = runner.run()
69 | def sf01(arr):
70 | """
71 | swap and then flatten axes 0 and 1
72 | """
73 | s = arr.shape
74 | return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:])
75 |
76 |
77 |
--------------------------------------------------------------------------------
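For reference, the backward loop in `run()` above implements the standard GAE(λ) recursion; in the notation of the code, with $d_{t+1}$ the done flag used to form `nextnonterminal`:

$$
\delta_t = r_t + \gamma\,V(s_{t+1})\,(1 - d_{t+1}) - V(s_t),
\qquad
\hat A_t = \delta_t + \gamma\lambda\,(1 - d_{t+1})\,\hat A_{t+1},
\qquad
\hat R_t = \hat A_t + V(s_t).
$$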
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/test_microbatches.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import tensorflow as tf
3 | import numpy as np
4 | from functools import partial
5 |
6 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
7 | from baselines.common.tf_util import make_session
8 | from baselines.ppo2.ppo2 import learn
9 |
10 | from baselines.ppo2.microbatched_model import MicrobatchedModel
11 |
12 | def test_microbatches():
13 | def env_fn():
14 | env = gym.make('CartPole-v0')
15 | env.seed(0)
16 | return env
17 |
18 | learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0)
19 |
20 | env_ref = DummyVecEnv([env_fn])
21 | sess_ref = make_session(make_default=True, graph=tf.Graph())
22 | learn_fn(env=env_ref)
23 | vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()}
24 |
25 | env_test = DummyVecEnv([env_fn])
26 | sess_test = make_session(make_default=True, graph=tf.Graph())
27 | learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2))
28 | # learn_fn(env=env_test)
29 | vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()}
30 |
31 | for v in vars_ref:
32 | np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3)
33 |
34 | if __name__ == '__main__':
35 | test_microbatches()
36 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/trpo_mpi/README.md:
--------------------------------------------------------------------------------
1 | # trpo_mpi
2 |
3 | - Original paper: https://arxiv.org/abs/1502.05477
4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/
5 | - `mpirun -np 16 python -m baselines.run --alg=trpo_mpi --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options.
6 | - `python -m baselines.run --alg=trpo_mpi --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a Mujoco Ant environment.
7 | - also refer to the repo-wide [README.md](../../README.md#training-models)
8 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/trpo_mpi/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/trpo_mpi/__init__.py
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/trpo_mpi/defaults.py:
--------------------------------------------------------------------------------
1 | from baselines.common.models import mlp, cnn_small
2 |
3 |
4 | def atari():
5 | return dict(
6 | network = cnn_small(),
7 | timesteps_per_batch=512,
8 | max_kl=0.001,
9 | cg_iters=10,
10 | cg_damping=1e-3,
11 | gamma=0.98,
12 | lam=1.0,
13 | vf_iters=3,
14 | vf_stepsize=1e-4,
15 | entcoeff=0.00,
16 | )
17 |
18 | def mujoco():
19 | return dict(
20 | network = mlp(num_hidden=32, num_layers=2),
21 | timesteps_per_batch=1024,
22 | max_kl=0.01,
23 | cg_iters=10,
24 | cg_damping=0.1,
25 | gamma=0.99,
26 | lam=0.98,
27 | vf_iters=5,
28 | vf_stepsize=1e-3,
29 | normalize_observations=True,
30 | )
31 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | select = F,E999,W291,W293
3 | exclude =
4 | .git,
5 | __pycache__,
6 | baselines/ppo1,
7 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/setup.py:
--------------------------------------------------------------------------------
1 | import re
2 | from setuptools import setup, find_packages
3 | import sys
4 |
5 | if sys.version_info.major != 3:
6 | print('This Python is only compatible with Python 3, but you are running '
7 | 'Python {}. The installation will likely fail.'.format(sys.version_info.major))
8 |
9 |
10 | extras = {
11 | 'test': [
12 | 'filelock',
13 | 'pytest',
14 | 'pytest-forked',
15 | 'atari-py',
16 | 'matplotlib',
17 | 'pandas'
18 | ],
19 | 'mpi': [
20 | 'mpi4py'
21 | ]
22 | }
23 |
24 | all_deps = []
25 | for group_name in extras:
26 | all_deps += extras[group_name]
27 |
28 | extras['all'] = all_deps
29 |
30 | setup(name='baselines',
31 | packages=[package for package in find_packages()
32 | if package.startswith('baselines')],
33 | install_requires=[
34 | 'gym>=0.10.0, <1.0.0',
35 | 'scipy',
36 | 'tqdm',
37 | 'joblib',
38 | 'cloudpickle',
39 | 'click',
40 | 'opencv-python'
41 | ],
42 | extras_require=extras,
43 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms',
44 | author='OpenAI',
45 | url='https://github.com/openai/baselines',
46 | author_email='gym@openai.com',
47 | version='0.1.6')
48 |
49 |
50 | # ensure there is some tensorflow build with version above 1.4
51 | import pkg_resources
52 | tf_pkg = None
53 | for tf_pkg_name in ['tensorflow', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-gpu']:
54 | try:
55 | tf_pkg = pkg_resources.get_distribution(tf_pkg_name)
56 | except pkg_resources.DistributionNotFound:
57 | pass
58 | assert tf_pkg is not None, 'TensorFlow needed, of version above 1.4'
59 | from distutils.version import LooseVersion
60 | assert LooseVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= LooseVersion('1.4.0')
61 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/enjoy.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | # workaround to unpickle old model files
4 | import sys
5 |
6 | import numpy as np
7 | import torch
8 |
9 | from a2c_ppo_acktr.envs import VecPyTorch, make_vec_envs
10 | from a2c_ppo_acktr.utils import get_render_func, get_vec_normalize
11 |
12 | sys.path.append('a2c_ppo_acktr')
13 |
14 | parser = argparse.ArgumentParser(description='RL')
15 | parser.add_argument(
16 | '--seed', type=int, default=1, help='random seed (default: 1)')
17 | parser.add_argument(
18 | '--log-interval',
19 | type=int,
20 | default=10,
21 | help='log interval, one log per n updates (default: 10)')
22 | parser.add_argument(
23 | '--env-name',
24 | default='PongNoFrameskip-v4',
25 | help='environment to train on (default: PongNoFrameskip-v4)')
26 | parser.add_argument(
27 | '--load-dir',
28 | default='./trained_models/',
29 | help='directory to save agent logs (default: ./trained_models/)')
30 | parser.add_argument(
31 | '--non-det',
32 | action='store_true',
33 | default=False,
34 | help='whether to use a non-deterministic policy')
35 | args = parser.parse_args()
36 |
37 | args.det = not args.non_det
38 |
39 | env = make_vec_envs(
40 | args.env_name,
41 | args.seed + 1000,
42 | 1,
43 | None,
44 | None,
45 | device='cpu',
46 | allow_early_resets=False)
47 |
48 | # Get a render function
49 | render_func = get_render_func(env)
50 |
51 | # We need to use the same statistics for normalization as used in training
52 | actor_critic, ob_rms = \
53 | torch.load(os.path.join(args.load_dir, args.env_name + ".pt"))
54 |
55 | vec_norm = get_vec_normalize(env)
56 | if vec_norm is not None:
57 | vec_norm.eval()
58 | vec_norm.ob_rms = ob_rms
59 |
60 | recurrent_hidden_states = torch.zeros(1,
61 | actor_critic.recurrent_hidden_state_size)
62 | masks = torch.zeros(1, 1)
63 |
64 | obs = env.reset()
65 |
66 | if render_func is not None:
67 | render_func('human')
68 |
69 | if args.env_name.find('Bullet') > -1:
70 | import pybullet as p
71 |
72 | torsoId = -1
73 | for i in range(p.getNumBodies()):
74 | if (p.getBodyInfo(i)[0].decode() == "torso"):
75 | torsoId = i
76 |
77 | while True:
78 | with torch.no_grad():
79 | value, action, _, recurrent_hidden_states = actor_critic.act(
80 | obs, recurrent_hidden_states, masks, deterministic=args.det)
81 |
82 | # Observe reward and next obs
83 | obs, reward, done, _ = env.step(action)
84 |
85 | masks.fill_(0.0 if done else 1.0)
86 |
87 | if args.env_name.find('Bullet') > -1:
88 | if torsoId > -1:
89 | distance = 5
90 | yaw = 0
91 | humanPos, humanOrn = p.getBasePositionAndOrientation(torsoId)
92 | p.resetDebugVisualizerCamera(distance, yaw, -20, humanPos)
93 |
94 | if render_func is not None:
95 | render_func('human')
96 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/evaluation.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 | from a2c_ppo_acktr import utils
5 | from a2c_ppo_acktr.envs import make_vec_envs
6 |
7 |
8 | def evaluate(actor_critic, ob_rms, env_name, seed, num_processes, eval_log_dir,
9 | device):
10 | eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes,
11 | None, eval_log_dir, device, True)
12 |
13 | vec_norm = utils.get_vec_normalize(eval_envs)
14 | if vec_norm is not None:
15 | vec_norm.eval()
16 | vec_norm.ob_rms = ob_rms
17 |
18 | eval_episode_rewards = []
19 |
20 | obs = eval_envs.reset()
21 | eval_recurrent_hidden_states = torch.zeros(
22 | num_processes, actor_critic.recurrent_hidden_state_size, device=device)
23 | eval_masks = torch.zeros(num_processes, 1, device=device)
24 |
25 | while len(eval_episode_rewards) < 10:
26 | with torch.no_grad():
27 | _, action, _, eval_recurrent_hidden_states = actor_critic.act(
28 | obs,
29 | eval_recurrent_hidden_states,
30 | eval_masks,
31 | deterministic=True)
32 |
33 | # Observe reward and next obs
34 | obs, _, done, infos = eval_envs.step(action)
35 |
36 | eval_masks = torch.tensor(
37 | [[0.0] if done_ else [1.0] for done_ in done],
38 | dtype=torch.float32,
39 | device=device)
40 |
41 | for info in infos:
42 | if 'episode' in info.keys():
43 | eval_episode_rewards.append(info['episode']['r'])
44 |
45 | eval_envs.close()
46 |
47 | print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
48 | len(eval_episode_rewards), np.mean(eval_episode_rewards)))
49 |
--------------------------------------------------------------------------------
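A hedged sketch of calling `evaluate` on a checkpoint produced by this repo's training script; the checkpoint path, env name, and log directory are placeholders, the snippet is assumed to run from the `pytorch-a2c-ppo-acktr-gail_modified` directory, and a MuJoCo install is assumed for HalfCheetah:

```python
import os
import torch
from evaluation import evaluate

# Checkpoints are saved as an (actor_critic, ob_rms) pair, as loaded in enjoy.py above.
actor_critic, ob_rms = torch.load('./trained_models/HalfCheetah-v2.pt',
                                  map_location='cpu')

os.makedirs('/tmp/eval', exist_ok=True)
evaluate(actor_critic, ob_rms, 'HalfCheetah-v2', seed=1, num_processes=1,
         eval_log_dir='/tmp/eval', device=torch.device('cpu'))
```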
/pytorch-a2c-ppo-acktr-gail_modified/gail_experts/README.md:
--------------------------------------------------------------------------------
1 | ## Data
2 |
3 | Download from
4 | https://drive.google.com/open?id=1Ipu5k99nwewVDG1yFetUxqtwVlgBg5su
5 |
6 | and store in this folder.
7 |
8 | ## Convert to pytorch
9 |
10 | ```bash
11 | python convert_to_pytorch.py --h5-file trajs_halfcheetah.h5
12 | ```
13 |
14 | ## Run
15 |
16 | ```bash
17 | python main.py --env-name "HalfCheetah-v2" --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --gae-lambda 0.95 --num-env-steps 10000000 --use-linear-lr-decay --use-proper-time-limits --gail
18 | ```
19 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/gail_experts/convert_to_pytorch.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import sys
4 |
5 | import h5py
6 | import numpy as np
7 | import torch
8 |
9 |
10 | def main():
11 | parser = argparse.ArgumentParser(
12 | 'Converts expert trajectories from h5 to pt format.')
13 | parser.add_argument(
14 | '--h5-file',
15 | default='trajs_halfcheetah.h5',
16 | help='input h5 file',
17 | type=str)
18 | parser.add_argument(
19 | '--pt-file',
20 | default=None,
21 | help='output pt file, by default replaces file extension with pt',
22 | type=str)
23 | args = parser.parse_args()
24 |
25 | if args.pt_file is None:
26 | args.pt_file = os.path.splitext(args.h5_file)[0] + '.pt'
27 |
28 | with h5py.File(args.h5_file, 'r') as f:
29 | dataset_size = f['obs_B_T_Do'].shape[0] # full dataset size
30 |
31 | states = f['obs_B_T_Do'][:dataset_size, ...][...]
32 | actions = f['a_B_T_Da'][:dataset_size, ...][...]
33 | rewards = f['r_B_T'][:dataset_size, ...][...]
34 | lens = f['len_B'][:dataset_size, ...][...]
35 |
36 | states = torch.from_numpy(states).float()
37 | actions = torch.from_numpy(actions).float()
38 | rewards = torch.from_numpy(rewards).float()
39 | lens = torch.from_numpy(lens).long()
40 |
41 | data = {
42 | 'states': states,
43 | 'actions': actions,
44 | 'rewards': rewards,
45 | 'lengths': lens
46 | }
47 |
48 | torch.save(data, args.pt_file)
49 |
50 |
51 | if __name__ == '__main__':
52 | main()
53 |
--------------------------------------------------------------------------------
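A quick follow-up check (hedged; assumes the conversion above has been run with the default output name) to confirm the tensor shapes and dtypes stored in the generated `.pt` file:

```python
import torch

data = torch.load('trajs_halfcheetah.pt')
for key, tensor in data.items():
    print(key, tuple(tensor.shape), tensor.dtype)
```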
/pytorch-a2c-ppo-acktr-gail_modified/generate_tmux_yaml.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import yaml
4 |
5 | parser = argparse.ArgumentParser(description='Generate a tmuxp config that launches training runs across seeds.')
6 | parser.add_argument(
7 | '--num-seeds',
8 | type=int,
9 | default=4,
10 | help='number of random seeds to generate')
11 | parser.add_argument(
12 | '--env-names',
13 | default="PongNoFrameskip-v4",
14 | help='environment name separated by semicolons')
15 | args = parser.parse_args()
16 |
17 | ppo_mujoco_template = "python main.py --env-name {0} --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --tau 0.95 --num-env-steps 1000000 --use-linear-lr-decay --no-cuda --log-dir /tmp/gym/{1}/{1}-{2} --seed {2} --use-proper-time-limits"
18 |
19 | ppo_atari_template = "env CUDA_VISIBLE_DEVICES={2} python main.py --env-name {0} --algo ppo --use-gae --lr 2.5e-4 --clip-param 0.1 --value-loss-coef 0.5 --num-processes 8 --num-steps 128 --num-mini-batch 4 --log-interval 1 --use-linear-lr-decay --entropy-coef 0.01 --log-dir /tmp/gym/{1}/{1}-{2} --seed {2}"
20 |
21 | template = ppo_atari_template
22 |
23 | config = {"session_name": "run-all", "windows": []}
24 |
25 | for i in range(args.num_seeds):
26 | panes_list = []
27 | for env_name in args.env_names.split(';'):
28 | panes_list.append(
29 | template.format(env_name,
30 | env_name.split('-')[0].lower(), i))
31 |
32 | config["windows"].append({
33 | "window_name": "seed-{}".format(i),
34 | "panes": panes_list
35 | })
36 |
37 | yaml.dump(config, open("run_all.yaml", "w"), default_flow_style=False)
38 |
--------------------------------------------------------------------------------
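A quick, hedged way to sanity-check the generated config before handing it to tmuxp (assumes the script above has been run and produced `run_all.yaml` in the working directory):

```python
import yaml

with open('run_all.yaml') as f:
    cfg = yaml.safe_load(f)

print(cfg['session_name'])              # "run-all"
for window in cfg['windows']:           # one window per seed
    print(window['window_name'], len(window['panes']))
```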
/pytorch-a2c-ppo-acktr-gail_modified/requirements.txt:
--------------------------------------------------------------------------------
1 | gym
2 | matplotlib
3 | pybullet
4 |
--------------------------------------------------------------------------------
/pytorch-a2c-ppo-acktr-gail_modified/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | setup(
4 | name='a2c-ppo-acktr',
5 | packages=find_packages(),
6 | version='0.0.1',
7 | install_requires=['gym', 'matplotlib', 'pybullet'])
8 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tensorboardX
2 | sklearn
3 | numpy
4 | matplotlib
5 | torch-ac
6 | gym
7 | gym-minigrid
8 | colored_traceback
9 | graphviz
10 | gym[atari]
11 | box2d-py
12 | opencv-python
13 | torchvision
14 | pybullet
15 | tqdm
16 | tensorflow-gpu
17 | kdtree
--------------------------------------------------------------------------------
/scripts/__pycache__/analyze_synthesized_programs.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/scripts/__pycache__/analyze_synthesized_programs.cpython-37.pyc
--------------------------------------------------------------------------------