├── README.md ├── VAE_CVAE_MNIST_mod ├── README.md ├── __pycache__ │ ├── models.cpython-37.pyc │ └── utils.cpython-37.pyc ├── figs │ ├── 1519649452.702026 │ │ ├── E9-Dist.png │ │ └── E9I937.png │ └── 1519649461.195146 │ │ ├── E9-Dist.png │ │ └── E9I937.png ├── models.py ├── requirements.txt ├── train.py └── utils.py ├── __pycache__ ├── datastructures.cpython-37.pyc ├── executor.cpython-37.pyc ├── find_duplicate_programs.cpython-37.pyc ├── gridworld_environments.cpython-37.pyc ├── internal_rewards.cpython-37.pyc ├── learn_program_distance_experiments.cpython-37.pyc ├── operations.cpython-37.pyc ├── operations_list.cpython-37.pyc ├── predict_performance.cpython-37.pyc ├── predict_performance_experiments.cpython-37.pyc ├── program.cpython-37.pyc ├── program_types.cpython-37.pyc ├── run_agent.cpython-37.pyc ├── search_program_experiments.cpython-37.pyc ├── search_programs.cpython-37.pyc ├── simulate_search.cpython-37.pyc └── test_synthesized_programs_experiments.cpython-37.pyc ├── datastructures.py ├── diversity ├── __pycache__ │ └── density_peaks.cpython-37.pyc └── density_peaks.py ├── executor.py ├── find_duplicate_programs.py ├── gridworld_environments.py ├── helpers ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── config.cpython-37.pyc │ ├── debug.cpython-37.pyc │ ├── experiment_params.cpython-37.pyc │ ├── nn.cpython-37.pyc │ ├── plotting.cpython-37.pyc │ ├── probability.cpython-37.pyc │ ├── torch_knn.cpython-37.pyc │ └── util.cpython-37.pyc ├── config.py ├── datastructures.py ├── debug.py ├── experiment_params.py ├── lists.py ├── nn.py ├── plotting.py ├── probability.py ├── statistics │ ├── __pycache__ │ │ └── welfords_std.cpython-37.pyc │ └── welfords_std.py ├── task_queue.py ├── torch_knn.py └── util.py ├── internal_rewards.py ├── learn_program_distance_experiments.py ├── operations.py ├── operations_list.py ├── predict_performance.py ├── predict_performance_experiments.py ├── program.py ├── program_synthesis.py ├── program_types.py ├── pytorch-a2c-ppo-acktr-gail_modified ├── .gitignore ├── LICENSE ├── README.md ├── a2c_ppo_acktr │ ├── __init__.py │ ├── algo │ │ ├── __init__.py │ │ ├── a2c_acktr.py │ │ ├── gail.py │ │ ├── kfac.py │ │ └── ppo.py │ ├── arguments.py │ ├── distributions.py │ ├── envs.py │ ├── model.py │ ├── storage.py │ └── utils.py ├── baselines_modified │ ├── .benchmark_pattern │ ├── .gitignore │ ├── .travis.yml │ ├── Dockerfile │ ├── LICENSE │ ├── README.md │ ├── baselines │ │ ├── __init__.py │ │ ├── a2c │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── a2c.py │ │ │ ├── runner.py │ │ │ └── utils.py │ │ ├── acer │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── acer.py │ │ │ ├── buffer.py │ │ │ ├── defaults.py │ │ │ ├── policies.py │ │ │ └── runner.py │ │ ├── acktr │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── acktr.py │ │ │ ├── defaults.py │ │ │ ├── kfac.py │ │ │ ├── kfac_utils.py │ │ │ └── utils.py │ │ ├── bench │ │ │ ├── __init__.py │ │ │ ├── benchmarks.py │ │ │ ├── monitor.py │ │ │ └── test_monitor.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── atari_wrappers.py │ │ │ ├── cg.py │ │ │ ├── cmd_util.py │ │ │ ├── console_util.py │ │ │ ├── dataset.py │ │ │ ├── distributions.py │ │ │ ├── input.py │ │ │ ├── math_util.py │ │ │ ├── misc_util.py │ │ │ ├── models.py │ │ │ ├── mpi_adam.py │ │ │ ├── mpi_adam_optimizer.py │ │ │ ├── mpi_fork.py │ │ │ ├── mpi_moments.py │ │ │ ├── mpi_running_mean_std.py │ │ │ ├── mpi_util.py │ │ │ ├── plot_util.py │ │ │ ├── policies.py │ │ │ ├── retro_wrappers.py │ │ │ ├── runners.py │ │ │ ├── running_mean_std.py │ │ │ 
├── schedules.py │ │ │ ├── segment_tree.py │ │ │ ├── test_mpi_util.py │ │ │ ├── tests │ │ │ │ ├── __init__.py │ │ │ │ ├── envs │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── fixed_sequence_env.py │ │ │ │ │ ├── identity_env.py │ │ │ │ │ ├── identity_env_test.py │ │ │ │ │ └── mnist_env.py │ │ │ │ ├── test_cartpole.py │ │ │ │ ├── test_doc_examples.py │ │ │ │ ├── test_env_after_learn.py │ │ │ │ ├── test_fetchreach.py │ │ │ │ ├── test_fixed_sequence.py │ │ │ │ ├── test_identity.py │ │ │ │ ├── test_mnist.py │ │ │ │ ├── test_plot_util.py │ │ │ │ ├── test_schedules.py │ │ │ │ ├── test_segment_tree.py │ │ │ │ ├── test_serialization.py │ │ │ │ ├── test_tf_util.py │ │ │ │ ├── test_with_mpi.py │ │ │ │ └── util.py │ │ │ ├── tf_util.py │ │ │ ├── tile_images.py │ │ │ ├── vec_env │ │ │ │ ├── __init__.py │ │ │ │ ├── dummy_vec_env.py │ │ │ │ ├── shmem_vec_env.py │ │ │ │ ├── subproc_vec_env.py │ │ │ │ ├── test_vec_env.py │ │ │ │ ├── test_video_recorder.py │ │ │ │ ├── util.py │ │ │ │ ├── vec_env.py │ │ │ │ ├── vec_frame_stack.py │ │ │ │ ├── vec_monitor.py │ │ │ │ ├── vec_normalize.py │ │ │ │ ├── vec_remove_dict_obs.py │ │ │ │ └── vec_video_recorder.py │ │ │ └── wrappers.py │ │ ├── ddpg │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── ddpg.py │ │ │ ├── ddpg_learner.py │ │ │ ├── memory.py │ │ │ ├── models.py │ │ │ ├── noise.py │ │ │ └── test_smoke.py │ │ ├── deepq │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── build_graph.py │ │ │ ├── deepq.py │ │ │ ├── defaults.py │ │ │ ├── experiments │ │ │ │ ├── __init__.py │ │ │ │ ├── custom_cartpole.py │ │ │ │ ├── enjoy_cartpole.py │ │ │ │ ├── enjoy_mountaincar.py │ │ │ │ ├── enjoy_pong.py │ │ │ │ ├── train_cartpole.py │ │ │ │ ├── train_mountaincar.py │ │ │ │ └── train_pong.py │ │ │ ├── models.py │ │ │ ├── replay_buffer.py │ │ │ └── utils.py │ │ ├── gail │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── adversary.py │ │ │ ├── behavior_clone.py │ │ │ ├── dataset │ │ │ │ ├── __init__.py │ │ │ │ └── mujoco_dset.py │ │ │ ├── gail-eval.py │ │ │ ├── mlp_policy.py │ │ │ ├── result │ │ │ │ ├── HalfCheetah-normalized-deterministic-scores.png │ │ │ │ ├── HalfCheetah-normalized-stochastic-scores.png │ │ │ │ ├── HalfCheetah-unnormalized-deterministic-scores.png │ │ │ │ ├── HalfCheetah-unnormalized-stochastic-scores.png │ │ │ │ ├── Hopper-normalized-deterministic-scores.png │ │ │ │ ├── Hopper-normalized-stochastic-scores.png │ │ │ │ ├── Hopper-unnormalized-deterministic-scores.png │ │ │ │ ├── Hopper-unnormalized-stochastic-scores.png │ │ │ │ ├── Humanoid-normalized-deterministic-scores.png │ │ │ │ ├── Humanoid-normalized-stochastic-scores.png │ │ │ │ ├── Humanoid-unnormalized-deterministic-scores.png │ │ │ │ ├── Humanoid-unnormalized-stochastic-scores.png │ │ │ │ ├── HumanoidStandup-normalized-deterministic-scores.png │ │ │ │ ├── HumanoidStandup-normalized-stochastic-scores.png │ │ │ │ ├── HumanoidStandup-unnormalized-deterministic-scores.png │ │ │ │ ├── HumanoidStandup-unnormalized-stochastic-scores.png │ │ │ │ ├── Walker2d-normalized-deterministic-scores.png │ │ │ │ ├── Walker2d-normalized-stochastic-scores.png │ │ │ │ ├── Walker2d-unnormalized-deterministic-scores.png │ │ │ │ ├── Walker2d-unnormalized-stochastic-scores.png │ │ │ │ ├── gail-result.md │ │ │ │ ├── halfcheetah-training.png │ │ │ │ ├── hopper-training.png │ │ │ │ ├── humanoid-training.png │ │ │ │ ├── humanoidstandup-training.png │ │ │ │ └── walker2d-training.png │ │ │ ├── run_mujoco.py │ │ │ ├── statistics.py │ │ │ └── trpo_mpi.py │ │ ├── her │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── actor_critic.py │ │ │ 
├── ddpg.py │ │ │ ├── experiment │ │ │ │ ├── __init__.py │ │ │ │ ├── config.py │ │ │ │ ├── data_generation │ │ │ │ │ └── fetch_data_generation.py │ │ │ │ ├── play.py │ │ │ │ └── plot.py │ │ │ ├── her.py │ │ │ ├── her_sampler.py │ │ │ ├── normalizer.py │ │ │ ├── replay_buffer.py │ │ │ ├── rollout.py │ │ │ └── util.py │ │ ├── logger.py │ │ ├── ppo1 │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── cnn_policy.py │ │ │ ├── mlp_policy.py │ │ │ ├── pposgd_simple.py │ │ │ ├── run_atari.py │ │ │ ├── run_humanoid.py │ │ │ ├── run_mujoco.py │ │ │ └── run_robotics.py │ │ ├── ppo2 │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── defaults.py │ │ │ ├── microbatched_model.py │ │ │ ├── model.py │ │ │ ├── ppo2.py │ │ │ ├── runner.py │ │ │ └── test_microbatches.py │ │ ├── results_plotter.py │ │ ├── run.py │ │ └── trpo_mpi │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── defaults.py │ │ │ └── trpo_mpi.py │ ├── benchmarks_atari10M.htm │ ├── benchmarks_mujoco1M.htm │ ├── setup.cfg │ └── setup.py ├── enjoy.py ├── evaluation.py ├── gail_experts │ ├── README.md │ └── convert_to_pytorch.py ├── generate_tmux_yaml.py ├── main.py ├── requirements.txt ├── run_all.yaml ├── setup.py └── visualize.ipynb ├── requirements.txt ├── run_agent.py ├── scripts ├── __pycache__ │ └── analyze_synthesized_programs.cpython-37.pyc ├── analyze_synthesized_programs.py ├── compare_experiments.py ├── learn_program_distance.py └── manually_evaluate_program.py ├── search_program_experiments.py ├── search_programs.py ├── simulate_search.py ├── test_synthesized_programs.py └── test_synthesized_programs_experiments.py /README.md: -------------------------------------------------------------------------------- 1 | # Meta-Learning Curiosity Algorithms 2 | This is the code for "Meta-Learning Curiosity Algorithms" by [Ferran Alet](http://alet-etal.com/)\*, [Martin Schneider](https://github.com/mfranzs)\*, [Tomas Lozano-Perez](https://people.csail.mit.edu/tlp/), and [Leslie Kaelbling](https://people.csail.mit.edu/lpk/). Published at ICLR 2020 (and previously in Meta-Learning and Reinforcement Learning Workshops at NeurIPS 2019). 3 | 4 | See the paper [here](https://openreview.net/pdf?id=BygdyxHFDS). 5 | 6 | ## Overview of Running an Experiment 7 | 1. Specify your operations in operations.py. 8 | 2. Specify a list of operations to use in operations_list.py. 9 | 3. Run program_synthesis.py to synthesize programs with your list of operations. 10 | 4. Specify an experiment in test_synthesized_programs_experiments.py (see the sketch below). 11 | 5. Run test_synthesized_programs.py to search over your program space. 12 | 6. Use scripts/analyze_synthesized_programs.py to analyze your results.
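A minimal sketch of what step 4 can look like is shown below. The `TspParams` class name and its fields are hypothetical and invented only for illustration (the real definitions live in test_synthesized_programs_experiments.py); the registration pattern follows helpers/experiment_params.py, and `--experiment_id` is the flag required by the shared argparser in helpers/config.py.

```python
# Hypothetical experiment definition (step 4). Field names are placeholders;
# see test_synthesized_programs_experiments.py for the real parameter class.
from dataclasses import dataclass
from mlca.helpers.experiment_params import ExperimentParameters, ExperimentParameterList

@dataclass
class TspParams(ExperimentParameters):  # hypothetical parameter class
    ENVIRONMENT: str    # e.g. which gridworld to evaluate candidate programs on
    NUM_ROLLOUTS: int   # e.g. rollouts per candidate program

TspExperimentList = ExperimentParameterList()
TspExperimentList["my-first-search"] = TspParams(
    ENVIRONMENT="gridworld",
    NUM_ROLLOUTS=10,
)
```

An entry point that uses the shared argparser then picks up these parameters from the command line, e.g. `python test_synthesized_programs.py --experiment_id my-first-search`.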
13 | 14 | ## Code Overview 15 | **datastructures.py**: The datastructures manipulated by program operations.\ 16 | **executor.py**: Executes a Program object.\ 17 | **find_duplicate_programs.py**: Takes a list of programs and finds / prunes duplicates by testing each program on a fake environment and looking at the output signature.\ 18 | **gridworld_environments.py**: Our gridworld environments.\ 19 | **internal_rewards.py**: The module that runs intrinsic curiosity programs and reward combiner programs.\ 20 | **operations_list.py**: A configuration file that specifies the operations that can appear in different program classes.\ 21 | **operations.py**: The operations that are composed to create a program.\ 22 | **predict_performance.py**: The regressor that predicts program performance from its features.\ 23 | **predict_performance_experiments.py**: A configuration file for experimenting with performance regressors.\ 24 | **program.py**: The core abstraction of a program, represented by a DAG of operations.\ 25 | **program_synthesis.py**: The search module that synthesizes programs.\ 26 | **program_types.py**: The types that operations in our language can output.\ 27 | **run_agent.py**: The module that runs an agent in an environment.\ 28 | **search_programs.py**: The module that searches over a program space, given a list of programs, an environment, and a program selection metric.\ 29 | **search_program_experiments.py**: A configuration file for simulating program searches.\ 30 | **simulate_search.py**: A module that simulates searching through programs.\ 31 | **test_synthesized_programs.py**: The module that takes a set of synthesized programs and initiates a search over them.\ 32 | **test_synthesized_programs_experiments.py**: The configuration file for testing / searching over programs.\ 33 | -------------------------------------------------------------------------------- /VAE_CVAE_MNIST_mod/README.md: -------------------------------------------------------------------------------- 1 | # Variational Autoencoder & Conditional Variational Autoencoder on MNIST 2 | 3 | VAE paper: [Auto-Encoding Variational Bayes](https://arxiv.org/abs/1312.6114) 4 | 5 | CVAE paper: [Learning Structured Output Representation using Deep Conditional Generative Models](https://papers.nips.cc/paper/5775-learning-structured-output-representation-using-deep-conditional-generative-models) 6 | 7 | --- 8 | In order to run the _conditional_ variational autoencoder, add `--conditional` to the command. Check out the other command-line options in the code for hyperparameter settings (like learning rate, batch size, encoder/decoder layer depth and size). 9 | 10 | --- 11 | 12 | ## Results 13 | 14 | All plots obtained after 10 epochs of training. Hyperparameters according to default settings in the code; not tuned. 15 | 16 | ### z ~ q(z|x) and q(z|x,c) 17 | The modeled latent distribution after 10 epochs and 100 samples per digit. 18 | 19 | VAE | CVAE 20 | --- | --- 21 | | 22 | 23 | ### p(x|z) and p(x|z,c) 24 | Randomly sampled z, and their output. For CVAE, each c has been given as input once.
25 | 26 | VAE | CVAE 27 | --- | --- 28 | | 29 | -------------------------------------------------------------------------------- /VAE_CVAE_MNIST_mod/__pycache__/models.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/__pycache__/models.cpython-37.pyc -------------------------------------------------------------------------------- /VAE_CVAE_MNIST_mod/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /VAE_CVAE_MNIST_mod/figs/1519649452.702026/E9-Dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/figs/1519649452.702026/E9-Dist.png -------------------------------------------------------------------------------- /VAE_CVAE_MNIST_mod/figs/1519649452.702026/E9I937.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/figs/1519649452.702026/E9I937.png -------------------------------------------------------------------------------- /VAE_CVAE_MNIST_mod/figs/1519649461.195146/E9-Dist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/figs/1519649461.195146/E9-Dist.png -------------------------------------------------------------------------------- /VAE_CVAE_MNIST_mod/figs/1519649461.195146/E9I937.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/VAE_CVAE_MNIST_mod/figs/1519649461.195146/E9I937.png -------------------------------------------------------------------------------- /VAE_CVAE_MNIST_mod/requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | kiwisolver==1.0.1 3 | matplotlib==3.0.2 4 | numpy==1.16.1 5 | pandas==0.24.1 6 | Pillow==5.4.1 7 | pyparsing==2.3.1 8 | python-dateutil==2.8.0 9 | pytz==2018.9 10 | scipy==1.2.1 11 | seaborn==0.9.0 12 | six==1.12.0 13 | torch==1.0.1.post2 14 | torchvision==0.2.1 15 | -------------------------------------------------------------------------------- /VAE_CVAE_MNIST_mod/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def idx2onehot(idx, n): 5 | 6 | assert torch.max(idx).item() < n 7 | if idx.dim() == 1: 8 | idx = idx.unsqueeze(1) 9 | 10 | onehot = torch.zeros(idx.size(0), n) 11 | onehot.scatter_(1, idx, 1) 12 | 13 | return onehot 14 | -------------------------------------------------------------------------------- /__pycache__/datastructures.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/datastructures.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/executor.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/executor.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/find_duplicate_programs.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/find_duplicate_programs.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/gridworld_environments.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/gridworld_environments.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/internal_rewards.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/internal_rewards.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/learn_program_distance_experiments.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/learn_program_distance_experiments.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/operations.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/operations.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/operations_list.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/operations_list.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/predict_performance.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/predict_performance.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/predict_performance_experiments.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/predict_performance_experiments.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/program.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/program.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/program_types.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/program_types.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/run_agent.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/run_agent.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/search_program_experiments.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/search_program_experiments.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/search_programs.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/search_programs.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/simulate_search.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/simulate_search.cpython-37.pyc -------------------------------------------------------------------------------- /__pycache__/test_synthesized_programs_experiments.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/__pycache__/test_synthesized_programs_experiments.cpython-37.pyc -------------------------------------------------------------------------------- /datastructures.py: -------------------------------------------------------------------------------- 1 | """ 2 | The datastructures manipulated by the operations in operations.py 3 | """ 4 | 5 | from torch import nn 6 | 7 | import collections 8 | import numpy as np 9 | 10 | from mlca.helpers.nn import SimpleConvNet, MLP 11 | 12 | CHW = collections.namedtuple('CHW', ('channels', 'height', 'width')) 13 | 14 | class CNNModule(nn.Module): 15 | def __init__(self, environment): 16 | super().__init__() 17 | 18 | observation_space = environment.observation_space.shape 19 | image_size = CHW( 20 | observation_space[2], 21 | observation_space[0], 22 | observation_space[1]) 23 | 24 | 
self.conv = SimpleConvNet(image_size.channels, 1, [], [3], {"USE_BATCH_NORM": True}) 25 | self.mlp = MLP( 26 | self.conv.output_size( 27 | (image_size.width, image_size.height)), 32, [32, 32] 28 | ) 29 | 30 | def forward(self, x): 31 | # x = x.permute(0, 3, 1, 2) 32 | x = self.conv(x) 33 | x = x.flatten(start_dim=1) 34 | x = self.mlp(x) 35 | return x 36 | 37 | class ObservationMLPModule(nn.Module): 38 | def __init__(self, environment): 39 | super().__init__() 40 | 41 | self.mlp = MLP( 42 | np.prod(environment.observation_space.shape), 43 | 32, [16, 32, 64]) 44 | 45 | def forward(self, x): 46 | x = x.flatten(start_dim=1) 47 | x = self.mlp(x) 48 | return x 49 | 50 | class Ensemble(nn.Module): 51 | def __init__(self, modules, environment): 52 | super().__init__() 53 | 54 | self.module_list = nn.ModuleList(modules) 55 | 56 | def forward(self, x): 57 | return [module(x) for module in self.module_list] 58 | -------------------------------------------------------------------------------- /diversity/__pycache__/density_peaks.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/diversity/__pycache__/density_peaks.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__init__.py -------------------------------------------------------------------------------- /helpers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/__pycache__/debug.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/debug.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/__pycache__/experiment_params.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/experiment_params.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/__pycache__/nn.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/nn.cpython-37.pyc -------------------------------------------------------------------------------- 
/helpers/__pycache__/plotting.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/plotting.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/__pycache__/probability.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/probability.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/__pycache__/torch_knn.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/torch_knn.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/__pycache__/util.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/__pycache__/util.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/config.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | import pprint 4 | from typing import List 5 | 6 | def argparser(): 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--experiment_id", required=True, 9 | help="id of the experiment we want to run; auto-selects the parameters") 10 | parser.add_argument("--render", action="store_true", 11 | help="render the environment", default=False) 12 | parser.add_argument("--dont_train", action="store_true", 13 | help="don't train the model", default=False) 14 | parser.add_argument("--dont_save", action="store_true", 15 | help="don't save the model", default=False) 16 | parser.add_argument("--dont_load", action="store_true", 17 | help="don't load the model", default=False) 18 | parser.add_argument("--cpu", action="store_true", default=False) 19 | parser.add_argument("--profiler", action="store_true", 20 | help="print profiler data", default=False) 21 | return parser 22 | 23 | 24 | def clean_experiment_id(experiment_id): 25 | # Only get the experiment_id before --version 26 | return experiment_id.split("--version")[0] 27 | 28 | 29 | def get_params(experiments, experiment_id, print_params=True, recursing=False): 30 | print("WARNING: get_params is deprecated") 31 | experiment_id = clean_experiment_id(experiment_id) 32 | if experiment_id not in experiments: 33 | raise RuntimeWarning("The experiment ID "+experiment_id + 34 | " does not exist! 
Valid ids: " + str(experiments.keys())) 35 | 36 | e = experiments[experiment_id] 37 | e["__EXPERIMENT_ID__"] = experiment_id 38 | 39 | if "__PARENT__" in e: 40 | parent = get_params(experiments, e["__PARENT__"], print_params, True) 41 | e = {**parent, **e} 42 | 43 | if not recursing and print_params: 44 | print("Parameters: ", experiment_id) 45 | pprint.pprint(e) 46 | 47 | assert e.get("BUGGED", None) is None, e.get("BUGGED") 48 | 49 | return e 50 | 51 | def get_device_and_set_default(args = None): 52 | use_cuda = torch.cuda.is_available() and not (args is not None and args.cpu) 53 | 54 | device = "cuda" if use_cuda else "cpu" 55 | print("Device", device, " - Cuda available?", torch.cuda.is_available()) 56 | if device == "cuda": 57 | torch.set_default_tensor_type(torch.cuda.FloatTensor) 58 | else: 59 | torch.set_default_tensor_type(torch.FloatTensor) 60 | 61 | return device 62 | 63 | class DefaultDevice: 64 | active_default_parameters: List[str] = [] 65 | 66 | def __init__(self, default_device): 67 | self.default_device = default_device 68 | 69 | def __enter__(self): 70 | self.active_default_parameters.append(self.default_device) 71 | return self 72 | 73 | def __exit__(self, type, value, traceback): 74 | self.active_default_parameters.pop() 75 | 76 | @classmethod 77 | def current(cls): 78 | if len(cls.active_default_parameters) == 0: 79 | raise RuntimeError(f"No current global default device set. Use a with statement to set the default device.") 80 | return cls.active_default_parameters[-1] 81 | -------------------------------------------------------------------------------- /helpers/datastructures.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from mlca.helpers.nn import SimpleConvNet, MLP 4 | 5 | class Ensemble(nn.Module): 6 | def __init__(self, environment): 7 | pass 8 | 9 | def forward(self, x): 10 | x = self.conv(x) 11 | x = self.mlp(x) 12 | return x 13 | -------------------------------------------------------------------------------- /helpers/experiment_params.py: -------------------------------------------------------------------------------- 1 | """ 2 | A small library that represents experiment parameters using 3 | Python Dataclasses. Supports creating lists of experiment parameters, 4 | chaining experiment paramters, and registering a current globally active 5 | parameter instance for a given paramter class (using a Python context). 6 | """ 7 | 8 | import dataclasses 9 | from collections import OrderedDict 10 | import pprint 11 | from typing import List, Optional 12 | 13 | ExperimentId = str 14 | 15 | class ExperimentParameters(): 16 | active_parameters: Optional[List] = None 17 | 18 | @classmethod 19 | def current(cls): 20 | if cls.active_parameters is None or len(cls.active_parameters) == 0: 21 | raise RuntimeError(f"No current global parameters set for {cls.__name__}. 
Use a with statement to set the current context.") 22 | return cls.active_parameters[-1] 23 | 24 | @classmethod 25 | def _set_active_parameters(cls, active_parameters): 26 | # Make the active parameter list when needed so we have a different one 27 | # for every parameters class 28 | if cls.active_parameters is None: 29 | cls.active_parameters = [] 30 | 31 | cls.active_parameters.append(active_parameters) 32 | 33 | @classmethod 34 | def _clear_active_parameters(cls): 35 | cls.active_parameters.pop() 36 | 37 | def register_experiment_id(self, _experiment_id: ExperimentId): 38 | assert not hasattr(self, '_experiment_id'), "Warning: You cannot use the reserved field _experiment_id in your experiment params." 39 | self._experiment_id = _experiment_id 40 | 41 | def replace(self, **kwargs): 42 | return dataclasses.replace(self, **kwargs) 43 | 44 | def __enter__(self): 45 | self._set_active_parameters(self) 46 | return self 47 | 48 | def __exit__(self, type, value, traceback): 49 | self._clear_active_parameters() 50 | 51 | class ExperimentParameterList(dict): 52 | def __setitem__(self, key: ExperimentId, item: ExperimentParameters): 53 | assert key not in self.__dict__, f"The experiment {key} has already been registered." 54 | self.__dict__[key] = item 55 | item.register_experiment_id(key) 56 | 57 | def get(self, key: ExperimentId, print_params=True): 58 | if print_params: 59 | print("Get experiments", key) 60 | print(pprint.pformat(self.__dict__[key])) 61 | return self[key] 62 | 63 | def __getitem__(self, key: ExperimentId): 64 | return self.__dict__[key] 65 | -------------------------------------------------------------------------------- /helpers/lists.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import operator 3 | 4 | def flatten(a): 5 | return functools.reduce(operator.concat, a) 6 | -------------------------------------------------------------------------------- /helpers/probability.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # Modified from 4 | # https://introcs.cs.princeton.edu/python/22module/gaussian.py.html 5 | 6 | BOUNDED_INFINITY = 10. 
7 | def pdf(orig_x, mu=0.0, sigma=1.0): 8 | sigma = sigma.astype(np.float) 9 | orig_x = orig_x.astype(np.float) 10 | x = (orig_x - mu) / sigma 11 | a = np.exp(-x*x/2.0) / np.sqrt(2.0*np.pi) / sigma 12 | return np.where(np.logical_or(np.logical_or(np.isinf(a), np.isnan(a)), sigma == 0), 13 | np.where(orig_x == mu, np.ones_like(mu) * BOUNDED_INFINITY, np.zeros_like(mu)), 14 | a 15 | ) 16 | # return np.divide(a, sigma, out=np.isclose(orig_x, mu).astype(np.float) * BOUNDED_INFINITY, where=sigma!=0) 17 | 18 | def cdf(z, mu=0.0, sigma=1.0): 19 | z = (z - mu) / sigma 20 | if z < -8.0: return 0.0 21 | if z > +8.0: return 1.0 22 | total = 0.0 23 | term = z 24 | i = 3 25 | while total != total + term: 26 | total += term 27 | term *= z * z / i 28 | i += 2 29 | return 0.5 + total * pdf(z) 30 | -------------------------------------------------------------------------------- /helpers/statistics/__pycache__/welfords_std.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/helpers/statistics/__pycache__/welfords_std.cpython-37.pyc -------------------------------------------------------------------------------- /helpers/statistics/welfords_std.py: -------------------------------------------------------------------------------- 1 | """ 2 | SOURCE: https://gist.github.com/alexalemi/2151722 3 | """ 4 | 5 | import math 6 | import numpy as np 7 | 8 | class Welford(object): 9 | """ Implements Welford's algorithm for computing a running mean 10 | and standard deviation as described at: 11 | http://www.johndcook.com/standard_deviation.html 12 | 13 | can take single values or iterables 14 | 15 | Properties: 16 | mean - returns the mean 17 | std - returns the std 18 | meanfull- returns the mean and std of the mean 19 | 20 | Usage: 21 | >>> foo = Welford() 22 | >>> foo(range(100)) 23 | >>> foo 24 | 25 | >>> foo([1]*1000) 26 | >>> foo 27 | 28 | >>> foo.mean 29 | 5.409090909090906 30 | >>> foo.std 31 | 16.44374171455467 32 | >>> foo.meanfull 33 | (5.409090909090906, 0.4957974674244838) 34 | """ 35 | 36 | def __init__(self, lst=None): 37 | self.k = 0 38 | self.M = 0 39 | self.S = 0 40 | 41 | self.__call__(lst) 42 | 43 | def update(self, x): 44 | if x is None: 45 | return 46 | self.k += 1 47 | newM = self.M + (x - self.M)*1./self.k 48 | newS = self.S + (x - self.M)*(x - newM) 49 | self.M, self.S = newM, newS 50 | 51 | def consume(self, lst): 52 | lst = iter(lst) 53 | for x in lst: 54 | self.update(x) 55 | 56 | def __call__(self, x): 57 | if hasattr(x, "__iter__"): 58 | self.consume(x) 59 | else: 60 | self.update(x) 61 | 62 | @property 63 | def mean(self): 64 | return self.M 65 | 66 | @property 67 | def meanfull(self): 68 | return self.mean, self.std/math.sqrt(self.k) 69 | 70 | @property 71 | def std(self): 72 | if self.k == 1: 73 | return 0 74 | return math.sqrt(self.S/(self.k-1)) 75 | 76 | def __repr__(self): 77 | return "<Welford: {} +- {}>".format(self.mean, self.std) 78 | 79 | if __name__ == "__main__": 80 | w = Welford() 81 | arr = np.random.random(100) 82 | w.consume(arr) 83 | print(w.mean, w.std) 84 | print(arr.mean(), arr.std()) -------------------------------------------------------------------------------- /helpers/task_queue.py: -------------------------------------------------------------------------------- 1 | """ 2 | Based On: 3 | https://medium.com/@shashwat_ds/a-tiny-multi-threaded-job-queue-in-30-lines-of-python-a344c3f3f7f0 4 | WARNING: Use this, not the source.
Source had bugs. 5 | """ 6 | 7 | from threading import Thread 8 | from queue import Queue 9 | import time 10 | 11 | class TaskQueue(Queue): 12 | 13 | def __init__(self, num_workers=1): 14 | super().__init__() 15 | self.num_workers = num_workers 16 | self.start_workers() 17 | 18 | def add_task(self, task, *args, **kwargs): 19 | args = args or () 20 | kwargs = kwargs or {} 21 | self.put((task, args, kwargs)) 22 | 23 | def start_workers(self): 24 | for i in range(self.num_workers): 25 | t = Thread(target=self.worker, args=[i]) 26 | t.daemon = True 27 | t.start() 28 | 29 | def worker(self, worker_id): 30 | while True: 31 | tupl = self.get() 32 | # print("Worker", tupl) 33 | item, args, kwargs = tupl 34 | kwargs["worker_id"] = worker_id 35 | item(*args, **kwargs) 36 | self.task_done() 37 | 38 | 39 | def tests(): 40 | def t(*args, **kwargs): 41 | time.sleep(1) 42 | print(args) 43 | 44 | q = TaskQueue(num_workers=3) 45 | 46 | for item in range(10): 47 | q.add_task(t, item) 48 | 49 | q.join() # block until all tasks are done 50 | 51 | if __name__ == "__main__": 52 | tests() 53 | -------------------------------------------------------------------------------- /helpers/torch_knn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | import torch 4 | from mlca.helpers.config import DefaultDevice 5 | 6 | # Tested in mlca/curiosity/program_synthesis/scripts/misc/test_knn_speed.py 7 | 8 | class TorchKNN: 9 | def __init__(self, buffer_size, feature_size, num_neighbors): 10 | self.buffer = torch.zeros(buffer_size, feature_size, device=DefaultDevice.current()) 11 | self.nearest_neighbors = torch.zeros(num_neighbors, feature_size, device=DefaultDevice.current()) 12 | 13 | self.buffer_size = buffer_size 14 | self.feature_size = feature_size 15 | self.num_neighbors = num_neighbors 16 | 17 | self.num_points = 0 18 | self.buffer_pos = 0 19 | 20 | def add(self, x): 21 | assert x.shape[0] == self.feature_size 22 | 23 | self.buffer[self.buffer_pos] = x 24 | 25 | self.num_points += 1 26 | self.buffer_pos += 1 27 | if self.buffer_pos >= self.buffer_size: 28 | self.buffer_pos = 0 29 | 30 | def predict(self, x): 31 | if self.num_points == 0: 32 | return torch.rand(self.feature_size) 33 | else: 34 | distances = torch.norm( 35 | self.buffer[:min(self.num_points, self.buffer_size)] - x, dim=1) 36 | 37 | _, indices = torch.topk( 38 | distances, min(self.num_neighbors, self.num_points), 39 | largest=False, sorted=False) 40 | nearest = self.buffer[indices] 41 | prediction = torch.mean(nearest, dim=0) 42 | 43 | assert prediction.shape == (self.feature_size, ) 44 | return prediction 45 | 46 | 47 | class TorchKNNRegressor: 48 | def __init__(self, buffer_size, feature_size, num_neighbors): 49 | self.query_buffer = torch.zeros(buffer_size, feature_size) 50 | self.target_buffer = torch.zeros(buffer_size, feature_size) 51 | self.nearest_neighbors = torch.zeros(num_neighbors, feature_size) 52 | 53 | self.buffer_size = buffer_size 54 | self.feature_size = feature_size 55 | self.num_neighbors = num_neighbors 56 | 57 | self.num_points = 0 58 | self.buffer_pos = 0 59 | 60 | def add(self, query, target): 61 | assert query.shape[0] == self.feature_size 62 | assert target.shape[0] == self.feature_size 63 | 64 | self.query_buffer[self.buffer_pos] = query 65 | self.target_buffer[self.buffer_pos] = target 66 | 67 | self.num_points += 1 68 | self.buffer_pos += 1 69 | if self.buffer_pos >= self.buffer_size: 70 | self.buffer_pos = 0 71 | 72 | def predict(self, x): 73 | if 
self.num_points == 0: 74 | return torch.rand(self.feature_size) 75 | else: 76 | distances = torch.norm( 77 | self.query_buffer[:min(self.num_points, self.buffer_size)] - x, dim=1) 78 | 79 | _, indices = torch.topk( 80 | distances, min(self.num_neighbors, self.num_points), 81 | largest=False, sorted=False) 82 | nearest = self.target_buffer[indices] 83 | prediction = torch.mean(nearest, dim=0) 84 | 85 | assert prediction.shape == (self.feature_size, ) 86 | return prediction 87 | -------------------------------------------------------------------------------- /helpers/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import random 4 | 5 | def set_random_seed(random_seed): 6 | np.random.seed(random_seed) 7 | random.seed(random_seed) 8 | torch.manual_seed(random_seed) 9 | torch.cuda.manual_seed(random_seed) 10 | torch.cuda.manual_seed_all(random_seed) 11 | -------------------------------------------------------------------------------- /learn_program_distance_experiments.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | from mlca.helpers.experiment_params import ExperimentParameters, ExperimentParameterList 3 | from enum import Enum 4 | from typing import Optional, List, Any 5 | from dataclasses import dataclass 6 | 7 | @dataclass 8 | class ProgramDistanceParams(ExperimentParameters): 9 | FEATURE_PAIRS: int 10 | TASK: str 11 | MODEL: str 12 | FEATURE_INPUT_OUTPUT: bool 13 | TEST_SYNTHESIZED_PROGRAMS_EXP_NAME: Optional[str] 14 | NEIGHBORS: Optional[int] = None 15 | 16 | ProgramDistanceExperimentList = ExperimentParameterList() 17 | 18 | ProgramDistanceExperimentList["mlp"] = ProgramDistanceParams( 19 | FEATURE_PAIRS = 1, 20 | NEIGHBORS = 10, 21 | TASK = "TEST_SINGLE", 22 | MODEL = "MLP", 23 | FEATURE_INPUT_OUTPUT = False, 24 | TEST_SYNTHESIZED_PROGRAMS_EXP_NAME="2-96_program-correlation-5", 25 | ) 26 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | trained_models/ 104 | .fuse_hidden* 105 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Ilya Kostrikov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/algo/__init__.py: -------------------------------------------------------------------------------- 1 | from .a2c_acktr import A2C_ACKTR 2 | from .ppo import PPO -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/algo/a2c_acktr.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | 5 | from a2c_ppo_acktr.algo.kfac import KFACOptimizer 6 | 7 | 8 | class A2C_ACKTR(): 9 | def __init__(self, 10 | actor_critic, 11 | value_loss_coef, 12 | entropy_coef, 13 | lr=None, 14 | eps=None, 15 | alpha=None, 16 | max_grad_norm=None, 17 | acktr=False): 18 | 19 | self.actor_critic = actor_critic 20 | self.acktr = acktr 21 | 22 | self.value_loss_coef = value_loss_coef 23 | self.entropy_coef = entropy_coef 24 | 25 | self.max_grad_norm = max_grad_norm 26 | 27 | if acktr: 28 | self.optimizer = KFACOptimizer(actor_critic) 29 | else: 30 | self.optimizer = optim.RMSprop( 31 | actor_critic.parameters(), lr, eps=eps, alpha=alpha) 32 | 33 | def update(self, rollouts): 34 | obs_shape = rollouts.obs.size()[2:] 35 | action_shape = rollouts.actions.size()[-1] 36 | num_steps, num_processes, _ = rollouts.rewards.size() 37 | 38 | values, action_log_probs, dist_entropy, _ = self.actor_critic.evaluate_actions( 39 | rollouts.obs[:-1].view(-1, *obs_shape), 40 | rollouts.recurrent_hidden_states[0].view( 41 | -1, self.actor_critic.recurrent_hidden_state_size), 42 | rollouts.masks[:-1].view(-1, 1), 43 | rollouts.actions.view(-1, action_shape)) 44 | 45 | values = values.view(num_steps, num_processes, 1) 46 | action_log_probs = action_log_probs.view(num_steps, num_processes, 1) 47 | 48 | advantages = rollouts.returns[:-1] - values 49 | value_loss = advantages.pow(2).mean() 50 | 51 | action_loss = -(advantages.detach() * action_log_probs).mean() 52 | 53 | if self.acktr and self.optimizer.steps % self.optimizer.Ts == 0: 54 | # Sampled fisher, see Martens 2014 55 | self.actor_critic.zero_grad() 56 | pg_fisher_loss = -action_log_probs.mean() 57 | 58 | value_noise = torch.randn(values.size()) 59 | if values.is_cuda: 60 | value_noise = value_noise.cuda() 61 | 62 | sample_values = values + value_noise 63 | vf_fisher_loss = -(values - sample_values.detach()).pow(2).mean() 64 | 65 | fisher_loss = pg_fisher_loss + vf_fisher_loss 66 | self.optimizer.acc_stats = True 67 | fisher_loss.backward(retain_graph=True) 68 | self.optimizer.acc_stats = False 69 | 70 | self.optimizer.zero_grad() 71 | (value_loss * self.value_loss_coef + action_loss - 72 | dist_entropy * self.entropy_coef).backward() 73 | 74 | if self.acktr == False: 75 | nn.utils.clip_grad_norm_(self.actor_critic.parameters(), 76 | self.max_grad_norm) 77 | 78 | self.optimizer.step() 79 | 80 | return value_loss.item(), action_loss.item(), dist_entropy.item() 81 | -------------------------------------------------------------------------------- 
/pytorch-a2c-ppo-acktr-gail_modified/a2c_ppo_acktr/utils.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from a2c_ppo_acktr.envs import VecNormalize 8 | 9 | 10 | # Get a render function 11 | def get_render_func(venv): 12 | if hasattr(venv, 'envs'): 13 | return venv.envs[0].render 14 | elif hasattr(venv, 'venv'): 15 | return get_render_func(venv.venv) 16 | elif hasattr(venv, 'env'): 17 | return get_render_func(venv.env) 18 | 19 | return None 20 | 21 | 22 | def get_vec_normalize(venv): 23 | if isinstance(venv, VecNormalize): 24 | return venv 25 | elif hasattr(venv, 'venv'): 26 | return get_vec_normalize(venv.venv) 27 | 28 | return None 29 | 30 | 31 | # Necessary for my KFAC implementation. 32 | class AddBias(nn.Module): 33 | def __init__(self, bias): 34 | super(AddBias, self).__init__() 35 | self._bias = nn.Parameter(bias.unsqueeze(1)) 36 | 37 | def forward(self, x): 38 | if x.dim() == 2: 39 | bias = self._bias.t().view(1, -1) 40 | else: 41 | bias = self._bias.t().view(1, -1, 1, 1) 42 | 43 | return x + bias 44 | 45 | 46 | def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): 47 | """Decreases the learning rate linearly""" 48 | lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) 49 | for param_group in optimizer.param_groups: 50 | param_group['lr'] = lr 51 | 52 | 53 | def init(module, weight_init, bias_init, gain=1): 54 | weight_init(module.weight.data, gain=gain) 55 | bias_init(module.bias.data) 56 | return module 57 | 58 | 59 | def cleanup_log_dir(log_dir): 60 | try: 61 | os.makedirs(log_dir) 62 | except OSError: 63 | files = glob.glob(os.path.join(log_dir, '*.monitor.csv')) 64 | for f in files: 65 | os.remove(f) 66 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/.benchmark_pattern: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.pkl 4 | *.py~ 5 | .pytest_cache 6 | .DS_Store 7 | .idea 8 | 9 | # Setuptools distribution and build folders. 10 | /dist/ 11 | /build 12 | keys/ 13 | 14 | # Virtualenv 15 | /env 16 | 17 | 18 | *.sublime-project 19 | *.sublime-workspace 20 | 21 | .idea 22 | 23 | logs/ 24 | 25 | .ipynb_checkpoints 26 | ghostdriver.log 27 | 28 | htmlcov 29 | 30 | junk 31 | src 32 | 33 | *.egg-info 34 | .cache 35 | 36 | MUJOCO_LOG.TXT 37 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | services: 6 | - docker 7 | 8 | install: 9 | - pip install flake8 10 | - docker build . -t baselines-test 11 | 12 | script: 13 | - flake8 . --show-source --statistics 14 | - docker run -e RUNSLOW=1 baselines-test pytest -v . 
15 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | 3 | RUN apt-get -y update && apt-get -y install ffmpeg 4 | # RUN apt-get -y update && apt-get -y install git wget python-dev python3-dev libopenmpi-dev python-pip zlib1g-dev cmake python-opencv 5 | 6 | ENV CODE_DIR /root/code 7 | 8 | COPY . $CODE_DIR/baselines 9 | WORKDIR $CODE_DIR/baselines 10 | 11 | # Clean up pycache and pyc files 12 | RUN rm -rf __pycache__ && \ 13 | find . -name "*.pyc" -delete && \ 14 | pip install tensorflow && \ 15 | pip install -e .[test] 16 | 17 | 18 | CMD /bin/bash 19 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/a2c/README.md: -------------------------------------------------------------------------------- 1 | # A2C 2 | 3 | - Original paper: https://arxiv.org/abs/1602.01783 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.run --alg=a2c --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options 6 | - also refer to the repo-wide [README.md](../../README.md#training-models) 7 | 8 | ## Files 9 | - `run_atari`: file used to run the algorithm. 10 | - `policies.py`: contains the different versions of the A2C architecture (MlpPolicy, CNNPolicy, LstmPolicy...). 
11 | - `a2c.py`: - Model : class used to initialize the step_model (sampling) and train_model (training) 12 | - learn : Main entrypoint for A2C algorithm. Train a policy with given network architecture on a given environment using a2c algorithm. 13 | - `runner.py`: class used to generates a batch of experiences 14 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/a2c/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/a2c/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/a2c/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from baselines.a2c.utils import discount_with_dones 3 | from baselines.common.runners import AbstractEnvRunner 4 | 5 | class Runner(AbstractEnvRunner): 6 | """ 7 | We use this class to generate batches of experiences 8 | 9 | __init__: 10 | - Initialize the runner 11 | 12 | run(): 13 | - Make a mini batch of experiences 14 | """ 15 | def __init__(self, env, model, nsteps=5, gamma=0.99): 16 | super().__init__(env=env, model=model, nsteps=nsteps) 17 | self.gamma = gamma 18 | self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()] 19 | self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype 20 | 21 | def run(self): 22 | # We initialize the lists that will contain the mb of experiences 23 | mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] 24 | mb_states = self.states 25 | epinfos = [] 26 | for n in range(self.nsteps): 27 | # Given observations, take action and value (V(s)) 28 | # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init 29 | actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones) 30 | 31 | # Append the experiences 32 | mb_obs.append(np.copy(self.obs)) 33 | mb_actions.append(actions) 34 | mb_values.append(values) 35 | mb_dones.append(self.dones) 36 | 37 | # Take actions in env and look the results 38 | obs, rewards, dones, infos = self.env.step(actions) 39 | for info in infos: 40 | maybeepinfo = info.get('episode') 41 | if maybeepinfo: epinfos.append(maybeepinfo) 42 | self.states = states 43 | self.dones = dones 44 | self.obs = obs 45 | mb_rewards.append(rewards) 46 | mb_dones.append(self.dones) 47 | 48 | # Batch of steps to batch of rollouts 49 | mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape) 50 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 51 | mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) 52 | mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) 53 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 54 | mb_masks = mb_dones[:, :-1] 55 | mb_dones = mb_dones[:, 1:] 56 | 57 | 58 | if self.gamma > 0.0: 59 | # Discount/bootstrap off value fn 60 | last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist() 61 | for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): 62 | rewards = rewards.tolist() 63 | dones = dones.tolist() 64 | if dones[-1] == 0: 
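# Non-terminal rollout (this branch): the critic's value of the last observation is
# appended and discounted through, bootstrapping the n-step return; e.g. with no dones,
#   R_t = r_t + gamma*r_{t+1} + ... + gamma**(nsteps-1)*r_{t+nsteps-1} + gamma**nsteps * V(s_last)
# (illustrative formula). If the rollout ended with done, the observed rewards are
# simply discounted, respecting the done flags.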
65 | rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] 66 | else: 67 | rewards = discount_with_dones(rewards, dones, self.gamma) 68 | 69 | mb_rewards[n] = rewards 70 | 71 | mb_actions = mb_actions.reshape(self.batch_action_shape) 72 | 73 | mb_rewards = mb_rewards.flatten() 74 | mb_values = mb_values.flatten() 75 | mb_masks = mb_masks.flatten() 76 | return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values, epinfos 77 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/README.md: -------------------------------------------------------------------------------- 1 | # ACER 2 | 3 | - Original paper: https://arxiv.org/abs/1611.01224 4 | - `python -m baselines.run --alg=acer --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 5 | - also refer to the repo-wide [README.md](../../README.md#training-models) 6 | 7 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/defaults.py: -------------------------------------------------------------------------------- 1 | def atari(): 2 | return dict( 3 | lrschedule='constant' 4 | ) 5 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/policies.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from baselines.common.policies import nature_cnn 4 | from baselines.a2c.utils import fc, batch_to_seq, seq_to_batch, lstm, sample 5 | 6 | 7 | class AcerCnnPolicy(object): 8 | 9 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False): 10 | nbatch = nenv * nsteps 11 | nh, nw, nc = ob_space.shape 12 | ob_shape = (nbatch, nh, nw, nc * nstack) 13 | nact = ac_space.n 14 | X = tf.placeholder(tf.uint8, ob_shape) # obs 15 | with tf.variable_scope("model", reuse=reuse): 16 | h = nature_cnn(X) 17 | pi_logits = fc(h, 'pi', nact, init_scale=0.01) 18 | pi = tf.nn.softmax(pi_logits) 19 | q = fc(h, 'q', nact) 20 | 21 | a = sample(tf.nn.softmax(pi_logits)) # could change this to use self.pi instead 22 | self.initial_state = [] # not stateful 23 | self.X = X 24 | self.pi = pi # actual policy params now 25 | self.pi_logits = pi_logits 26 | self.q = q 27 | self.vf = q 28 | 29 | def step(ob, *args, **kwargs): 30 | # returns actions, mus, states 31 | a0, pi0 = sess.run([a, pi], {X: ob}) 32 | return a0, pi0, [] # dummy state 33 | 34 | def out(ob, *args, **kwargs): 35 | pi0, q0 = sess.run([pi, q], {X: ob}) 36 | return pi0, q0 37 | 38 | def act(ob, *args, **kwargs): 39 | return sess.run(a, {X: ob}) 40 | 41 | self.step = step 42 | self.out = out 43 | self.act = act 44 | 45 | class AcerLstmPolicy(object): 46 | 47 | def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False, nlstm=256): 48 | nbatch = nenv * nsteps 49 | 
nh, nw, nc = ob_space.shape 50 | ob_shape = (nbatch, nh, nw, nc * nstack) 51 | nact = ac_space.n 52 | X = tf.placeholder(tf.uint8, ob_shape) # obs 53 | M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) 54 | S = tf.placeholder(tf.float32, [nenv, nlstm*2]) #states 55 | with tf.variable_scope("model", reuse=reuse): 56 | h = nature_cnn(X) 57 | 58 | # lstm 59 | xs = batch_to_seq(h, nenv, nsteps) 60 | ms = batch_to_seq(M, nenv, nsteps) 61 | h5, snew = lstm(xs, ms, S, 'lstm1', nh=nlstm) 62 | h5 = seq_to_batch(h5) 63 | 64 | pi_logits = fc(h5, 'pi', nact, init_scale=0.01) 65 | pi = tf.nn.softmax(pi_logits) 66 | q = fc(h5, 'q', nact) 67 | 68 | a = sample(pi_logits) # could change this to use self.pi instead 69 | self.initial_state = np.zeros((nenv, nlstm*2), dtype=np.float32) 70 | self.X = X 71 | self.M = M 72 | self.S = S 73 | self.pi = pi # actual policy params now 74 | self.q = q 75 | 76 | def step(ob, state, mask, *args, **kwargs): 77 | # returns actions, mus, states 78 | a0, pi0, s = sess.run([a, pi, snew], {X: ob, S: state, M: mask}) 79 | return a0, pi0, s 80 | 81 | self.step = step 82 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acer/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from baselines.common.runners import AbstractEnvRunner 3 | from baselines.common.vec_env.vec_frame_stack import VecFrameStack 4 | from gym import spaces 5 | 6 | 7 | class Runner(AbstractEnvRunner): 8 | 9 | def __init__(self, env, model, nsteps): 10 | super().__init__(env=env, model=model, nsteps=nsteps) 11 | assert isinstance(env.action_space, spaces.Discrete), 'This ACER implementation works only with discrete action spaces!' 
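# The VecFrameStack requirement below exists because run() reads env.stackedobs and
# env.nstack directly to rebuild the per-frame encoded observations (enc_obs).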
12 | assert isinstance(env, VecFrameStack) 13 | 14 | self.nact = env.action_space.n 15 | nenv = self.nenv 16 | self.nbatch = nenv * nsteps 17 | self.batch_ob_shape = (nenv*(nsteps+1),) + env.observation_space.shape 18 | 19 | self.obs = env.reset() 20 | self.obs_dtype = env.observation_space.dtype 21 | self.ac_dtype = env.action_space.dtype 22 | self.nstack = self.env.nstack 23 | self.nc = self.batch_ob_shape[-1] // self.nstack 24 | 25 | 26 | def run(self): 27 | # enc_obs = np.split(self.obs, self.nstack, axis=3) # so now list of obs steps 28 | enc_obs = np.split(self.env.stackedobs, self.env.nstack, axis=-1) 29 | mb_obs, mb_actions, mb_mus, mb_dones, mb_rewards = [], [], [], [], [] 30 | for _ in range(self.nsteps): 31 | actions, mus, states = self.model._step(self.obs, S=self.states, M=self.dones) 32 | mb_obs.append(np.copy(self.obs)) 33 | mb_actions.append(actions) 34 | mb_mus.append(mus) 35 | mb_dones.append(self.dones) 36 | obs, rewards, dones, _ = self.env.step(actions) 37 | # states information for statefull models like LSTM 38 | self.states = states 39 | self.dones = dones 40 | self.obs = obs 41 | mb_rewards.append(rewards) 42 | enc_obs.append(obs[..., -self.nc:]) 43 | mb_obs.append(np.copy(self.obs)) 44 | mb_dones.append(self.dones) 45 | 46 | enc_obs = np.asarray(enc_obs, dtype=self.obs_dtype).swapaxes(1, 0) 47 | mb_obs = np.asarray(mb_obs, dtype=self.obs_dtype).swapaxes(1, 0) 48 | mb_actions = np.asarray(mb_actions, dtype=self.ac_dtype).swapaxes(1, 0) 49 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 50 | mb_mus = np.asarray(mb_mus, dtype=np.float32).swapaxes(1, 0) 51 | 52 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 53 | 54 | mb_masks = mb_dones # Used for statefull models like LSTM's to mask state when done 55 | mb_dones = mb_dones[:, 1:] # Used for calculating returns. The dones array is now aligned with rewards 56 | 57 | # shapes are now [nenv, nsteps, []] 58 | # When pulling from buffer, arrays will now be reshaped in place, preventing a deep copy. 59 | 60 | return enc_obs, mb_obs, mb_actions, mb_rewards, mb_mus, mb_dones, mb_masks 61 | 62 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/README.md: -------------------------------------------------------------------------------- 1 | # ACKTR 2 | 3 | - Original paper: https://arxiv.org/abs/1708.05144 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.run --alg=acktr --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 6 | - also refer to the repo-wide [README.md](../../README.md#training-models) 7 | 8 | ## ACKTR with continuous action spaces 9 | The code of ACKTR has been refactored to handle both discrete and continuous action spaces uniformly. In the original version, discrete and continuous action spaces were handled by different code (actkr_disc.py and acktr_cont.py) with little overlap. If interested in the original version of the acktr for continuous action spaces, use `old_acktr_cont` branch. Note that original code performs better on the mujoco tasks than the refactored version; we are still investigating why. 
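For continuous-control environments the refactored code is run through the same entry point, e.g. `python -m baselines.run --alg=acktr --env=HalfCheetah-v2 --num_timesteps=1e6` (the environment name here is only an illustrative MuJoCo task, not one prescribed by this README).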
10 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/defaults.py: -------------------------------------------------------------------------------- 1 | def mujoco(): 2 | return dict( 3 | nsteps=2500, 4 | value_network='copy' 5 | ) 6 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/acktr/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | def dense(x, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): 4 | with tf.variable_scope(name, reuse=reuse): 5 | assert (len(tf.get_variable_scope().name.split('/')) == 2) 6 | 7 | w = tf.get_variable("w", [x.get_shape()[1], size], initializer=weight_init) 8 | b = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) 9 | weight_decay_fc = 3e-4 10 | 11 | if weight_loss_dict is not None: 12 | weight_decay = tf.multiply(tf.nn.l2_loss(w), weight_decay_fc, name='weight_decay_loss') 13 | if weight_loss_dict is not None: 14 | weight_loss_dict[w] = weight_decay_fc 15 | weight_loss_dict[b] = 0.0 16 | 17 | tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay) 18 | 19 | return tf.nn.bias_add(tf.matmul(x, w), b) 20 | 21 | def kl_div(action_dist1, action_dist2, action_size): 22 | mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:] 23 | mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:] 24 | 25 | numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2) 26 | denominator = 2 * tf.square(std2) + 1e-8 27 | return tf.reduce_sum( 28 | numerator/denominator + tf.log(std2) - tf.log(std1),reduction_indices=-1) 29 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.bench.benchmarks import * 3 | from baselines.bench.monitor import * 4 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/bench/test_monitor.py: -------------------------------------------------------------------------------- 1 | from .monitor import Monitor 2 | import gym 3 | import json 4 | 5 | def test_monitor(): 6 | import pandas 7 | import os 8 | import uuid 9 | 10 | env = gym.make("CartPole-v1") 11 | env.seed(0) 12 | mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4() 13 | menv = Monitor(env, mon_file) 14 | menv.reset() 15 | for _ in range(1000): 16 | _, _, done, _ = menv.step(0) 17 | if done: 18 | menv.reset() 19 | 20 | f = open(mon_file, 'rt') 21 | 22 | firstline = f.readline() 23 | assert firstline.startswith('#') 24 | metadata = json.loads(firstline[1:]) 25 | assert metadata['env_id'] == "CartPole-v1" 
26 | assert set(metadata.keys()) == {'env_id', 't_start'}, "Incorrect keys in monitor metadata" 27 | 28 | last_logline = pandas.read_csv(f, index_col=None) 29 | assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" 30 | f.close() 31 | os.remove(mon_file) 32 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.common.console_util import * 3 | from baselines.common.dataset import Dataset 4 | from baselines.common.math_util import * 5 | from baselines.common.misc_util import * 6 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x 35 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | import shlex 6 | import subprocess 7 | 8 | # ================================================================ 9 | # Misc 10 | # ================================================================ 11 | 12 | def fmt_row(width, row, header=False): 13 | out = " | ".join(fmt_item(x, width) for x in row) 14 | if header: out = out + "\n" + "-"*len(out) 15 | return out 16 | 17 | def fmt_item(x, l): 18 | if isinstance(x, np.ndarray): 19 | assert x.ndim==0 20 | x = x.item() 21 | if isinstance(x, (float, np.float32, np.float64)): 22 | v = abs(x) 23 | if (v < 1e-4 or v > 1e+4) and v > 0: 24 | rep = "%7.2e" % x 25 | else: 26 | rep = "%7.5f" % x 27 | else: rep = str(x) 28 | return " "*(l - len(rep)) + rep 29 | 30 | color2num = dict( 31 | gray=30, 32 | red=31, 33 | green=32, 34 | yellow=33, 35 | blue=34, 36 | magenta=35, 37 | cyan=36, 38 | white=37, 39 | crimson=38 40 | ) 41 | 42 | def colorize(string, color='green', bold=False, highlight=False): 43 | attr = [] 44 | num = color2num[color] 45 | if highlight: num += 10 46 | attr.append(str(num)) 47 | if bold: attr.append('1') 48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 49 | 50 | def print_cmd(cmd, dry=False): 51 | if isinstance(cmd, str): # for shell=True 52 | pass 53 | else: 54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd) 55 
| print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd)) 56 | 57 | 58 | def get_git_commit(cwd=None): 59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], cwd=cwd).decode('utf8') 60 | 61 | def get_git_commit_message(cwd=None): 62 | return subprocess.check_output(['git', 'show', '-s', '--format=%B', 'HEAD'], cwd=cwd).decode('utf8') 63 | 64 | def ccap(cmd, dry=False, env=None, **kwargs): 65 | print_cmd(cmd, dry) 66 | if not dry: 67 | subprocess.check_call(cmd, env=env, **kwargs) 68 | 69 | 70 | MESSAGE_DEPTH = 0 71 | 72 | @contextmanager 73 | def timed(msg): 74 | global MESSAGE_DEPTH #pylint: disable=W0603 75 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 76 | tstart = time.time() 77 | MESSAGE_DEPTH += 1 78 | yield 79 | MESSAGE_DEPTH -= 1 80 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 81 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/input.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import 
Discrete, Box, MultiDiscrete 4 | 5 | def observation_placeholder(ob_space, batch_size=None, name='Ob'): 6 | ''' 7 | Create placeholder to feed observations into of the size appropriate to the observation space 8 | 9 | Parameters: 10 | ---------- 11 | 12 | ob_space: gym.Space observation space 13 | 14 | batch_size: int size of the batch to be fed into input. Can be left None in most cases. 15 | 16 | name: str name of the placeholder 17 | 18 | Returns: 19 | ------- 20 | 21 | tensorflow placeholder tensor 22 | ''' 23 | 24 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \ 25 | 'Can only deal with Discrete and Box observation spaces for now' 26 | 27 | dtype = ob_space.dtype 28 | if dtype == np.int8: 29 | dtype = np.uint8 30 | 31 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name) 32 | 33 | 34 | def observation_input(ob_space, batch_size=None, name='Ob'): 35 | ''' 36 | Create placeholder to feed observations into of the size appropriate to the observation space, and add input 37 | encoder of the appropriate type. 38 | ''' 39 | 40 | placeholder = observation_placeholder(ob_space, batch_size, name) 41 | return placeholder, encode_observation(ob_space, placeholder) 42 | 43 | def encode_observation(ob_space, placeholder): 44 | ''' 45 | Encode input in the way that is appropriate to the observation space 46 | 47 | Parameters: 48 | ---------- 49 | 50 | ob_space: gym.Space observation space 51 | 52 | placeholder: tf.placeholder observation input placeholder 53 | ''' 54 | if isinstance(ob_space, Discrete): 55 | return tf.to_float(tf.one_hot(placeholder, ob_space.n)) 56 | elif isinstance(ob_space, Box): 57 | return tf.to_float(placeholder) 58 | elif isinstance(ob_space, MultiDiscrete): 59 | placeholder = tf.cast(placeholder, tf.int32) 60 | one_hots = [tf.to_float(tf.one_hot(placeholder[..., i], ob_space.nvec[i])) for i in range(placeholder.shape[-1])] 61 | return tf.concat(one_hots, axis=-1) 62 | else: 63 | raise NotImplementedError 64 | 65 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 
28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) 86 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | globalsum = np.zeros_like(localsum) 16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 18 | 19 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 20 | x = np.asarray(x) 21 | assert x.ndim > 0 22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 23 | sqdiffs = np.square(x - mean) 24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, 
comm=comm, keepdims=True) 25 | assert count1 == count 26 | std = np.sqrt(meansqdiff) 27 | if not keepdims: 28 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 29 | mean = mean.reshape(newshape) 30 | std = std.reshape(newshape) 31 | return mean, std, count 32 | 33 | 34 | def test_runningmeanstd(): 35 | import subprocess 36 | subprocess.check_call(['mpirun', '-np', '3', 37 | 'python','-c', 38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 39 | 40 | def _helper_runningmeanstd(): 41 | comm = MPI.COMM_WORLD 42 | np.random.seed(0) 43 | for (triple,axis) in [ 44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 47 | ]: 48 | 49 | 50 | x = np.concatenate(triple, axis=axis) 51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 52 | 53 | 54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 55 | 56 | for (a1,a2) in zipsame(ms1, ms2): 57 | print(a1, a2) 58 | assert np.allclose(a1, a2) 59 | print("ok!") 60 | 61 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | 20 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/test_mpi_util.py: -------------------------------------------------------------------------------- 1 | from baselines.common import mpi_util 2 | from baselines import logger 3 | from baselines.common.tests.test_with_mpi import with_mpi 4 | try: 5 | from mpi4py import MPI 6 | except ImportError: 7 | MPI = None 8 | 9 | @with_mpi() 10 | def test_mpi_weighted_mean(): 11 | comm = MPI.COMM_WORLD 12 | with logger.scoped_configure(comm=comm): 13 | if comm.rank == 0: 14 | name2valcount = {'a' : (10, 2), 'b' : (20,3)} 15 | elif comm.rank == 1: 16 | name2valcount = {'a' : (19, 1), 'c' : (42,3)} 17 | else: 18 | raise NotImplementedError 19 | d = mpi_util.mpi_weighted_mean(comm, name2valcount) 20 | correctval = {'a' : (10 * 2 + 19) / 3.0, 'b' : 20, 'c' : 42} 21 | if comm.rank == 0: 22 | assert d == correctval, '{} != {}'.format(d, correctval) 23 | 24 | for name, (val, count) in name2valcount.items(): 25 | for _ in range(count): 26 | logger.logkv_mean(name, val) 27 | d2 = logger.dumpkvs() 28 | if comm.rank == 0: 29 | assert d2 == correctval 30 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import os, pytest 2 | 
mark_slow = pytest.mark.skipif(not os.getenv('RUNSLOW'), reason='slow') -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/fixed_sequence_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Env 3 | from gym.spaces import Discrete 4 | 5 | 6 | class FixedSequenceEnv(Env): 7 | def __init__( 8 | self, 9 | n_actions=10, 10 | episode_len=100 11 | ): 12 | self.action_space = Discrete(n_actions) 13 | self.observation_space = Discrete(1) 14 | self.np_random = np.random.RandomState(0) 15 | self.episode_len = episode_len 16 | self.sequence = [self.np_random.randint(0, self.action_space.n) 17 | for _ in range(self.episode_len)] 18 | self.time = 0 19 | 20 | 21 | def reset(self): 22 | self.time = 0 23 | return 0 24 | 25 | def step(self, actions): 26 | rew = self._get_reward(actions) 27 | self._choose_next_state() 28 | done = False 29 | if self.episode_len and self.time >= self.episode_len: 30 | done = True 31 | 32 | return 0, rew, done, {} 33 | 34 | def seed(self, seed=None): 35 | self.np_random.seed(seed) 36 | 37 | def _choose_next_state(self): 38 | self.time += 1 39 | 40 | def _get_reward(self, actions): 41 | return 1 if actions == self.sequence[self.time] else 0 42 | 43 | 44 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import abstractmethod 3 | from gym import Env 4 | from gym.spaces import MultiDiscrete, Discrete, Box 5 | from collections import deque 6 | 7 | class IdentityEnv(Env): 8 | def __init__( 9 | self, 10 | episode_len=None, 11 | delay=0, 12 | zero_first_rewards=True 13 | ): 14 | 15 | self.observation_space = self.action_space 16 | self.episode_len = episode_len 17 | self.time = 0 18 | self.delay = delay 19 | self.zero_first_rewards = zero_first_rewards 20 | self.q = deque(maxlen=delay+1) 21 | 22 | def reset(self): 23 | self.q.clear() 24 | for _ in range(self.delay + 1): 25 | self.q.append(self.action_space.sample()) 26 | self.time = 0 27 | 28 | return self.q[-1] 29 | 30 | def step(self, actions): 31 | rew = self._get_reward(self.q.popleft(), actions) 32 | if self.zero_first_rewards and self.time < self.delay: 33 | rew = 0 34 | self.q.append(self.action_space.sample()) 35 | self.time += 1 36 | done = self.episode_len is not None and self.time >= self.episode_len 37 | return self.q[-1], rew, done, {} 38 | 39 | def seed(self, seed=None): 40 | self.action_space.seed(seed) 41 | 42 | @abstractmethod 43 | def _get_reward(self, state, actions): 44 | raise NotImplementedError 45 | 46 | 47 | class DiscreteIdentityEnv(IdentityEnv): 48 | def __init__( 49 | self, 50 | dim, 51 | episode_len=None, 52 | delay=0, 53 | zero_first_rewards=True 54 | ): 55 | 56 | self.action_space = Discrete(dim) 57 | 
super().__init__(episode_len=episode_len, delay=delay, zero_first_rewards=zero_first_rewards) 58 | 59 | def _get_reward(self, state, actions): 60 | return 1 if state == actions else 0 61 | 62 | class MultiDiscreteIdentityEnv(IdentityEnv): 63 | def __init__( 64 | self, 65 | dims, 66 | episode_len=None, 67 | delay=0, 68 | ): 69 | 70 | self.action_space = MultiDiscrete(dims) 71 | super().__init__(episode_len=episode_len, delay=delay) 72 | 73 | def _get_reward(self, state, actions): 74 | return 1 if all(state == actions) else 0 75 | 76 | 77 | class BoxIdentityEnv(IdentityEnv): 78 | def __init__( 79 | self, 80 | shape, 81 | episode_len=None, 82 | ): 83 | 84 | self.action_space = Box(low=-1.0, high=1.0, shape=shape, dtype=np.float32) 85 | super().__init__(episode_len=episode_len) 86 | 87 | def _get_reward(self, state, actions): 88 | diff = actions - state 89 | diff = diff[:] 90 | return -0.5 * np.dot(diff, diff) 91 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/identity_env_test.py: -------------------------------------------------------------------------------- 1 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv 2 | 3 | 4 | def test_discrete_nodelay(): 5 | nsteps = 100 6 | eplen = 50 7 | env = DiscreteIdentityEnv(10, episode_len=eplen) 8 | ob = env.reset() 9 | for t in range(nsteps): 10 | action = env.action_space.sample() 11 | next_ob, rew, done, info = env.step(action) 12 | assert rew == (1 if action == ob else 0) 13 | if (t + 1) % eplen == 0: 14 | assert done 15 | next_ob = env.reset() 16 | else: 17 | assert not done 18 | ob = next_ob 19 | 20 | def test_discrete_delay1(): 21 | eplen = 50 22 | env = DiscreteIdentityEnv(10, episode_len=eplen, delay=1) 23 | ob = env.reset() 24 | prev_ob = None 25 | for t in range(eplen): 26 | action = env.action_space.sample() 27 | next_ob, rew, done, info = env.step(action) 28 | if t > 0: 29 | assert rew == (1 if action == prev_ob else 0) 30 | else: 31 | assert rew == 0 32 | prev_ob = ob 33 | ob = next_ob 34 | if t < eplen - 1: 35 | assert not done 36 | assert done 37 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/envs/mnist_env.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import numpy as np 3 | import tempfile 4 | from gym import Env 5 | from gym.spaces import Discrete, Box 6 | 7 | 8 | 9 | class MnistEnv(Env): 10 | def __init__( 11 | self, 12 | episode_len=None, 13 | no_images=None 14 | ): 15 | import filelock 16 | from tensorflow.examples.tutorials.mnist import input_data 17 | # we could use temporary directory for this with a context manager and 18 | # TemporaryDirecotry, but then each test that uses mnist would re-download the data 19 | # this way the data is not cleaned up, but we only download it once per machine 20 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') 21 | with filelock.FileLock(mnist_path + '.lock'): 22 | self.mnist = input_data.read_data_sets(mnist_path) 23 | 24 | self.np_random = np.random.RandomState() 25 | 26 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) 27 | self.action_space = Discrete(10) 28 | self.episode_len = episode_len 29 | self.time = 0 30 | self.no_images = no_images 31 | 32 | self.train_mode() 33 | self.reset() 34 | 35 | def reset(self): 36 | self._choose_next_state() 
37 | self.time = 0 38 | 39 | return self.state[0] 40 | 41 | def step(self, actions): 42 | rew = self._get_reward(actions) 43 | self._choose_next_state() 44 | done = False 45 | if self.episode_len and self.time >= self.episode_len: 46 | rew = 0 47 | done = True 48 | 49 | return self.state[0], rew, done, {} 50 | 51 | def seed(self, seed=None): 52 | self.np_random.seed(seed) 53 | 54 | def train_mode(self): 55 | self.dataset = self.mnist.train 56 | 57 | def test_mode(self): 58 | self.dataset = self.mnist.test 59 | 60 | def _choose_next_state(self): 61 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 62 | index = self.np_random.randint(0, max_index) 63 | image = self.dataset.images[index].reshape(28,28,1)*255 64 | label = self.dataset.labels[index] 65 | self.state = (image, label) 66 | self.time += 1 67 | 68 | def _get_reward(self, actions): 69 | return 1 if self.state[1] == actions else 0 70 | 71 | 72 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_cartpole.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | from baselines.common.tests import mark_slow 7 | 8 | common_kwargs = dict( 9 | total_timesteps=30000, 10 | network='mlp', 11 | gamma=1.0, 12 | seed=0, 13 | ) 14 | 15 | learn_kwargs = { 16 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), 17 | 'acer': dict(value_network='copy'), 18 | 'acktr': dict(nsteps=32, value_network='copy', is_async=False), 19 | 'deepq': dict(total_timesteps=20000), 20 | 'ppo2': dict(value_network='copy'), 21 | 'trpo_mpi': {} 22 | } 23 | 24 | @mark_slow 25 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 26 | def test_cartpole(alg): 27 | ''' 28 | Test if the algorithm (with an mlp policy) 29 | can learn to balance the cartpole 30 | ''' 31 | 32 | kwargs = common_kwargs.copy() 33 | kwargs.update(learn_kwargs[alg]) 34 | 35 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 36 | def env_fn(): 37 | 38 | env = gym.make('CartPole-v0') 39 | env.seed(0) 40 | return env 41 | 42 | reward_per_episode_test(env_fn, learn_fn, 100) 43 | 44 | if __name__ == '__main__': 45 | test_cartpole('acer') 46 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_doc_examples.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | try: 3 | import mujoco_py 4 | _mujoco_present = True 5 | except BaseException: 6 | mujoco_py = None 7 | _mujoco_present = False 8 | 9 | 10 | @pytest.mark.skipif( 11 | not _mujoco_present, 12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library' 13 | ) 14 | def test_lstm_example(): 15 | import tensorflow as tf 16 | from baselines.common import policies, models, cmd_util 17 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 18 | 19 | # create vectorized environment 20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)]) 21 | 22 | with tf.Session() as sess: 23 | # build policy based on lstm network with 128 units 24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1) 25 | 26 | # initialize tensorflow 
variables 27 | sess.run(tf.global_variables_initializer()) 28 | 29 | # prepare environment variables 30 | ob = venv.reset() 31 | state = policy.initial_state 32 | done = [False] 33 | step_counter = 0 34 | 35 | # run a single episode until the end (i.e. until done) 36 | while True: 37 | action, _, state, _ = policy.step(ob, S=state, M=done) 38 | ob, reward, done, _ = venv.step(action) 39 | step_counter += 1 40 | if done: 41 | break 42 | 43 | 44 | assert step_counter > 5 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_env_after_learn.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | import tensorflow as tf 4 | 5 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 6 | from baselines.run import get_learn_function 7 | from baselines.common.tf_util import make_session 8 | 9 | algos = ['a2c', 'acer', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] 10 | 11 | @pytest.mark.parametrize('algo', algos) 12 | def test_env_after_learn(algo): 13 | def make_env(): 14 | # acktr requires too much RAM, fails on travis 15 | env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4') 16 | return env 17 | 18 | make_session(make_default=True, graph=tf.Graph()) 19 | env = SubprocVecEnv([make_env]) 20 | 21 | learn = get_learn_function(algo) 22 | 23 | # Commenting out the following line resolves the issue, though crash happens at env.reset(). 24 | learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None) 25 | 26 | env.reset() 27 | env.close() 28 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_fetchreach.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | from baselines.common.tests import mark_slow 7 | 8 | pytest.importorskip('mujoco_py') 9 | 10 | common_kwargs = dict( 11 | network='mlp', 12 | seed=0, 13 | ) 14 | 15 | learn_kwargs = { 16 | 'her': dict(total_timesteps=2000) 17 | } 18 | 19 | @mark_slow 20 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 21 | def test_fetchreach(alg): 22 | ''' 23 | Test if the algorithm (with an mlp policy) 24 | can learn the FetchReach task 25 | ''' 26 | 27 | kwargs = common_kwargs.copy() 28 | kwargs.update(learn_kwargs[alg]) 29 | 30 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 31 | def env_fn(): 32 | 33 | env = gym.make('FetchReach-v1') 34 | env.seed(0) 35 | return env 36 | 37 | reward_per_episode_test(env_fn, learn_fn, -15) 38 | 39 | if __name__ == '__main__': 40 | test_fetchreach('her') 41 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_fixed_sequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv 3 | 4 | from baselines.common.tests.util import simple_test 5 | from baselines.run import get_learn_function 6 | from baselines.common.tests import mark_slow 7 | 8 | 9 | common_kwargs = dict( 10 | seed=0, 11 | total_timesteps=50000, 12 | ) 13 | 14 | learn_kwargs 
= { 15 | 'a2c': {}, 16 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), 17 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) 18 | # github issue: https://github.com/openai/baselines/issues/188 19 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) 20 | } 21 | 22 | 23 | alg_list = learn_kwargs.keys() 24 | rnn_list = ['lstm'] 25 | 26 | @mark_slow 27 | @pytest.mark.parametrize("alg", alg_list) 28 | @pytest.mark.parametrize("rnn", rnn_list) 29 | def test_fixed_sequence(alg, rnn): 30 | ''' 31 | Test if the algorithm (with a given policy) 32 | can learn an identity transformation (i.e. return observation as an action) 33 | ''' 34 | 35 | kwargs = learn_kwargs[alg] 36 | kwargs.update(common_kwargs) 37 | 38 | env_fn = lambda: FixedSequenceEnv(n_actions=10, episode_len=5) 39 | learn = lambda e: get_learn_function(alg)( 40 | env=e, 41 | network=rnn, 42 | **kwargs 43 | ) 44 | 45 | simple_test(env_fn, learn, 0.7) 46 | 47 | 48 | if __name__ == '__main__': 49 | test_fixed_sequence('ppo2', 'lstm') 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv 3 | from baselines.run import get_learn_function 4 | from baselines.common.tests.util import simple_test 5 | from baselines.common.tests import mark_slow 6 | 7 | common_kwargs = dict( 8 | total_timesteps=30000, 9 | network='mlp', 10 | gamma=0.9, 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'a2c' : {}, 16 | 'acktr': {}, 17 | 'deepq': {}, 18 | 'ddpg': dict(layer_norm=True), 19 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), 20 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) 21 | } 22 | 23 | 24 | algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] 25 | algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi'] 26 | algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi'] 27 | 28 | @mark_slow 29 | @pytest.mark.parametrize("alg", algos_disc) 30 | def test_discrete_identity(alg): 31 | ''' 32 | Test if the algorithm (with an mlp policy) 33 | can learn an identity transformation (i.e. return observation as an action) 34 | ''' 35 | 36 | kwargs = learn_kwargs[alg] 37 | kwargs.update(common_kwargs) 38 | 39 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 40 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) 41 | simple_test(env_fn, learn_fn, 0.9) 42 | 43 | @mark_slow 44 | @pytest.mark.parametrize("alg", algos_multidisc) 45 | def test_multidiscrete_identity(alg): 46 | ''' 47 | Test if the algorithm (with an mlp policy) 48 | can learn an identity transformation (i.e. return observation as an action) 49 | ''' 50 | 51 | kwargs = learn_kwargs[alg] 52 | kwargs.update(common_kwargs) 53 | 54 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 55 | env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100) 56 | simple_test(env_fn, learn_fn, 0.9) 57 | 58 | @mark_slow 59 | @pytest.mark.parametrize("alg", algos_cont) 60 | def test_continuous_identity(alg): 61 | ''' 62 | Test if the algorithm (with an mlp policy) 63 | can learn an identity transformation (i.e. 
return observation as an action) 64 | to a required precision 65 | ''' 66 | 67 | kwargs = learn_kwargs[alg] 68 | kwargs.update(common_kwargs) 69 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 70 | 71 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) 72 | simple_test(env_fn, learn_fn, -0.1) 73 | 74 | if __name__ == '__main__': 75 | test_multidiscrete_identity('acktr') 76 | 77 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_mnist.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # from baselines.acer import acer_simple as acer 4 | from baselines.common.tests.envs.mnist_env import MnistEnv 5 | from baselines.common.tests.util import simple_test 6 | from baselines.run import get_learn_function 7 | from baselines.common.tests import mark_slow 8 | 9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 10 | # GitHub issue https://github.com/openai/baselines/issues/189 11 | common_kwargs = { 12 | 'seed': 0, 13 | 'network':'cnn', 14 | 'gamma':0.9, 15 | 'pad':'SAME' 16 | } 17 | 18 | learn_args = { 19 | 'a2c': dict(total_timesteps=50000), 20 | 'acer': dict(total_timesteps=20000), 21 | 'deepq': dict(total_timesteps=5000), 22 | 'acktr': dict(total_timesteps=30000), 23 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), 24 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) 25 | } 26 | 27 | 28 | #tests pass, but are too slow on travis. Same algorithms are covered 29 | # by other tests with less compute-hungry nn's and by benchmarks 30 | @pytest.mark.skip 31 | @mark_slow 32 | @pytest.mark.parametrize("alg", learn_args.keys()) 33 | def test_mnist(alg): 34 | ''' 35 | Test if the algorithm can learn to classify MNIST digits. 36 | Uses CNN policy. 
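The environment is MnistEnv (defined above): each step presents one randomly drawn digit and the reward is 1 for a correct classification, 0 otherwise.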
37 | ''' 38 | 39 | learn_kwargs = learn_args[alg] 40 | learn_kwargs.update(common_kwargs) 41 | 42 | learn = get_learn_function(alg) 43 | learn_fn = lambda e: learn(env=e, **learn_kwargs) 44 | env_fn = lambda: MnistEnv(episode_len=100) 45 | 46 | simple_test(env_fn, learn_fn, 0.6) 47 | 48 | if __name__ == '__main__': 49 | test_mnist('acer') 50 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_plot_util.py: -------------------------------------------------------------------------------- 1 | # smoke tests of plot_util 2 | from baselines.common import plot_util as pu 3 | from baselines.common.tests.util import smoketest 4 | 5 | 6 | def test_plot_util(): 7 | nruns = 4 8 | logdirs = [smoketest('--alg=ppo2 --env=CartPole-v0 --num_timesteps=10000') for _ in range(nruns)] 9 | data = pu.load_results(logdirs) 10 | assert len(data) == 4 11 | 12 | _, axes = pu.plot_results(data[:1]); assert len(axes) == 1 13 | _, axes = pu.plot_results(data, tiling='vertical'); assert axes.shape==(4,1) 14 | _, axes = pu.plot_results(data, tiling='horizontal'); assert axes.shape==(1,4) 15 | _, axes = pu.plot_results(data, tiling='symmetric'); assert axes.shape==(2,2) 16 | _, axes = pu.plot_results(data, split_fn=lambda _: ''); assert len(axes) == 1 17 | 18 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert 
np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(x=3) == 9 22 | assert lin(2, 2) == 10 23 | assert lin(x=2, y=3) == 12 24 | 25 | 26 | def test_multikwargs(): 27 | with tf.Graph().as_default(): 28 | x = tf.placeholder(tf.int32, (), name="x") 29 | with tf.variable_scope("other"): 30 | x2 = tf.placeholder(tf.int32, (), name="x") 31 | z = 3 * x + 2 * x2 32 | 33 | lin = function([x, x2], z, givens={x2: 0}) 34 | with single_threaded_session(): 35 | initialize() 36 | assert lin(2) == 6 37 | assert lin(2, 2) == 10 38 | 39 | 40 | if __name__ == '__main__': 41 | test_function() 42 | test_multikwargs() 43 | -------------------------------------------------------------------------------- 
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tests/test_with_mpi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import cloudpickle 5 | import base64 6 | import pytest 7 | from functools import wraps 8 | 9 | try: 10 | from mpi4py import MPI 11 | except ImportError: 12 | MPI = None 13 | 14 | def with_mpi(nproc=2, timeout=30, skip_if_no_mpi=True): 15 | def outer_thunk(fn): 16 | @wraps(fn) 17 | def thunk(*args, **kwargs): 18 | serialized_fn = base64.b64encode(cloudpickle.dumps(lambda: fn(*args, **kwargs))) 19 | subprocess.check_call([ 20 | 'mpiexec','-n', str(nproc), 21 | sys.executable, 22 | '-m', 'baselines.common.tests.test_with_mpi', 23 | serialized_fn 24 | ], env=os.environ, timeout=timeout) 25 | 26 | if skip_if_no_mpi: 27 | return pytest.mark.skipif(MPI is None, reason="MPI not present")(thunk) 28 | else: 29 | return thunk 30 | 31 | return outer_thunk 32 | 33 | 34 | if __name__ == '__main__': 35 | if len(sys.argv) > 1: 36 | fn = cloudpickle.loads(base64.b64decode(sys.argv[1])) 37 | assert callable(fn) 38 | fn() 39 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def tile_images(img_nhwc): 4 | """ 5 | Tile N images into one big PxQ image 6 | (P,Q) are chosen to be as close as possible, and if N 7 | is square, then P=Q. 8 | 9 | input: img_nhwc, list or array of images, ndim=4 once turned into array 10 | n = batch index, h = height, w = width, c = channel 11 | returns: 12 | bigim_HWc, ndarray with ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | N, h, w, c = img_nhwc.shape 16 | H = int(np.ceil(np.sqrt(N))) 17 | W = int(np.ceil(float(N)/H)) 18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 22 | return img_Hh_Ww_c 23 | 24 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | from .vec_env import AlreadySteppingError, NotSteppingError, VecEnv, VecEnvWrapper, VecEnvObservationWrapper, CloudpickleWrapper 2 | from .dummy_vec_env import DummyVecEnv 3 | from .shmem_vec_env import ShmemVecEnv 4 | from .subproc_vec_env import SubprocVecEnv 5 | from .vec_frame_stack import VecFrameStack 6 | from .vec_monitor import VecMonitor 7 | from .vec_normalize import VecNormalize 8 | from .vec_remove_dict_obs import VecExtractDictObs 9 | 10 | __all__ = ['AlreadySteppingError', 'NotSteppingError', 'VecEnv', 'VecEnvWrapper', 'VecEnvObservationWrapper', 'CloudpickleWrapper', 'DummyVecEnv', 'ShmemVecEnv', 'SubprocVecEnv', 'VecFrameStack', 'VecMonitor', 'VecNormalize', 'VecExtractDictObs'] 11 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .vec_env import VecEnv 3 | from .util import copy_obs_dict, dict_to_obs, obs_space_info 4 | 5 | 
class DummyVecEnv(VecEnv): 6 | """ 7 | VecEnv that runs multiple environments sequentially, that is, 8 | the step and reset commands are sent to one environment at a time. 9 | Useful when debugging and when num_env == 1 (in the latter case, 10 | avoids communication overhead) 11 | """ 12 | def __init__(self, env_fns): 13 | """ 14 | Arguments: 15 | 16 | env_fns: iterable of callable functions that build environments 17 | """ 18 | self.envs = [fn() for fn in env_fns] 19 | env = self.envs[0] 20 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 21 | obs_space = env.observation_space 22 | self.keys, shapes, dtypes = obs_space_info(obs_space) 23 | 24 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 25 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 26 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 27 | self.buf_infos = [{} for _ in range(self.num_envs)] 28 | self.actions = None 29 | self.spec = self.envs[0].spec 30 | 31 | def step_async(self, actions): 32 | listify = True 33 | try: 34 | if len(actions) == self.num_envs: 35 | listify = False 36 | except TypeError: 37 | pass 38 | 39 | if not listify: 40 | self.actions = actions 41 | else: 42 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) 43 | self.actions = [actions] 44 | 45 | def step_wait(self): 46 | for e in range(self.num_envs): 47 | action = self.actions[e] 48 | # if isinstance(self.envs[e].action_space, spaces.Discrete): 49 | # action = int(action) 50 | 51 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) 52 | if self.buf_dones[e]: 53 | obs = self.envs[e].reset() 54 | self._save_obs(e, obs) 55 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), 56 | self.buf_infos.copy()) 57 | 58 | def reset(self): 59 | for e in range(self.num_envs): 60 | obs = self.envs[e].reset() 61 | self._save_obs(e, obs) 62 | return self._obs_from_buf() 63 | 64 | def _save_obs(self, e, obs): 65 | for k in self.keys: 66 | if k is None: 67 | self.buf_obs[k][e] = obs 68 | else: 69 | self.buf_obs[k][e] = obs[k] 70 | 71 | def _obs_from_buf(self): 72 | return dict_to_obs(copy_obs_dict(self.buf_obs)) 73 | 74 | def get_images(self): 75 | return [env.render(mode='rgb_array') for env in self.envs] 76 | 77 | def render(self, mode='human'): 78 | if self.num_envs == 1: 79 | return self.envs[0].render(mode=mode) 80 | else: 81 | return super().render(mode=mode) 82 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/test_video_recorder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for asynchronous vectorized environments.
3 | """ 4 | 5 | import gym 6 | import pytest 7 | import os 8 | import glob 9 | import tempfile 10 | 11 | from .dummy_vec_env import DummyVecEnv 12 | from .shmem_vec_env import ShmemVecEnv 13 | from .subproc_vec_env import SubprocVecEnv 14 | from .vec_video_recorder import VecVideoRecorder 15 | 16 | @pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv)) 17 | @pytest.mark.parametrize('num_envs', (1, 4)) 18 | @pytest.mark.parametrize('video_length', (10, 100)) 19 | @pytest.mark.parametrize('video_interval', (1, 50)) 20 | def test_video_recorder(klass, num_envs, video_length, video_interval): 21 | """ 22 | Wrap an existing VecEnv with VevVideoRecorder, 23 | Make (video_interval + video_length + 1) steps, 24 | then check that the file is present 25 | """ 26 | 27 | def make_fn(): 28 | env = gym.make('PongNoFrameskip-v4') 29 | return env 30 | fns = [make_fn for _ in range(num_envs)] 31 | env = klass(fns) 32 | 33 | with tempfile.TemporaryDirectory() as video_path: 34 | env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length) 35 | 36 | env.reset() 37 | for _ in range(video_interval + video_length + 1): 38 | env.step([0] * num_envs) 39 | env.close() 40 | 41 | 42 | recorded_video = glob.glob(os.path.join(video_path, "*.mp4")) 43 | 44 | # first and second step 45 | assert len(recorded_video) == 2 46 | # Files are not empty 47 | assert all(os.stat(p).st_size != 0 for p in recorded_video) 48 | 49 | 50 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for dealing with vectorized environments. 3 | """ 4 | 5 | from collections import OrderedDict 6 | 7 | import gym 8 | import numpy as np 9 | 10 | 11 | def copy_obs_dict(obs): 12 | """ 13 | Deep-copy an observation dict. 14 | """ 15 | return {k: np.copy(v) for k, v in obs.items()} 16 | 17 | 18 | def dict_to_obs(obs_dict): 19 | """ 20 | Convert an observation dict into a raw array if the 21 | original observation space was not a Dict space. 22 | """ 23 | if set(obs_dict.keys()) == {None}: 24 | return obs_dict[None] 25 | return obs_dict 26 | 27 | 28 | def obs_space_info(obs_space): 29 | """ 30 | Get dict-structured information about a gym.Space. 31 | 32 | Returns: 33 | A tuple (keys, shapes, dtypes): 34 | keys: a list of dict keys. 35 | shapes: a dict mapping keys to shapes. 36 | dtypes: a dict mapping keys to dtypes. 37 | """ 38 | if isinstance(obs_space, gym.spaces.Dict): 39 | assert isinstance(obs_space.spaces, OrderedDict) 40 | subspaces = obs_space.spaces 41 | elif isinstance(obs_space, gym.spaces.Tuple): 42 | assert isinstance(obs_space.spaces, tuple) 43 | subspaces = {i: obs_space.spaces[i] for i in range(len(obs_space.spaces))} 44 | else: 45 | subspaces = {None: obs_space} 46 | keys = [] 47 | shapes = {} 48 | dtypes = {} 49 | for key, box in subspaces.items(): 50 | keys.append(key) 51 | shapes[key] = box.shape 52 | dtypes[key] = box.dtype 53 | return keys, shapes, dtypes 54 | 55 | 56 | def obs_to_dict(obs): 57 | """ 58 | Convert an observation into a dict. 
59 | """ 60 | if isinstance(obs, dict): 61 | return obs 62 | return {None: obs} 63 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from .vec_env import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | 6 | class VecFrameStack(VecEnvWrapper): 7 | def __init__(self, venv, nstack): 8 | self.venv = venv 9 | self.nstack = nstack 10 | wos = venv.observation_space # wrapped ob space 11 | low = np.repeat(wos.low, self.nstack, axis=-1) 12 | high = np.repeat(wos.high, self.nstack, axis=-1) 13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) 14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 16 | 17 | def step_wait(self): 18 | obs, rews, news, infos = self.venv.step_wait() 19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 20 | for (i, new) in enumerate(news): 21 | if new: 22 | self.stackedobs[i] = 0 23 | self.stackedobs[..., -obs.shape[-1]:] = obs 24 | return self.stackedobs, rews, news, infos 25 | 26 | def reset(self): 27 | obs = self.venv.reset() 28 | self.stackedobs[...] = 0 29 | self.stackedobs[..., -obs.shape[-1]:] = obs 30 | return self.stackedobs 31 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_monitor.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | from baselines.bench.monitor import ResultsWriter 3 | import numpy as np 4 | import time 5 | from collections import deque 6 | 7 | class VecMonitor(VecEnvWrapper): 8 | def __init__(self, venv, filename=None, keep_buf=0, info_keywords=()): 9 | VecEnvWrapper.__init__(self, venv) 10 | self.eprets = None 11 | self.eplens = None 12 | self.epcount = 0 13 | self.tstart = time.time() 14 | if filename: 15 | self.results_writer = ResultsWriter(filename, header={'t_start': self.tstart}, 16 | extra_keys=info_keywords) 17 | else: 18 | self.results_writer = None 19 | self.info_keywords = info_keywords 20 | self.keep_buf = keep_buf 21 | if self.keep_buf: 22 | self.epret_buf = deque([], maxlen=keep_buf) 23 | self.eplen_buf = deque([], maxlen=keep_buf) 24 | 25 | def reset(self): 26 | obs = self.venv.reset() 27 | self.eprets = np.zeros(self.num_envs, 'f') 28 | self.eplens = np.zeros(self.num_envs, 'i') 29 | return obs 30 | 31 | def step_wait(self): 32 | obs, rews, dones, infos = self.venv.step_wait() 33 | self.eprets += rews 34 | self.eplens += 1 35 | 36 | newinfos = list(infos[:]) 37 | for i in range(len(dones)): 38 | if dones[i]: 39 | info = infos[i].copy() 40 | ret = self.eprets[i] 41 | eplen = self.eplens[i] 42 | epinfo = {'r': ret, 'l': eplen, 't': round(time.time() - self.tstart, 6)} 43 | for k in self.info_keywords: 44 | epinfo[k] = info[k] 45 | info['episode'] = epinfo 46 | if self.keep_buf: 47 | self.epret_buf.append(ret) 48 | self.eplen_buf.append(eplen) 49 | self.epcount += 1 50 | self.eprets[i] = 0 51 | self.eplens[i] = 0 52 | if self.results_writer: 53 | self.results_writer.write_row(epinfo) 54 | newinfos[i] = info 55 | return obs, rews, dones, newinfos 56 | -------------------------------------------------------------------------------- 
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | import numpy as np 3 | 4 | class VecNormalize(VecEnvWrapper): 5 | """ 6 | A vectorized wrapper that normalizes the observations 7 | and returns from an environment. 8 | """ 9 | 10 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8, use_tf=False): 11 | VecEnvWrapper.__init__(self, venv) 12 | if use_tf: 13 | from baselines.common.running_mean_std import TfRunningMeanStd 14 | self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms') if ob else None 15 | self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None 16 | else: 17 | from baselines.common.running_mean_std import RunningMeanStd 18 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 19 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 20 | self.clipob = clipob 21 | self.cliprew = cliprew 22 | self.ret = np.zeros(self.num_envs) 23 | self.gamma = gamma 24 | self.epsilon = epsilon 25 | 26 | def step_wait(self): 27 | obs, rews, news, infos = self.venv.step_wait() 28 | 29 | # ADDED: 30 | for rollout in range(len(rews)): 31 | infos[rollout]["raw_reward"] = rews[rollout] 32 | 33 | self.ret = self.ret * self.gamma + rews 34 | obs = self._obfilt(obs) 35 | if self.ret_rms: 36 | self.ret_rms.update(self.ret) 37 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 38 | self.ret[news] = 0. 39 | return obs, rews, news, infos 40 | 41 | def _obfilt(self, obs): 42 | if self.ob_rms: 43 | self.ob_rms.update(obs) 44 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 45 | return obs 46 | else: 47 | return obs 48 | 49 | def reset(self): 50 | self.ret = np.zeros(self.num_envs) 51 | obs = self.venv.reset() 52 | return self._obfilt(obs) 53 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_remove_dict_obs.py: -------------------------------------------------------------------------------- 1 | from .vec_env import VecEnvObservationWrapper 2 | 3 | class VecExtractDictObs(VecEnvObservationWrapper): 4 | def __init__(self, venv, key): 5 | self.key = key 6 | super().__init__(venv=venv, 7 | observation_space=venv.observation_space.spaces[self.key]) 8 | 9 | def process(self, obs): 10 | return obs[self.key] 11 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/vec_env/vec_video_recorder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from baselines import logger 3 | from baselines.common.vec_env import VecEnvWrapper 4 | from gym.wrappers.monitoring import video_recorder 5 | 6 | 7 | class VecVideoRecorder(VecEnvWrapper): 8 | """ 9 | Wrap VecEnv to record rendered image as mp4 video. 10 | """ 11 | 12 | def __init__(self, venv, directory, record_video_trigger, video_length=200): 13 | """ 14 | # Arguments 15 | venv: VecEnv to wrap 16 | directory: Where to save videos 17 | record_video_trigger: 18 | Function that defines when to start recording. 19 | The function takes the current number of step, 20 | and returns whether we should start recording or not. 
21 | video_length: Length of recorded video 22 | """ 23 | 24 | VecEnvWrapper.__init__(self, venv) 25 | self.record_video_trigger = record_video_trigger 26 | self.video_recorder = None 27 | 28 | self.directory = os.path.abspath(directory) 29 | if not os.path.exists(self.directory): os.mkdir(self.directory) 30 | 31 | self.file_prefix = "vecenv" 32 | self.file_infix = '{}'.format(os.getpid()) 33 | self.step_id = 0 34 | self.video_length = video_length 35 | 36 | self.recording = False 37 | self.recorded_frames = 0 38 | 39 | def reset(self): 40 | obs = self.venv.reset() 41 | 42 | self.start_video_recorder() 43 | 44 | return obs 45 | 46 | def start_video_recorder(self): 47 | self.close_video_recorder() 48 | 49 | base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id)) 50 | self.video_recorder = video_recorder.VideoRecorder( 51 | env=self.venv, 52 | base_path=base_path, 53 | metadata={'step_id': self.step_id} 54 | ) 55 | 56 | self.video_recorder.capture_frame() 57 | self.recorded_frames = 1 58 | self.recording = True 59 | 60 | def _video_enabled(self): 61 | return self.record_video_trigger(self.step_id) 62 | 63 | def step_wait(self): 64 | obs, rews, dones, infos = self.venv.step_wait() 65 | 66 | self.step_id += 1 67 | if self.recording: 68 | self.video_recorder.capture_frame() 69 | self.recorded_frames += 1 70 | if self.recorded_frames > self.video_length: 71 | logger.info("Saving video to ", self.video_recorder.path) 72 | self.close_video_recorder() 73 | elif self._video_enabled(): 74 | self.start_video_recorder() 75 | 76 | return obs, rews, dones, infos 77 | 78 | def close_video_recorder(self): 79 | if self.recording: 80 | self.video_recorder.close() 81 | self.recording = False 82 | self.recorded_frames = 0 83 | 84 | def close(self): 85 | VecEnvWrapper.close(self) 86 | self.close_video_recorder() 87 | 88 | def __del__(self): 89 | self.close() 90 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/common/wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | class TimeLimit(gym.Wrapper): 4 | def __init__(self, env, max_episode_steps=None): 5 | super(TimeLimit, self).__init__(env) 6 | self._max_episode_steps = max_episode_steps 7 | self._elapsed_steps = 0 8 | 9 | def step(self, ac): 10 | observation, reward, done, info = self.env.step(ac) 11 | self._elapsed_steps += 1 12 | if self._elapsed_steps >= self._max_episode_steps: 13 | done = True 14 | info['TimeLimit.truncated'] = True 15 | return observation, reward, done, info 16 | 17 | def reset(self, **kwargs): 18 | self._elapsed_steps = 0 19 | return self.env.reset(**kwargs) 20 | 21 | class ClipActionsWrapper(gym.Wrapper): 22 | def step(self, action): 23 | import numpy as np 24 | action = np.nan_to_num(action) 25 | action = np.clip(action, self.action_space.low, self.action_space.high) 26 | return self.env.step(action) 27 | 28 | def reset(self, **kwargs): 29 | return self.env.reset(**kwargs) 30 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/README.md: -------------------------------------------------------------------------------- 1 | # DDPG 2 | 3 | - Original paper: https://arxiv.org/abs/1509.02971 4 | - Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ 5 | - `python -m baselines.run 
--alg=ddpg --env=HalfCheetah-v2 --num_timesteps=1e6` runs the algorithm for 1M frames = 10M timesteps on a Mujoco environment. See help (`-h`) for more options. 6 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RingBuffer(object): 5 | def __init__(self, maxlen, shape, dtype='float32'): 6 | self.maxlen = maxlen 7 | self.start = 0 8 | self.length = 0 9 | self.data = np.zeros((maxlen,) + shape).astype(dtype) 10 | 11 | def __len__(self): 12 | return self.length 13 | 14 | def __getitem__(self, idx): 15 | if idx < 0 or idx >= self.length: 16 | raise KeyError() 17 | return self.data[(self.start + idx) % self.maxlen] 18 | 19 | def get_batch(self, idxs): 20 | return self.data[(self.start + idxs) % self.maxlen] 21 | 22 | def append(self, v): 23 | if self.length < self.maxlen: 24 | # We have space, simply increase the length. 25 | self.length += 1 26 | elif self.length == self.maxlen: 27 | # No space, "remove" the first item. 28 | self.start = (self.start + 1) % self.maxlen 29 | else: 30 | # This should never happen. 31 | raise RuntimeError() 32 | self.data[(self.start + self.length - 1) % self.maxlen] = v 33 | 34 | 35 | def array_min2d(x): 36 | x = np.array(x) 37 | if x.ndim >= 2: 38 | return x 39 | return x.reshape(-1, 1) 40 | 41 | 42 | class Memory(object): 43 | def __init__(self, limit, action_shape, observation_shape): 44 | self.limit = limit 45 | 46 | self.observations0 = RingBuffer(limit, shape=observation_shape) 47 | self.actions = RingBuffer(limit, shape=action_shape) 48 | self.rewards = RingBuffer(limit, shape=(1,)) 49 | self.terminals1 = RingBuffer(limit, shape=(1,)) 50 | self.observations1 = RingBuffer(limit, shape=observation_shape) 51 | 52 | def sample(self, batch_size): 53 | # Draw such that we always have a proceeding element. 
54 | batch_idxs = np.random.randint(self.nb_entries - 2, size=batch_size) 55 | 56 | obs0_batch = self.observations0.get_batch(batch_idxs) 57 | obs1_batch = self.observations1.get_batch(batch_idxs) 58 | action_batch = self.actions.get_batch(batch_idxs) 59 | reward_batch = self.rewards.get_batch(batch_idxs) 60 | terminal1_batch = self.terminals1.get_batch(batch_idxs) 61 | 62 | result = { 63 | 'obs0': array_min2d(obs0_batch), 64 | 'obs1': array_min2d(obs1_batch), 65 | 'rewards': array_min2d(reward_batch), 66 | 'actions': array_min2d(action_batch), 67 | 'terminals1': array_min2d(terminal1_batch), 68 | } 69 | return result 70 | 71 | def append(self, obs0, action, reward, obs1, terminal1, training=True): 72 | if not training: 73 | return 74 | 75 | self.observations0.append(obs0) 76 | self.actions.append(action) 77 | self.rewards.append(reward) 78 | self.observations1.append(obs1) 79 | self.terminals1.append(terminal1) 80 | 81 | @property 82 | def nb_entries(self): 83 | return len(self.observations0) 84 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/models.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from baselines.common.models import get_network_builder 3 | 4 | 5 | class Model(object): 6 | def __init__(self, name, network='mlp', **network_kwargs): 7 | self.name = name 8 | self.network_builder = get_network_builder(network)(**network_kwargs) 9 | 10 | @property 11 | def vars(self): 12 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name) 13 | 14 | @property 15 | def trainable_vars(self): 16 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name) 17 | 18 | @property 19 | def perturbable_vars(self): 20 | return [var for var in self.trainable_vars if 'LayerNorm' not in var.name] 21 | 22 | 23 | class Actor(Model): 24 | def __init__(self, nb_actions, name='actor', network='mlp', **network_kwargs): 25 | super().__init__(name=name, network=network, **network_kwargs) 26 | self.nb_actions = nb_actions 27 | 28 | def __call__(self, obs, reuse=False): 29 | with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE): 30 | x = self.network_builder(obs) 31 | x = tf.layers.dense(x, self.nb_actions, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3)) 32 | x = tf.nn.tanh(x) 33 | return x 34 | 35 | 36 | class Critic(Model): 37 | def __init__(self, name='critic', network='mlp', **network_kwargs): 38 | super().__init__(name=name, network=network, **network_kwargs) 39 | self.layer_norm = True 40 | 41 | def __call__(self, obs, action, reuse=False): 42 | with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE): 43 | x = tf.concat([obs, action], axis=-1) # this assumes observation and action can be concatenated 44 | x = self.network_builder(x) 45 | x = tf.layers.dense(x, 1, kernel_initializer=tf.random_uniform_initializer(minval=-3e-3, maxval=3e-3), name='output') 46 | return x 47 | 48 | @property 49 | def output_vars(self): 50 | output_vars = [var for var in self.trainable_vars if 'output' in var.name] 51 | return output_vars 52 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class AdaptiveParamNoiseSpec(object): 5 | def __init__(self, initial_stddev=0.1, 
desired_action_stddev=0.1, adoption_coefficient=1.01): 6 | self.initial_stddev = initial_stddev 7 | self.desired_action_stddev = desired_action_stddev 8 | self.adoption_coefficient = adoption_coefficient 9 | 10 | self.current_stddev = initial_stddev 11 | 12 | def adapt(self, distance): 13 | if distance > self.desired_action_stddev: 14 | # Decrease stddev. 15 | self.current_stddev /= self.adoption_coefficient 16 | else: 17 | # Increase stddev. 18 | self.current_stddev *= self.adoption_coefficient 19 | 20 | def get_stats(self): 21 | stats = { 22 | 'param_noise_stddev': self.current_stddev, 23 | } 24 | return stats 25 | 26 | def __repr__(self): 27 | fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})' 28 | return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient) 29 | 30 | 31 | class ActionNoise(object): 32 | def reset(self): 33 | pass 34 | 35 | 36 | class NormalActionNoise(ActionNoise): 37 | def __init__(self, mu, sigma): 38 | self.mu = mu 39 | self.sigma = sigma 40 | 41 | def __call__(self): 42 | return np.random.normal(self.mu, self.sigma) 43 | 44 | def __repr__(self): 45 | return 'NormalActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) 46 | 47 | 48 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 49 | class OrnsteinUhlenbeckActionNoise(ActionNoise): 50 | def __init__(self, mu, sigma, theta=.15, dt=1e-2, x0=None): 51 | self.theta = theta 52 | self.mu = mu 53 | self.sigma = sigma 54 | self.dt = dt 55 | self.x0 = x0 56 | self.reset() 57 | 58 | def __call__(self): 59 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape) 60 | self.x_prev = x 61 | return x 62 | 63 | def reset(self): 64 | self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu) 65 | 66 | def __repr__(self): 67 | return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self.mu, self.sigma) 68 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ddpg/test_smoke.py: -------------------------------------------------------------------------------- 1 | from baselines.common.tests.util import smoketest 2 | def _run(argstr): 3 | smoketest('--alg=ddpg --env=Pendulum-v0 --num_timesteps=0 ' + argstr) 4 | 5 | def test_popart(): 6 | _run('--normalize_returns=True --popart=True') 7 | 8 | def test_noise_normal(): 9 | _run('--noise_type=normal_0.1') 10 | 11 | def test_noise_ou(): 12 | _run('--noise_type=ou_0.1') 13 | 14 | def test_noise_adaptive(): 15 | _run('--noise_type=adaptive-param_0.2,normal_0.1') 16 | 17 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/README.md: -------------------------------------------------------------------------------- 1 | ## If you are curious. 2 | 3 | ##### Train a Cartpole agent and watch it play once it converges! 
4 | 5 | Here's a list of commands to run to quickly get a working example: 6 | 7 | 8 | 9 | 10 | ```bash 11 | # Train model and save the results to cartpole_model.pkl 12 | python -m baselines.run --alg=deepq --env=CartPole-v0 --save_path=./cartpole_model.pkl --num_timesteps=1e5 13 | # Load the model saved in cartpole_model.pkl and visualize the learned policy 14 | python -m baselines.run --alg=deepq --env=CartPole-v0 --load_path=./cartpole_model.pkl --num_timesteps=0 --play 15 | ``` 16 | 17 | ## If you wish to apply DQN to solve a problem. 18 | 19 | Check out our simple agent trained with the one-stop-shop `deepq.learn` function. 20 | 21 | - [baselines/deepq/experiments/train_cartpole.py](experiments/train_cartpole.py) - train a Cartpole agent. 22 | 23 | In particular, notice that once `deepq.learn` finishes training it returns an `act` function which can be used to select actions in the environment. Once trained, you can easily save it and load it at a later time. The complementary file `enjoy_cartpole.py` loads and visualizes the learned policy. 24 | 25 | ## If you wish to experiment with the algorithm 26 | 27 | ##### Check out the examples 28 | 29 | - [baselines/deepq/experiments/custom_cartpole.py](experiments/custom_cartpole.py) - Cartpole training with more fine-grained control over the internals of the DQN algorithm. 30 | - [baselines/deepq/defaults.py](defaults.py) - settings for training on atari. Run 31 | 32 | ```bash 33 | python -m baselines.run --alg=deepq --env=PongNoFrameskip-v4 34 | ``` 35 | to train on Atari Pong (see more in repo-wide [README.md](../../README.md#training-models)) 36 | 37 | 38 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.deepq import models # noqa 2 | from baselines.deepq.build_graph import build_act, build_train # noqa 3 | from baselines.deepq.deepq import learn, load_act # noqa 4 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa 5 | 6 | def wrap_atari_dqn(env): 7 | from baselines.common.atari_wrappers import wrap_deepmind 8 | return wrap_deepmind(env, frame_stack=True, scale=False) 9 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/defaults.py: -------------------------------------------------------------------------------- 1 | def atari(): 2 | return dict( 3 | network='conv_only', 4 | lr=1e-4, 5 | buffer_size=10000, 6 | exploration_fraction=0.1, 7 | exploration_final_eps=0.01, 8 | train_freq=4, 9 | learning_starts=10000, 10 | target_network_update_freq=1000, 11 | gamma=0.99, 12 | prioritized_replay=True, 13 | prioritized_replay_alpha=0.6, 14 | checkpoint_freq=10000, 15 | checkpoint_path=None, 16 | dueling=True 17 | ) 18 | 19 | def retro(): 20 | return atari() 21 | 22 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/__init__.py --------------------------------------------------------------------------------
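As an aside, here is a sketch (not part of the repository) of how the per-environment-type dicts in `defaults.py` are conventionally consumed: they are unpacked as keyword arguments for `deepq.learn`. The environment and timestep count below are placeholders.

```python
from baselines import deepq
from baselines.deepq import defaults
from baselines.common.atari_wrappers import make_atari

env = deepq.wrap_atari_dqn(make_atari('PongNoFrameskip-v4'))
kwargs = defaults.atari()  # network, lr, buffer_size, dueling, etc.
act = deepq.learn(env, total_timesteps=int(1e6), **kwargs)
```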
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/enjoy_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("CartPole-v0") 8 | act = deepq.learn(env, network='mlp', total_timesteps=0, load_path="cartpole_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/enjoy_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common import models 5 | 6 | 7 | def main(): 8 | env = gym.make("MountainCar-v0") 9 | act = deepq.learn( 10 | env, 11 | network=models.mlp(num_layers=1, num_hidden=64), 12 | total_timesteps=0, 13 | load_path='mountaincar_model.pkl' 14 | ) 15 | 16 | while True: 17 | obs, done = env.reset(), False 18 | episode_rew = 0 19 | while not done: 20 | env.render() 21 | obs, rew, done, _ = env.step(act(obs[None])[0]) 22 | episode_rew += rew 23 | print("Episode reward", episode_rew) 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/enjoy_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from baselines import deepq 3 | 4 | 5 | def main(): 6 | env = gym.make("PongNoFrameskip-v4") 7 | env = deepq.wrap_atari_dqn(env) 8 | model = deepq.learn( 9 | env, 10 | "conv_only", 11 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 12 | hiddens=[256], 13 | dueling=True, 14 | total_timesteps=0 15 | ) 16 | 17 | while True: 18 | obs, done = env.reset(), False 19 | episode_rew = 0 20 | while not done: 21 | env.render() 22 | obs, rew, done, _ = env.step(model(obs[None])[0]) 23 | episode_rew += rew 24 | print("Episode reward", episode_rew) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def callback(lcl, _glb): 7 | # stop training if reward exceeds 199 8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 9 | return is_solved 10 | 11 | 12 | def main(): 13 | env = gym.make("CartPole-v0") 14 | act = deepq.learn( 15 | env, 16 | network='mlp', 17 | lr=1e-3, 18 | total_timesteps=100000, 19 | buffer_size=50000, 20 | exploration_fraction=0.1, 21 | exploration_final_eps=0.02, 22 | print_freq=10, 23 | callback=callback 24 | ) 25 | print("Saving model to cartpole_model.pkl") 26 | act.save("cartpole_model.pkl") 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- 
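One note on the `callback` used in `train_cartpole.py` above: it receives the training loop's locals and globals (hence the `lcl`/`_glb` arguments), and a truthy return value stops training, which is how the reward-threshold check ends the run early. A hedged sketch of an alternative callback, with an illustrative wall-clock budget:

```python
import time

TRAIN_BUDGET_SECONDS = 600  # illustrative time budget, not a repository setting
_start = time.time()

def stop_after_budget(lcl, _glb):
    # lcl holds the training loop's locals, e.g. lcl['t'] and lcl['episode_rewards'];
    # returning True asks deepq.learn to stop training.
    return time.time() - _start > TRAIN_BUDGET_SECONDS
```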
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/train_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common import models 5 | 6 | 7 | def main(): 8 | env = gym.make("MountainCar-v0") 9 | # Enabling layer_norm here is important for parameter space noise! 10 | act = deepq.learn( 11 | env, 12 | network=models.mlp(num_hidden=64, num_layers=1), 13 | lr=1e-3, 14 | total_timesteps=100000, 15 | buffer_size=50000, 16 | exploration_fraction=0.1, 17 | exploration_final_eps=0.1, 18 | print_freq=10, 19 | param_noise=True 20 | ) 21 | print("Saving model to mountaincar_model.pkl") 22 | act.save("mountaincar_model.pkl") 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/experiments/train_pong.py: -------------------------------------------------------------------------------- 1 | from baselines import deepq 2 | from baselines import bench 3 | from baselines import logger 4 | from baselines.common.atari_wrappers import make_atari 5 | 6 | 7 | def main(): 8 | logger.configure() 9 | env = make_atari('PongNoFrameskip-v4') 10 | env = bench.Monitor(env, logger.get_dir()) 11 | env = deepq.wrap_atari_dqn(env) 12 | 13 | model = deepq.learn( 14 | env, 15 | "conv_only", 16 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 17 | hiddens=[256], 18 | dueling=True, 19 | lr=1e-4, 20 | total_timesteps=int(1e7), 21 | buffer_size=10000, 22 | exploration_fraction=0.1, 23 | exploration_final_eps=0.01, 24 | train_freq=4, 25 | learning_starts=10000, 26 | target_network_update_freq=1000, 27 | gamma=0.99, 28 | ) 29 | 30 | model.save('pong_model.pkl') 31 | env.close() 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/models.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib.layers as layers 3 | 4 | 5 | def build_q_func(network, hiddens=[256], dueling=True, layer_norm=False, **network_kwargs): 6 | if isinstance(network, str): 7 | from baselines.common.models import get_network_builder 8 | network = get_network_builder(network)(**network_kwargs) 9 | 10 | def q_func_builder(input_placeholder, num_actions, scope, reuse=False): 11 | with tf.variable_scope(scope, reuse=reuse): 12 | latent = network(input_placeholder) 13 | if isinstance(latent, tuple): 14 | if latent[1] is not None: 15 | raise NotImplementedError("DQN is not compatible with recurrent policies yet") 16 | latent = latent[0] 17 | 18 | latent = layers.flatten(latent) 19 | 20 | with tf.variable_scope("action_value"): 21 | action_out = latent 22 | for hidden in hiddens: 23 | action_out = layers.fully_connected(action_out, num_outputs=hidden, activation_fn=None) 24 | if layer_norm: 25 | action_out = layers.layer_norm(action_out, center=True, scale=True) 26 | action_out = tf.nn.relu(action_out) 27 | action_scores = layers.fully_connected(action_out, num_outputs=num_actions, activation_fn=None) 28 | 29 | if dueling: 30 | with tf.variable_scope("state_value"): 31 | state_out = latent 32 | for hidden in hiddens: 33 | state_out = layers.fully_connected(state_out, num_outputs=hidden, activation_fn=None) 34 | if layer_norm: 35 | state_out
= layers.layer_norm(state_out, center=True, scale=True) 36 | state_out = tf.nn.relu(state_out) 37 | state_score = layers.fully_connected(state_out, num_outputs=1, activation_fn=None) 38 | action_scores_mean = tf.reduce_mean(action_scores, 1) 39 | action_scores_centered = action_scores - tf.expand_dims(action_scores_mean, 1) 40 | q_out = state_score + action_scores_centered 41 | else: 42 | q_out = action_scores 43 | return q_out 44 | 45 | return q_func_builder 46 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/deepq/utils.py: -------------------------------------------------------------------------------- 1 | from baselines.common.input import observation_input 2 | from baselines.common.tf_util import adjust_shape 3 | 4 | # ================================================================ 5 | # Placeholders 6 | # ================================================================ 7 | 8 | 9 | class TfInput(object): 10 | def __init__(self, name="(unnamed)"): 11 | """Generalized Tensorflow placeholder. The main differences are: 12 | - possibly uses multiple placeholders internally and returns multiple values 13 | - can apply light postprocessing to the value feed to placeholder. 14 | """ 15 | self.name = name 16 | 17 | def get(self): 18 | """Return the tf variable(s) representing the possibly postprocessed value 19 | of placeholder(s). 20 | """ 21 | raise NotImplementedError 22 | 23 | def make_feed_dict(self, data): 24 | """Given data input it to the placeholder(s).""" 25 | raise NotImplementedError 26 | 27 | 28 | class PlaceholderTfInput(TfInput): 29 | def __init__(self, placeholder): 30 | """Wrapper for regular tensorflow placeholder.""" 31 | super().__init__(placeholder.name) 32 | self._placeholder = placeholder 33 | 34 | def get(self): 35 | return self._placeholder 36 | 37 | def make_feed_dict(self, data): 38 | return {self._placeholder: adjust_shape(self._placeholder, data)} 39 | 40 | 41 | class ObservationInput(PlaceholderTfInput): 42 | def __init__(self, observation_space, name=None): 43 | """Creates an input placeholder tailored to a specific observation space 44 | 45 | Parameters 46 | ---------- 47 | 48 | observation_space: 49 | observation space of the environment. 
Should be one of the gym.spaces types 50 | name: str 51 | tensorflow name of the underlying placeholder 52 | """ 53 | inpt, self.processed_inpt = observation_input(observation_space, name=name) 54 | super().__init__(inpt) 55 | 56 | def get(self): 57 | return self.processed_inpt 58 | 59 | 60 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/README.md: -------------------------------------------------------------------------------- 1 | # Generative Adversarial Imitation Learning (GAIL) 2 | 3 | - Original paper: https://arxiv.org/abs/1606.03476 4 | 5 | For results benchmarking on MuJoCo, please navigate to [here](result/gail-result.md) 6 | 7 | ## If you want to train an imitation learning agent 8 | 9 | ### Step 1: Download expert data 10 | 11 | Download the expert data into `./data`, [download link](https://drive.google.com/drive/folders/1h3H4AY_ZBx08hz-Ct0Nxxus-V1melu1U?usp=sharing) 12 | 13 | ### Step 2: Run GAIL 14 | 15 | Run with single rank: 16 | 17 | ```bash 18 | python -m baselines.gail.run_mujoco 19 | ``` 20 | 21 | Run with multiple ranks: 22 | 23 | ```bash 24 | mpirun -np 16 python -m baselines.gail.run_mujoco 25 | ``` 26 | 27 | See help (`-h`) for more options. 28 | 29 | #### In case you want to run Behavior Cloning (BC) 30 | 31 | ```bash 32 | python -m baselines.gail.behavior_clone 33 | ``` 34 | 35 | See help (`-h`) for more options. 36 | 37 | 38 | ## Contributing 39 | 40 | Bug reports and pull requests are welcome on GitHub at https://github.com/openai/baselines/pulls. 41 | 42 | ## Maintainers 43 | 44 | - Yuan-Hong Liao, andrewliao11_at_gmail_dot_com 45 | - Ryan Julian, ryanjulian_at_gmail_dot_com 46 | 47 | ## Others 48 | 49 | Thanks to the open source: 50 | 51 | - @openai/imitation 52 | - @carpedm20/deep-rl-tensorflow 53 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/dataset/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/mlp_policy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | from baselines/ppo1/mlp_policy.py and add simple modification 3 | (1) add reuse argument 4 | (2) cache the `stochastic` placeholder 5 | ''' 6 | import tensorflow as tf 7 | import gym 8 | 9 | import baselines.common.tf_util as U 10 | from baselines.common.mpi_running_mean_std import RunningMeanStd 11 | from baselines.common.distributions import make_pdtype 12 | from baselines.acktr.utils import dense 13 | 14 | 15 | class MlpPolicy(object): 16 | recurrent = False 17 | 18 | def __init__(self, name, reuse=False, *args, 
**kwargs): 19 | with tf.variable_scope(name): 20 | if reuse: 21 | tf.get_variable_scope().reuse_variables() 22 | self._init(*args, **kwargs) 23 | self.scope = tf.get_variable_scope().name 24 | 25 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 26 | assert isinstance(ob_space, gym.spaces.Box) 27 | 28 | self.pdtype = pdtype = make_pdtype(ac_space) 29 | sequence_length = None 30 | 31 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 32 | 33 | with tf.variable_scope("obfilter"): 34 | self.ob_rms = RunningMeanStd(shape=ob_space.shape) 35 | 36 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 37 | last_out = obz 38 | for i in range(num_hid_layers): 39 | last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), weight_init=U.normc_initializer(1.0))) 40 | self.vpred = dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0] 41 | 42 | last_out = obz 43 | for i in range(num_hid_layers): 44 | last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), weight_init=U.normc_initializer(1.0))) 45 | 46 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 47 | mean = dense(last_out, pdtype.param_shape()[0]//2, "polfinal", U.normc_initializer(0.01)) 48 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) 49 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 50 | else: 51 | pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01)) 52 | 53 | self.pd = pdtype.pdfromflat(pdparam) 54 | 55 | self.state_in = [] 56 | self.state_out = [] 57 | 58 | # change for BC 59 | stochastic = U.get_placeholder(name="stochastic", dtype=tf.bool, shape=()) 60 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) 61 | self.ac = ac 62 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 63 | 64 | def act(self, stochastic, ob): 65 | ac1, vpred1 = self._act(stochastic, ob[None]) 66 | return ac1[0], vpred1[0] 67 | 68 | def get_variables(self): 69 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 70 | 71 | def get_trainable_variables(self): 72 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 73 | 74 | def get_initial_state(self): 75 | return [] 76 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png -------------------------------------------------------------------------------- 
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Hopper-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- 
/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/gail-result.md: -------------------------------------------------------------------------------- 1 | # Results of GAIL/BC on Mujoco 2 | 3 | Here are the extensive experimental results of applying GAIL/BC on Mujoco environments, including 4 | Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, HumanoidStandup-v1. Every imitator is evaluated with seed 0. 5 | 6 | ## Results 7 | 8 | ### Training through iterations 9 | 10 | - Hopper-v1 11 | 12 | 13 | - HalfCheetah-v1 14 | 15 | 16 | - Walker2d-v1 17 | 18 | 19 | - Humanoid-v1 20 | 21 | 22 | - HumanoidStandup-v1 23 | 24 | 25 | For details (e.g., adversarial loss, discriminator accuracy, etc.)
about GAIL training, please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing) 26 | 27 | ### Deterministic Policy (Set std=0) 28 | | | Un-normalized | Normalized | 29 | |---|---|---| 30 | | Hopper-v1 | | | 31 | | HalfCheetah-v1 | | | 32 | | Walker2d-v1 | | | 33 | | Humanoid-v1 | | | 34 | | HumanoidStandup-v1 | | | 35 | 36 | ### Stochastic Policy 37 | | | Un-normalized | Normalized | 38 | |---|---|---| 39 | | Hopper-v1 | | | 40 | | HalfCheetah-v1 | | | 41 | | Walker2d-v1 | | | 42 | | Humanoid-v1 | | | 43 | | HumanoidStandup-v1 | | | 44 | 45 | ### Details about the GAIL imitators 46 | 47 | For all environments, the 48 | imitator is trained with 1, 5, 10, and 50 trajectories (each trajectory contains at most 49 | 1024 transitions), using seeds 0, 1, 2, and 3, respectively. 50 | 51 | ### Details about the BC imitators 52 | 53 | All BC imitators are trained with seed 0. 54 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/halfcheetah-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/halfcheetah-training.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/hopper-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/hopper-training.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/humanoid-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/humanoid-training.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/humanoidstandup-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/humanoidstandup-training.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/walker2d-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/result/walker2d-training.png -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/gail/statistics.py:
-------------------------------------------------------------------------------- 1 | ''' 2 | This code is highly based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py 3 | ''' 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | import baselines.common.tf_util as U 9 | 10 | 11 | class stats(): 12 | 13 | def __init__(self, scalar_keys=[], histogram_keys=[]): 14 | self.scalar_keys = scalar_keys 15 | self.histogram_keys = histogram_keys 16 | self.scalar_summaries = [] 17 | self.scalar_summaries_ph = [] 18 | self.histogram_summaries_ph = [] 19 | self.histogram_summaries = [] 20 | with tf.variable_scope('summary'): 21 | for k in scalar_keys: 22 | ph = tf.placeholder('float32', None, name=k+'.scalar.summary') 23 | sm = tf.summary.scalar(k+'.scalar.summary', ph) 24 | self.scalar_summaries_ph.append(ph) 25 | self.scalar_summaries.append(sm) 26 | for k in histogram_keys: 27 | ph = tf.placeholder('float32', None, name=k+'.histogram.summary') 28 | sm = tf.summary.scalar(k+'.histogram.summary', ph) 29 | self.histogram_summaries_ph.append(ph) 30 | self.histogram_summaries.append(sm) 31 | 32 | self.summaries = tf.summary.merge(self.scalar_summaries+self.histogram_summaries) 33 | 34 | def add_all_summary(self, writer, values, iter): 35 | # Note that the order of the incoming ```values``` should be the same as the that of the 36 | # ```scalar_keys``` given in ```__init__``` 37 | if np.sum(np.isnan(values)+0) != 0: 38 | return 39 | sess = U.get_session() 40 | keys = self.scalar_summaries_ph + self.histogram_summaries_ph 41 | feed_dict = {} 42 | for k, v in zip(keys, values): 43 | feed_dict.update({k: v}) 44 | summaries_str = sess.run(self.summaries, feed_dict) 45 | writer.add_summary(summaries_str, iter) 46 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/actor_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from baselines.her.util import store_args, nn 3 | 4 | 5 | class ActorCritic: 6 | @store_args 7 | def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, 8 | **kwargs): 9 | """The actor-critic network and related training code. 10 | 11 | Args: 12 | inputs_tf (dict of tensors): all necessary inputs for the network: the 13 | observation (o), the goal (g), and the action (u) 14 | dimo (int): the dimension of the observations 15 | dimg (int): the dimension of the goals 16 | dimu (int): the dimension of the actions 17 | max_u (float): the maximum magnitude of actions; action outputs will be scaled 18 | accordingly 19 | o_stats (baselines.her.Normalizer): normalizer for observations 20 | g_stats (baselines.her.Normalizer): normalizer for goals 21 | hidden (int): number of hidden units that should be used in hidden layers 22 | layers (int): number of hidden layers 23 | """ 24 | self.o_tf = inputs_tf['o'] 25 | self.g_tf = inputs_tf['g'] 26 | self.u_tf = inputs_tf['u'] 27 | 28 | # Prepare inputs for actor and critic. 
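# The observation and goal are normalized with the running statistics in o_stats and g_stats,
# then concatenated to form the actor input; the critic input below additionally includes the
# action, rescaled by max_u.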
29 | o = self.o_stats.normalize(self.o_tf) 30 | g = self.g_stats.normalize(self.g_tf) 31 | input_pi = tf.concat(axis=1, values=[o, g]) # for actor 32 | 33 | # Networks. 34 | with tf.variable_scope('pi'): 35 | self.pi_tf = self.max_u * tf.tanh(nn( 36 | input_pi, [self.hidden] * self.layers + [self.dimu])) 37 | with tf.variable_scope('Q'): 38 | # for policy training 39 | input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u]) 40 | self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) 41 | # for critic training 42 | input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u]) 43 | self._input_Q = input_Q # exposed for tests 44 | self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) 45 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/experiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/experiment/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/experiment/play.py: -------------------------------------------------------------------------------- 1 | # DEPRECATED, use --play flag to baselines.run instead 2 | import click 3 | import numpy as np 4 | import pickle 5 | 6 | from baselines import logger 7 | from baselines.common import set_global_seeds 8 | import baselines.her.experiment.config as config 9 | from baselines.her.rollout import RolloutWorker 10 | 11 | 12 | @click.command() 13 | @click.argument('policy_file', type=str) 14 | @click.option('--seed', type=int, default=0) 15 | @click.option('--n_test_rollouts', type=int, default=10) 16 | @click.option('--render', type=int, default=1) 17 | def main(policy_file, seed, n_test_rollouts, render): 18 | set_global_seeds(seed) 19 | 20 | # Load policy. 21 | with open(policy_file, 'rb') as f: 22 | policy = pickle.load(f) 23 | env_name = policy.info['env_name'] 24 | 25 | # Prepare params. 26 | params = config.DEFAULT_PARAMS 27 | if env_name in config.DEFAULT_ENV_PARAMS: 28 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 29 | params['env_name'] = env_name 30 | params = config.prepare_params(params) 31 | config.log_params(params, logger=logger) 32 | 33 | dims = config.configure_dims(params) 34 | 35 | eval_params = { 36 | 'exploit': True, 37 | 'use_target_net': params['test_with_polyak'], 38 | 'compute_Q': True, 39 | 'rollout_batch_size': 1, 40 | 'render': bool(render), 41 | } 42 | 43 | for name in ['T', 'gamma', 'noise_eps', 'random_eps']: 44 | eval_params[name] = params[name] 45 | 46 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 47 | evaluator.seed(seed) 48 | 49 | # Run evaluation. 
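# clear_history() resets the rollout worker's running statistics so that the values logged
# below reflect only the n_test_rollouts evaluation rollouts generated here.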
50 | evaluator.clear_history() 51 | for _ in range(n_test_rollouts): 52 | evaluator.generate_rollouts() 53 | 54 | # record logs 55 | for key, val in evaluator.logs('test'): 56 | logger.record_tabular(key, np.mean(val)) 57 | logger.dump_tabular() 58 | 59 | 60 | if __name__ == '__main__': 61 | main() 62 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/her/her_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): 5 | """Creates a sample function that can be used for HER experience replay. 6 | 7 | Args: 8 | replay_strategy (in ['future', 'none']): the HER replay strategy; if set to 'none', 9 | regular DDPG experience replay is used 10 | replay_k (int): the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times 11 | as many HER replays as regular replays are used) 12 | reward_fun (function): function to re-compute the reward with substituted goals 13 | """ 14 | if replay_strategy == 'future': 15 | future_p = 1 - (1. / (1 + replay_k)) 16 | else: # 'replay_strategy' == 'none' 17 | future_p = 0 18 | 19 | def _sample_her_transitions(episode_batch, batch_size_in_transitions): 20 | """episode_batch is {key: array(buffer_size x T x dim_key)} 21 | """ 22 | T = episode_batch['u'].shape[1] 23 | rollout_batch_size = episode_batch['u'].shape[0] 24 | batch_size = batch_size_in_transitions 25 | 26 | # Select which episodes and time steps to use. 27 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 28 | t_samples = np.random.randint(T, size=batch_size) 29 | transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() 30 | for key in episode_batch.keys()} 31 | 32 | # Select future time indexes proportional with probability future_p. These 33 | # will be used for HER replay by substituting in future goals. 34 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 35 | future_offset = np.random.uniform(size=batch_size) * (T - t_samples) 36 | future_offset = future_offset.astype(int) 37 | future_t = (t_samples + 1 + future_offset)[her_indexes] 38 | 39 | # Replace goal with achieved goal but only for the previously-selected 40 | # HER transitions (as defined by her_indexes). For the other transitions, 41 | # keep the original goal. 42 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 43 | transitions['g'][her_indexes] = future_ag 44 | 45 | # Reconstruct info dictionary for reward computation. 46 | info = {} 47 | for key, value in transitions.items(): 48 | if key.startswith('info_'): 49 | info[key.replace('info_', '')] = value 50 | 51 | # Re-compute reward since we may have substituted the goal. 
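# reward_fun is expected to wrap the environment's compute_reward: it receives the achieved
# goal of the next state ('ag_2'), the (possibly substituted) desired goal ('g'), and the
# reconstructed info dict, and returns the relabeled reward.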
52 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 53 | reward_params['info'] = info 54 | transitions['r'] = reward_fun(**reward_params) 55 | 56 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 57 | for k in transitions.keys()} 58 | 59 | assert(transitions['u'].shape[0] == batch_size_in_transitions) 60 | 61 | return transitions 62 | 63 | return _sample_her_transitions 64 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/README.md: -------------------------------------------------------------------------------- 1 | # PPOSGD 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. 7 | 8 | - Train mujoco 3d humanoid (with optimal-ish hyperparameters): `mpirun -np 16 python -m baselines.ppo1.run_humanoid --model-path=/path/to/model` 9 | - Render the 3d humanoid: `python -m baselines.ppo1.run_humanoid --play --model-path=/path/to/model` 10 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/cnn_policy.py: -------------------------------------------------------------------------------- 1 | import baselines.common.tf_util as U 2 | import tensorflow as tf 3 | import gym 4 | from baselines.common.distributions import make_pdtype 5 | 6 | class CnnPolicy(object): 7 | recurrent = False 8 | def __init__(self, name, ob_space, ac_space, kind='large'): 9 | with tf.variable_scope(name): 10 | self._init(ob_space, ac_space, kind) 11 | self.scope = tf.get_variable_scope().name 12 | 13 | def _init(self, ob_space, ac_space, kind): 14 | assert isinstance(ob_space, gym.spaces.Box) 15 | 16 | self.pdtype = pdtype = make_pdtype(ac_space) 17 | sequence_length = None 18 | 19 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 20 | 21 | x = ob / 255.0 22 | if kind == 'small': # from A3C paper 23 | x = tf.nn.relu(U.conv2d(x, 16, "l1", [8, 8], [4, 4], pad="VALID")) 24 | x = tf.nn.relu(U.conv2d(x, 32, "l2", [4, 4], [2, 2], pad="VALID")) 25 | x = U.flattenallbut0(x) 26 | x = tf.nn.relu(tf.layers.dense(x, 256, name='lin', kernel_initializer=U.normc_initializer(1.0))) 27 | elif kind == 'large': # Nature DQN 28 | x = tf.nn.relu(U.conv2d(x, 32, "l1", [8, 8], [4, 4], pad="VALID")) 29 | x = tf.nn.relu(U.conv2d(x, 64, "l2", [4, 4], [2, 2], pad="VALID")) 30 | x = tf.nn.relu(U.conv2d(x, 64, "l3", [3, 3], [1, 1], pad="VALID")) 31 | x = U.flattenallbut0(x) 32 | x = tf.nn.relu(tf.layers.dense(x, 512, name='lin', kernel_initializer=U.normc_initializer(1.0))) 33 | else: 34 | raise NotImplementedError 35 | 36 | logits = tf.layers.dense(x, pdtype.param_shape()[0], 
name='logits', kernel_initializer=U.normc_initializer(0.01)) 37 | self.pd = pdtype.pdfromflat(logits) 38 | self.vpred = tf.layers.dense(x, 1, name='value', kernel_initializer=U.normc_initializer(1.0))[:,0] 39 | 40 | self.state_in = [] 41 | self.state_out = [] 42 | 43 | stochastic = tf.placeholder(dtype=tf.bool, shape=()) 44 | ac = self.pd.sample() # XXX 45 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 46 | 47 | def act(self, stochastic, ob): 48 | ac1, vpred1 = self._act(stochastic, ob[None]) 49 | return ac1[0], vpred1[0] 50 | def get_variables(self): 51 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 52 | def get_trainable_variables(self): 53 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 54 | def get_initial_state(self): 55 | return [] 56 | 57 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/mlp_policy.py: -------------------------------------------------------------------------------- 1 | from baselines.common.mpi_running_mean_std import RunningMeanStd 2 | import baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import gym 5 | from baselines.common.distributions import make_pdtype 6 | 7 | class MlpPolicy(object): 8 | recurrent = False 9 | def __init__(self, name, *args, **kwargs): 10 | with tf.variable_scope(name): 11 | self._init(*args, **kwargs) 12 | self.scope = tf.get_variable_scope().name 13 | 14 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 15 | assert isinstance(ob_space, gym.spaces.Box) 16 | 17 | self.pdtype = pdtype = make_pdtype(ac_space) 18 | sequence_length = None 19 | 20 | ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape)) 21 | 22 | with tf.variable_scope("obfilter"): 23 | self.ob_rms = RunningMeanStd(shape=ob_space.shape) 24 | 25 | with tf.variable_scope('vf'): 26 | obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 27 | last_out = obz 28 | for i in range(num_hid_layers): 29 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0))) 30 | self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0] 31 | 32 | with tf.variable_scope('pol'): 33 | last_out = obz 34 | for i in range(num_hid_layers): 35 | last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0))) 36 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 37 | mean = tf.layers.dense(last_out, pdtype.param_shape()[0]//2, name='final', kernel_initializer=U.normc_initializer(0.01)) 38 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], initializer=tf.zeros_initializer()) 39 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 40 | else: 41 | pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01)) 42 | 43 | self.pd = pdtype.pdfromflat(pdparam) 44 | 45 | self.state_in = [] 46 | self.state_out = [] 47 | 48 | stochastic = tf.placeholder(dtype=tf.bool, shape=()) 49 | ac = U.switch(stochastic, self.pd.sample(), self.pd.mode()) 50 | self._act = U.function([stochastic, ob], [ac, self.vpred]) 51 | 52 | def act(self, stochastic, ob): 53 | ac1, vpred1 = self._act(stochastic, ob[None]) 54 | return ac1[0], vpred1[0] 55 | def get_variables(self): 56 | return 
tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 57 | def get_trainable_variables(self): 58 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 59 | def get_initial_state(self): 60 | return [] 61 | 62 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from mpi4py import MPI 4 | from baselines.common import set_global_seeds 5 | from baselines import bench 6 | import os.path as osp 7 | from baselines import logger 8 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 9 | from baselines.common.cmd_util import atari_arg_parser 10 | 11 | def train(env_id, num_timesteps, seed): 12 | from baselines.ppo1 import pposgd_simple, cnn_policy 13 | import baselines.common.tf_util as U 14 | rank = MPI.COMM_WORLD.Get_rank() 15 | sess = U.single_threaded_session() 16 | sess.__enter__() 17 | if rank == 0: 18 | logger.configure() 19 | else: 20 | logger.configure(format_strs=[]) 21 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None 22 | set_global_seeds(workerseed) 23 | env = make_atari(env_id) 24 | def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613 25 | return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space) 26 | env = bench.Monitor(env, logger.get_dir() and 27 | osp.join(logger.get_dir(), str(rank))) 28 | env.seed(workerseed) 29 | 30 | env = wrap_deepmind(env) 31 | env.seed(workerseed) 32 | 33 | pposgd_simple.learn(env, policy_fn, 34 | max_timesteps=int(num_timesteps * 1.1), 35 | timesteps_per_actorbatch=256, 36 | clip_param=0.2, entcoeff=0.01, 37 | optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, 38 | gamma=0.99, lam=0.95, 39 | schedule='linear' 40 | ) 41 | env.close() 42 | 43 | def main(): 44 | args = atari_arg_parser().parse_args() 45 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 46 | 47 | if __name__ == '__main__': 48 | main() 49 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/run_humanoid.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 4 | from baselines.common import tf_util as U 5 | from baselines import logger 6 | 7 | import gym 8 | 9 | def train(num_timesteps, seed, model_path=None): 10 | env_id = 'Humanoid-v2' 11 | from baselines.ppo1 import mlp_policy, pposgd_simple 12 | U.make_session(num_cpu=1).__enter__() 13 | def policy_fn(name, ob_space, ac_space): 14 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 15 | hid_size=64, num_hid_layers=2) 16 | env = make_mujoco_env(env_id, seed) 17 | 18 | # parameters below were the best found in a simple random search 19 | # these are good enough to make humanoid walk, but whether those are 20 | # an absolute best or not is not certain 21 | env = RewScale(env, 0.1) 22 | logger.log("NOTE: reward will be scaled by a factor of 10 in logged stats. 
Check the monitor for unscaled reward.") 23 | pi = pposgd_simple.learn(env, policy_fn, 24 | max_timesteps=num_timesteps, 25 | timesteps_per_actorbatch=2048, 26 | clip_param=0.1, entcoeff=0.0, 27 | optim_epochs=10, 28 | optim_stepsize=1e-4, 29 | optim_batchsize=64, 30 | gamma=0.99, 31 | lam=0.95, 32 | schedule='constant', 33 | ) 34 | env.close() 35 | if model_path: 36 | U.save_state(model_path) 37 | 38 | return pi 39 | 40 | class RewScale(gym.RewardWrapper): 41 | def __init__(self, env, scale): 42 | gym.RewardWrapper.__init__(self, env) 43 | self.scale = scale 44 | def reward(self, r): 45 | return r * self.scale 46 | 47 | def main(): 48 | logger.configure() 49 | parser = mujoco_arg_parser() 50 | parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) 51 | parser.set_defaults(num_timesteps=int(5e7)) 52 | 53 | args = parser.parse_args() 54 | 55 | if not args.play: 56 | # train the model 57 | train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) 58 | else: 59 | # construct the model object, load pre-trained model and render 60 | pi = train(num_timesteps=1, seed=args.seed) 61 | U.load_state(args.model_path) 62 | env = make_mujoco_env('Humanoid-v2', seed=0) 63 | 64 | ob = env.reset() 65 | while True: 66 | action = pi.act(stochastic=False, ob=ob)[0] 67 | ob, _, done, _ = env.step(action) 68 | env.render() 69 | if done: 70 | ob = env.reset() 71 | 72 | if __name__ == '__main__': 73 | main() 74 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 4 | from baselines.common import tf_util as U 5 | from baselines import logger 6 | 7 | def train(env_id, num_timesteps, seed): 8 | from baselines.ppo1 import mlp_policy, pposgd_simple 9 | U.make_session(num_cpu=1).__enter__() 10 | def policy_fn(name, ob_space, ac_space): 11 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 12 | hid_size=64, num_hid_layers=2) 13 | env = make_mujoco_env(env_id, seed) 14 | pposgd_simple.learn(env, policy_fn, 15 | max_timesteps=num_timesteps, 16 | timesteps_per_actorbatch=2048, 17 | clip_param=0.2, entcoeff=0.0, 18 | optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64, 19 | gamma=0.99, lam=0.95, schedule='linear', 20 | ) 21 | env.close() 22 | 23 | def main(): 24 | args = mujoco_arg_parser().parse_args() 25 | logger.configure() 26 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 27 | 28 | if __name__ == '__main__': 29 | main() 30 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo1/run_robotics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from mpi4py import MPI 4 | from baselines.common import set_global_seeds 5 | from baselines import logger 6 | from baselines.common.cmd_util import make_robotics_env, robotics_arg_parser 7 | import mujoco_py 8 | 9 | 10 | def train(env_id, num_timesteps, seed): 11 | from baselines.ppo1 import mlp_policy, pposgd_simple 12 | import baselines.common.tf_util as U 13 | rank = MPI.COMM_WORLD.Get_rank() 14 | sess = U.single_threaded_session() 15 | sess.__enter__() 16 | 
mujoco_py.ignore_mujoco_warnings().__enter__() 17 | workerseed = seed + 10000 * rank 18 | set_global_seeds(workerseed) 19 | env = make_robotics_env(env_id, workerseed, rank=rank) 20 | def policy_fn(name, ob_space, ac_space): 21 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 22 | hid_size=256, num_hid_layers=3) 23 | 24 | pposgd_simple.learn(env, policy_fn, 25 | max_timesteps=num_timesteps, 26 | timesteps_per_actorbatch=2048, 27 | clip_param=0.2, entcoeff=0.0, 28 | optim_epochs=5, optim_stepsize=3e-4, optim_batchsize=256, 29 | gamma=0.99, lam=0.95, schedule='linear', 30 | ) 31 | env.close() 32 | 33 | 34 | def main(): 35 | args = robotics_arg_parser().parse_args() 36 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/README.md: -------------------------------------------------------------------------------- 1 | # PPO2 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | 6 | - `python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 7 | - `python -m baselines.run --alg=ppo2 --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M frames on a Mujoco Ant environment. 8 | - also refer to the repo-wide [README.md](../../README.md#training-models) 9 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/defaults.py: -------------------------------------------------------------------------------- 1 | def mujoco(): 2 | return dict( 3 | nsteps=2048, 4 | nminibatches=32, 5 | lam=0.95, 6 | gamma=0.99, 7 | noptepochs=10, 8 | log_interval=1, 9 | ent_coef=0.0, 10 | lr=lambda f: 3e-4 * f, 11 | cliprange=0.2, 12 | value_network='copy' 13 | ) 14 | 15 | def atari(): 16 | return dict( 17 | nsteps=128, nminibatches=4, 18 | lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, 19 | ent_coef=.01, 20 | lr=lambda f : f * 2.5e-4, 21 | cliprange=0.1, 22 | ) 23 | 24 | def retro(): 25 | return atari() 26 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/microbatched_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from baselines.ppo2.model import Model 4 | 5 | class MicrobatchedModel(Model): 6 | """ 7 | Model that does training one microbatch at a time - when gradient computation 8 | on the entire minibatch causes some overflow 9 | """ 10 | def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train, 11 | nsteps, ent_coef, vf_coef, max_grad_norm, mpi_rank_weight, comm, microbatch_size): 12 | 13 | self.nmicrobatches = nbatch_train 
// microbatch_size 14 | self.microbatch_size = microbatch_size 15 | assert nbatch_train % microbatch_size == 0, 'microbatch_size ({}) should divide nbatch_train ({}) evenly'.format(microbatch_size, nbatch_train) 16 | 17 | super().__init__( 18 | policy=policy, 19 | ob_space=ob_space, 20 | ac_space=ac_space, 21 | nbatch_act=nbatch_act, 22 | nbatch_train=microbatch_size, 23 | nsteps=nsteps, 24 | ent_coef=ent_coef, 25 | vf_coef=vf_coef, 26 | max_grad_norm=max_grad_norm, 27 | mpi_rank_weight=mpi_rank_weight, 28 | comm=comm) 29 | 30 | self.grads_ph = [tf.placeholder(dtype=g.dtype, shape=g.shape) for g in self.grads] 31 | grads_ph_and_vars = list(zip(self.grads_ph, self.var)) 32 | self._apply_gradients_op = self.trainer.apply_gradients(grads_ph_and_vars) 33 | 34 | 35 | def train(self, lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None): 36 | assert states is None, "microbatches with recurrent models are not supported yet" 37 | 38 | # Here we calculate advantage A(s,a) = R + yV(s') - V(s) 39 | # Returns = R + yV(s') 40 | advs = returns - values 41 | 42 | # Normalize the advantages 43 | advs = (advs - advs.mean()) / (advs.std() + 1e-8) 44 | 45 | # Initialize empty list for per-microbatch stats like pg_loss, vf_loss, entropy, approxkl (whatever is in self.stats_list) 46 | stats_vs = [] 47 | 48 | for microbatch_idx in range(self.nmicrobatches): 49 | _sli = range(microbatch_idx * self.microbatch_size, (microbatch_idx+1) * self.microbatch_size) 50 | td_map = { 51 | self.train_model.X: obs[_sli], 52 | self.A:actions[_sli], 53 | self.ADV:advs[_sli], 54 | self.R:returns[_sli], 55 | self.CLIPRANGE:cliprange, 56 | self.OLDNEGLOGPAC:neglogpacs[_sli], 57 | self.OLDVPRED:values[_sli] 58 | } 59 | 60 | # Compute gradient on a microbatch (note that variables do not change here) ... 61 | grad_v, stats_v = self.sess.run([self.grads, self.stats_list], td_map) 62 | if microbatch_idx == 0: 63 | sum_grad_v = grad_v 64 | else: 65 | # .. 
and add to the total of the gradients 66 | for i, g in enumerate(grad_v): 67 | sum_grad_v[i] += g 68 | stats_vs.append(stats_v) 69 | 70 | feed_dict = {ph: sum_g / self.nmicrobatches for ph, sum_g in zip(self.grads_ph, sum_grad_v)} 71 | feed_dict[self.LR] = lr 72 | # Update variables using average of the gradients 73 | self.sess.run(self._apply_gradients_op, feed_dict) 74 | # Return average of the stats 75 | return np.mean(np.array(stats_vs), axis=0).tolist() 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from baselines.common.runners import AbstractEnvRunner 3 | 4 | class Runner(AbstractEnvRunner): 5 | """ 6 | We use this object to make a mini batch of experiences 7 | __init__: 8 | - Initialize the runner 9 | 10 | run(): 11 | - Make a mini batch 12 | """ 13 | def __init__(self, *, env, model, nsteps, gamma, lam): 14 | super().__init__(env=env, model=model, nsteps=nsteps) 15 | # Lambda used in GAE (General Advantage Estimation) 16 | self.lam = lam 17 | # Discount rate 18 | self.gamma = gamma 19 | 20 | def run(self): 21 | # Here, we init the lists that will contain the mb of experiences 22 | mb_obs, mb_rewards, mb_actions, mb_values, mb_dones, mb_neglogpacs = [],[],[],[],[],[] 23 | mb_states = self.states 24 | epinfos = [] 25 | # For n in range number of steps 26 | for _ in range(self.nsteps): 27 | # Given observations, get action value and neglopacs 28 | # We already have self.obs because Runner superclass run self.obs[:] = env.reset() on init 29 | actions, values, self.states, neglogpacs = self.model.step(self.obs, S=self.states, M=self.dones) 30 | mb_obs.append(self.obs.copy()) 31 | mb_actions.append(actions) 32 | mb_values.append(values) 33 | mb_neglogpacs.append(neglogpacs) 34 | mb_dones.append(self.dones) 35 | 36 | # Take actions in env and look the results 37 | # Infos contains a ton of useful informations 38 | self.obs[:], rewards, self.dones, infos = self.env.step(actions) 39 | for info in infos: 40 | maybeepinfo = info.get('episode') 41 | if maybeepinfo: epinfos.append(maybeepinfo) 42 | mb_rewards.append(rewards) 43 | #batch of steps to batch of rollouts 44 | mb_obs = np.asarray(mb_obs, dtype=self.obs.dtype) 45 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32) 46 | mb_actions = np.asarray(mb_actions) 47 | mb_values = np.asarray(mb_values, dtype=np.float32) 48 | mb_neglogpacs = np.asarray(mb_neglogpacs, dtype=np.float32) 49 | mb_dones = np.asarray(mb_dones, dtype=np.bool) 50 | last_values = self.model.value(self.obs, S=self.states, M=self.dones) 51 | 52 | # discount/bootstrap off value fn 53 | mb_returns = np.zeros_like(mb_rewards) 54 | mb_advs = np.zeros_like(mb_rewards) 55 | lastgaelam = 0 56 | for t in reversed(range(self.nsteps)): 57 | if t == self.nsteps - 1: 58 | nextnonterminal = 1.0 - self.dones 59 | nextvalues = last_values 60 | else: 61 | nextnonterminal = 1.0 - mb_dones[t+1] 62 | nextvalues = mb_values[t+1] 63 | delta = mb_rewards[t] + self.gamma * nextvalues * nextnonterminal - mb_values[t] 64 | mb_advs[t] = lastgaelam = delta + self.gamma * self.lam * nextnonterminal * lastgaelam 65 | mb_returns = mb_advs + mb_values 66 | return (*map(sf01, (mb_obs, mb_returns, mb_dones, mb_actions, mb_values, mb_neglogpacs)), 67 | mb_states, epinfos) 68 | # obs, returns, masks, actions, values, neglogpacs, states = runner.run() 69 | def 
sf01(arr): 70 | """ 71 | swap and then flatten axes 0 and 1 72 | """ 73 | s = arr.shape 74 | return arr.swapaxes(0, 1).reshape(s[0] * s[1], *s[2:]) 75 | 76 | 77 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/ppo2/test_microbatches.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import tensorflow as tf 3 | import numpy as np 4 | from functools import partial 5 | 6 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 7 | from baselines.common.tf_util import make_session 8 | from baselines.ppo2.ppo2 import learn 9 | 10 | from baselines.ppo2.microbatched_model import MicrobatchedModel 11 | 12 | def test_microbatches(): 13 | def env_fn(): 14 | env = gym.make('CartPole-v0') 15 | env.seed(0) 16 | return env 17 | 18 | learn_fn = partial(learn, network='mlp', nsteps=32, total_timesteps=32, seed=0) 19 | 20 | env_ref = DummyVecEnv([env_fn]) 21 | sess_ref = make_session(make_default=True, graph=tf.Graph()) 22 | learn_fn(env=env_ref) 23 | vars_ref = {v.name: sess_ref.run(v) for v in tf.trainable_variables()} 24 | 25 | env_test = DummyVecEnv([env_fn]) 26 | sess_test = make_session(make_default=True, graph=tf.Graph()) 27 | learn_fn(env=env_test, model_fn=partial(MicrobatchedModel, microbatch_size=2)) 28 | # learn_fn(env=env_test) 29 | vars_test = {v.name: sess_test.run(v) for v in tf.trainable_variables()} 30 | 31 | for v in vars_ref: 32 | np.testing.assert_allclose(vars_ref[v], vars_test[v], atol=3e-3) 33 | 34 | if __name__ == '__main__': 35 | test_microbatches() 36 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/trpo_mpi/README.md: -------------------------------------------------------------------------------- 1 | # trpo_mpi 2 | 3 | - Original paper: https://arxiv.org/abs/1502.05477 4 | - Baselines blog post https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 16 python -m baselines.run --alg=trpo_mpi --env=PongNoFrameskip-v4` runs the algorithm for 40M frames = 10M timesteps on an Atari Pong. See help (`-h`) for more options. 6 | - `python -m baselines.run --alg=trpo_mpi --env=Ant-v2 --num_timesteps=1e6` runs the algorithm for 1M timesteps on a Mujoco Ant environment. 
7 | - also refer to the repo-wide [README.md](../../README.md#training-models) 8 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/trpo_mpi/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/trpo_mpi/__init__.py -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/baselines/trpo_mpi/defaults.py: -------------------------------------------------------------------------------- 1 | from baselines.common.models import mlp, cnn_small 2 | 3 | 4 | def atari(): 5 | return dict( 6 | network = cnn_small(), 7 | timesteps_per_batch=512, 8 | max_kl=0.001, 9 | cg_iters=10, 10 | cg_damping=1e-3, 11 | gamma=0.98, 12 | lam=1.0, 13 | vf_iters=3, 14 | vf_stepsize=1e-4, 15 | entcoeff=0.00, 16 | ) 17 | 18 | def mujoco(): 19 | return dict( 20 | network = mlp(num_hidden=32, num_layers=2), 21 | timesteps_per_batch=1024, 22 | max_kl=0.01, 23 | cg_iters=10, 24 | cg_damping=0.1, 25 | gamma=0.99, 26 | lam=0.98, 27 | vf_iters=5, 28 | vf_stepsize=1e-3, 29 | normalize_observations=True, 30 | ) 31 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = F,E999,W291,W293 3 | exclude = 4 | .git, 5 | __pycache__, 6 | baselines/ppo1, 7 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/baselines_modified/setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from setuptools import setup, find_packages 3 | import sys 4 | 5 | if sys.version_info.major != 3: 6 | print('This Python is only compatible with Python 3, but you are running ' 7 | 'Python {}. 
The installation will likely fail.'.format(sys.version_info.major)) 8 | 9 | 10 | extras = { 11 | 'test': [ 12 | 'filelock', 13 | 'pytest', 14 | 'pytest-forked', 15 | 'atari-py', 16 | 'matplotlib', 17 | 'pandas' 18 | ], 19 | 'mpi': [ 20 | 'mpi4py' 21 | ] 22 | } 23 | 24 | all_deps = [] 25 | for group_name in extras: 26 | all_deps += extras[group_name] 27 | 28 | extras['all'] = all_deps 29 | 30 | setup(name='baselines', 31 | packages=[package for package in find_packages() 32 | if package.startswith('baselines')], 33 | install_requires=[ 34 | 'gym>=0.10.0, <1.0.0', 35 | 'scipy', 36 | 'tqdm', 37 | 'joblib', 38 | 'cloudpickle', 39 | 'click', 40 | 'opencv-python' 41 | ], 42 | extras_require=extras, 43 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms', 44 | author='OpenAI', 45 | url='https://github.com/openai/baselines', 46 | author_email='gym@openai.com', 47 | version='0.1.6') 48 | 49 | 50 | # ensure there is some tensorflow build with version above 1.4 51 | import pkg_resources 52 | tf_pkg = None 53 | for tf_pkg_name in ['tensorflow', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-gpu']: 54 | try: 55 | tf_pkg = pkg_resources.get_distribution(tf_pkg_name) 56 | except pkg_resources.DistributionNotFound: 57 | pass 58 | assert tf_pkg is not None, 'TensorFlow needed, of version above 1.4' 59 | from distutils.version import LooseVersion 60 | assert LooseVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= LooseVersion('1.4.0') 61 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/enjoy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | # workaround to unpickle olf model files 4 | import sys 5 | 6 | import numpy as np 7 | import torch 8 | 9 | from a2c_ppo_acktr.envs import VecPyTorch, make_vec_envs 10 | from a2c_ppo_acktr.utils import get_render_func, get_vec_normalize 11 | 12 | sys.path.append('a2c_ppo_acktr') 13 | 14 | parser = argparse.ArgumentParser(description='RL') 15 | parser.add_argument( 16 | '--seed', type=int, default=1, help='random seed (default: 1)') 17 | parser.add_argument( 18 | '--log-interval', 19 | type=int, 20 | default=10, 21 | help='log interval, one log per n updates (default: 10)') 22 | parser.add_argument( 23 | '--env-name', 24 | default='PongNoFrameskip-v4', 25 | help='environment to train on (default: PongNoFrameskip-v4)') 26 | parser.add_argument( 27 | '--load-dir', 28 | default='./trained_models/', 29 | help='directory to save agent logs (default: ./trained_models/)') 30 | parser.add_argument( 31 | '--non-det', 32 | action='store_true', 33 | default=False, 34 | help='whether to use a non-deterministic policy') 35 | args = parser.parse_args() 36 | 37 | args.det = not args.non_det 38 | 39 | env = make_vec_envs( 40 | args.env_name, 41 | args.seed + 1000, 42 | 1, 43 | None, 44 | None, 45 | device='cpu', 46 | allow_early_resets=False) 47 | 48 | # Get a render function 49 | render_func = get_render_func(env) 50 | 51 | # We need to use the same statistics for normalization as used in training 52 | actor_critic, ob_rms = \ 53 | torch.load(os.path.join(args.load_dir, args.env_name + ".pt")) 54 | 55 | vec_norm = get_vec_normalize(env) 56 | if vec_norm is not None: 57 | vec_norm.eval() 58 | vec_norm.ob_rms = ob_rms 59 | 60 | recurrent_hidden_states = torch.zeros(1, 61 | actor_critic.recurrent_hidden_state_size) 62 | masks = torch.zeros(1, 1) 63 | 64 | obs = env.reset() 65 | 66 | if render_func is not 
None: 67 | render_func('human') 68 | 69 | if args.env_name.find('Bullet') > -1: 70 | import pybullet as p 71 | 72 | torsoId = -1 73 | for i in range(p.getNumBodies()): 74 | if (p.getBodyInfo(i)[0].decode() == "torso"): 75 | torsoId = i 76 | 77 | while True: 78 | with torch.no_grad(): 79 | value, action, _, recurrent_hidden_states = actor_critic.act( 80 | obs, recurrent_hidden_states, masks, deterministic=args.det) 81 | 82 | # Obser reward and next obs 83 | obs, reward, done, _ = env.step(action) 84 | 85 | masks.fill_(0.0 if done else 1.0) 86 | 87 | if args.env_name.find('Bullet') > -1: 88 | if torsoId > -1: 89 | distance = 5 90 | yaw = 0 91 | humanPos, humanOrn = p.getBasePositionAndOrientation(torsoId) 92 | p.resetDebugVisualizerCamera(distance, yaw, -20, humanPos) 93 | 94 | if render_func is not None: 95 | render_func('human') 96 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/evaluation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from a2c_ppo_acktr import utils 5 | from a2c_ppo_acktr.envs import make_vec_envs 6 | 7 | 8 | def evaluate(actor_critic, ob_rms, env_name, seed, num_processes, eval_log_dir, 9 | device): 10 | eval_envs = make_vec_envs(env_name, seed + num_processes, num_processes, 11 | None, eval_log_dir, device, True) 12 | 13 | vec_norm = utils.get_vec_normalize(eval_envs) 14 | if vec_norm is not None: 15 | vec_norm.eval() 16 | vec_norm.ob_rms = ob_rms 17 | 18 | eval_episode_rewards = [] 19 | 20 | obs = eval_envs.reset() 21 | eval_recurrent_hidden_states = torch.zeros( 22 | num_processes, actor_critic.recurrent_hidden_state_size, device=device) 23 | eval_masks = torch.zeros(num_processes, 1, device=device) 24 | 25 | while len(eval_episode_rewards) < 10: 26 | with torch.no_grad(): 27 | _, action, _, eval_recurrent_hidden_states = actor_critic.act( 28 | obs, 29 | eval_recurrent_hidden_states, 30 | eval_masks, 31 | deterministic=True) 32 | 33 | # Obser reward and next obs 34 | obs, _, done, infos = eval_envs.step(action) 35 | 36 | eval_masks = torch.tensor( 37 | [[0.0] if done_ else [1.0] for done_ in done], 38 | dtype=torch.float32, 39 | device=device) 40 | 41 | for info in infos: 42 | if 'episode' in info.keys(): 43 | eval_episode_rewards.append(info['episode']['r']) 44 | 45 | eval_envs.close() 46 | 47 | print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( 48 | len(eval_episode_rewards), np.mean(eval_episode_rewards))) 49 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/gail_experts/README.md: -------------------------------------------------------------------------------- 1 | ## Data 2 | 3 | Download from 4 | https://drive.google.com/open?id=1Ipu5k99nwewVDG1yFetUxqtwVlgBg5su 5 | 6 | and store in this folder. 
7 | 8 | ## Convert to pytorch 9 | 10 | ```bash 11 | python convert_to_pytorch.py --h5-file trajs_halfcheetah.h5 12 | ``` 13 | 14 | ## Run 15 | 16 | ```bash 17 | python main.py --env-name "HalfCheetah-v2" --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --gae-lambda 0.95 --num-env-steps 10000000 --use-linear-lr-decay --use-proper-time-limits --gail 18 | ``` 19 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/gail_experts/convert_to_pytorch.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import sys 4 | 5 | import h5py 6 | import numpy as np 7 | import torch 8 | 9 | 10 | def main(): 11 | parser = argparse.ArgumentParser( 12 | 'Converts expert trajectories from h5 to pt format.') 13 | parser.add_argument( 14 | '--h5-file', 15 | default='trajs_halfcheetah.h5', 16 | help='input h5 file', 17 | type=str) 18 | parser.add_argument( 19 | '--pt-file', 20 | default=None, 21 | help='output pt file, by default replaces file extension with pt', 22 | type=str) 23 | args = parser.parse_args() 24 | 25 | if args.pt_file is None: 26 | args.pt_file = os.path.splitext(args.h5_file)[0] + '.pt' 27 | 28 | with h5py.File(args.h5_file, 'r') as f: 29 | dataset_size = f['obs_B_T_Do'].shape[0] # full dataset size 30 | 31 | states = f['obs_B_T_Do'][:dataset_size, ...][...] 32 | actions = f['a_B_T_Da'][:dataset_size, ...][...] 33 | rewards = f['r_B_T'][:dataset_size, ...][...] 34 | lens = f['len_B'][:dataset_size, ...][...] 35 | 36 | states = torch.from_numpy(states).float() 37 | actions = torch.from_numpy(actions).float() 38 | rewards = torch.from_numpy(rewards).float() 39 | lens = torch.from_numpy(lens).long() 40 | 41 | data = { 42 | 'states': states, 43 | 'actions': actions, 44 | 'rewards': rewards, 45 | 'lengths': lens 46 | } 47 | 48 | torch.save(data, args.pt_file) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/generate_tmux_yaml.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import yaml 4 | 5 | parser = argparse.ArgumentParser(description='Process some integers.') 6 | parser.add_argument( 7 | '--num-seeds', 8 | type=int, 9 | default=4, 10 | help='number of random seeds to generate') 11 | parser.add_argument( 12 | '--env-names', 13 | default="PongNoFrameskip-v4", 14 | help='environment name separated by semicolons') 15 | args = parser.parse_args() 16 | 17 | ppo_mujoco_template = "python main.py --env-name {0} --algo ppo --use-gae --log-interval 1 --num-steps 2048 --num-processes 1 --lr 3e-4 --entropy-coef 0 --value-loss-coef 0.5 --ppo-epoch 10 --num-mini-batch 32 --gamma 0.99 --tau 0.95 --num-env-steps 1000000 --use-linear-lr-decay --no-cuda --log-dir /tmp/gym/{1}/{1}-{2} --seed {2} --use-proper-time-limits" 18 | 19 | ppo_atari_template = "env CUDA_VISIBLE_DEVICES={2} python main.py --env-name {0} --algo ppo --use-gae --lr 2.5e-4 --clip-param 0.1 --value-loss-coef 0.5 --num-processes 8 --num-steps 128 --num-mini-batch 4 --log-interval 1 --use-linear-lr-decay --entropy-coef 0.01 --log-dir /tmp/gym/{1}/{1}-{2} --seed {2}" 20 | 21 | template = ppo_atari_template 22 | 23 | config = {"session_name": "run-all", "windows": []} 24 | 25 | for i in range(args.num_seeds): 
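# One tmux window is created per seed; each pane in that window fills the command template
# above for one of the semicolon-separated environment names.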
26 | panes_list = [] 27 | for env_name in args.env_names.split(';'): 28 | panes_list.append( 29 | template.format(env_name, 30 | env_name.split('-')[0].lower(), i)) 31 | 32 | config["windows"].append({ 33 | "window_name": "seed-{}".format(i), 34 | "panes": panes_list 35 | }) 36 | 37 | yaml.dump(config, open("run_all.yaml", "w"), default_flow_style=False) 38 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/requirements.txt: -------------------------------------------------------------------------------- 1 | gym 2 | matplotlib 3 | pybullet 4 | -------------------------------------------------------------------------------- /pytorch-a2c-ppo-acktr-gail_modified/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name='a2c-ppo-acktr', 5 | packages=find_packages(), 6 | version='0.0.1', 7 | install_requires=['gym', 'matplotlib', 'pybullet']) 8 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorboardX 2 | sklearn 3 | numpy 4 | matplotlib 5 | torch-ac 6 | gym 7 | gym-minigrid 8 | colored_traceback 9 | graphviz 10 | gym[atari] 11 | box2d-py 12 | opencv-python 13 | torchvision 14 | pybullet 15 | tqdm 16 | tensorflow-gpu 17 | kdtree -------------------------------------------------------------------------------- /scripts/__pycache__/analyze_synthesized_programs.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mfranzs/meta-learning-curiosity-algorithms/451161540c0014c8aec8f6c55bf40d53f3ff5c5e/scripts/__pycache__/analyze_synthesized_programs.cpython-37.pyc --------------------------------------------------------------------------------